# minilm / transformer.py
# Uploaded by AndreCosta via huggingface_hub (revision 4915795, verified)
"""
transformer.py
==============
Transformer Decoder-only architecture implemented from scratch in PyTorch.
This module is part of the project:
"A bilingual PT+EN LLM with BPE tokenizer and training loop
implemented from scratch, with didactic and documented code"
Author : AndrΓ© Costa
License : MIT
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
THEORETICAL BACKGROUND
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
The Transformer architecture (Vaswani et al., 2017)
-------------------------------------------------
The Transformer originally emerged as an encoder-decoder model for
machine translation. For generative language models, we use only
the decoder half β€” called "decoder-only" or "causal LM".
This is the architecture used by GPT-2, GPT-3, GPT-4, LLaMA, Mistral,
and virtually all modern LLMs.
Why decoder-only for text generation?
--------------------------------------------
The decoder-only uses causal attention (also called masked attention):
each token can only "see" previous tokens, never future ones.
This allows training the model to predict the next token β€” the standard
pre-training objective (Language Modeling or LM loss).
Input  : [t1, t2, t3, t4]
Output : [t2, t3, t4, t5] ← each position predicts the next token
Overview of the implemented architecture
-----------------------------------------
Our implementation incorporates modern improvements over the original
2017 Transformer:
1. RMSNorm (Zhang & Sennrich, 2019) instead of LayerNorm
β†’ More efficient: no mean computation, normalizes variance only
2. RoPE β€” Rotary Position Embedding (Su et al., 2021) instead of
absolute positional embeddings
β†’ Better generalization to sequences longer than those seen in training
3. SwiGLU (Shazeer, 2020) instead of FFN with ReLU
β†’ Gated activation learns to "filter" information adaptively
4. Pre-norm (norm before attention/FFN) instead of post-norm
β†’ More stable training, healthier gradients
These are exactly the choices made by LLaMA (Touvron et al., 2023),
which have become the industry standard.
Data flow through the model:
tokens (B, T)
↓ nn.Embedding
x (B, T, d_model)
↓ N Γ— TransformerBlock
x (B, T, d_model)
↓ RMSNorm final
x (B, T, d_model)
↓ Linear (lm_head)
logits (B, T, vocab_size)
where B = batch size, T = seq_len, d_model = model dimension
References:
- Vaswani, A. et al. (2017). Attention is all you need. NeurIPS.
- Zhang, B., & Sennrich, R. (2019). Root mean square layer normalization.
- Su, J. et al. (2021). RoFormer: Enhanced transformer with rotary
position embedding. arXiv:2104.09864.
- Shazeer, N. (2020). GLU variants improve transformer. arXiv:2002.05202.
- Touvron, H. et al. (2023). LLaMA: Open and efficient foundation
language models. arXiv:2302.13971.
"""
import math
from dataclasses import dataclass
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
# ─────────────────────────────────────────────────────────────
# Model configuration
# ─────────────────────────────────────────────────────────────
@dataclass
class ModelConfig:
    """
    Model architecture hyperparameters.

    Centralizing configuration in a dataclass allows:
    - Saving and loading the architecture alongside weights
    - Reproducing experiments exactly
    - Varying model sizes without changing the code

    Nomenclature follows literature conventions:
        d_model  = embedding space dimension (also called
                   "hidden size" or "model dimension")
        n_heads  = number of attention heads
        n_layers = number of stacked Transformer blocks
        d_ff     = internal FFN (feed-forward network) dimension,
                   typically 4 x d_model (original) or 8/3 x d_model (SwiGLU)

    Pre-defined configurations (for reference):
        Tiny  (~15M):  d_model=256, n_heads=4,  n_layers=4,  d_ff=1024
        Small (~85M):  d_model=512, n_heads=8,  n_layers=8,  d_ff=2048
        Base  (~310M): d_model=768, n_heads=12, n_layers=12, d_ff=3072

    Raises:
        ValueError: On construction, if d_model is not divisible by n_heads.
    """
    # Vocabulary and sequence
    vocab_size: int = 16384  # must match vocab_size of BPETokenizer
    seq_len: int = 512       # maximum sequence length
    # Model dimensions
    d_model: int = 512       # embedding dimension
    n_heads: int = 8         # number of attention heads
    n_layers: int = 8        # number of Transformer blocks
    d_ff: int = 1536         # FFN dimension (~3 x d_model for SwiGLU)
    # Regularization
    dropout: float = 0.1     # dropout applied in attention and FFN
    # Precision
    use_flash: bool = True   # use Flash Attention if available (PyTorch 2+)

    def __post_init__(self) -> None:
        """Validate hyperparameter consistency and derive d_head.

        Raises:
            ValueError: If d_model is not divisible by n_heads.
        """
        # Explicit exception instead of `assert`: asserts are stripped
        # when Python runs with -O, which would silently skip validation.
        if self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by "
                f"n_heads ({self.n_heads})"
            )
        # Dimension per attention head
        self.d_head = self.d_model // self.n_heads

    @property
    def n_params(self) -> int:
        """
        Estimate the number of model parameters.

        Useful for checking whether the model fits in available VRAM before
        instantiation. The estimate is approximate: it ignores biases and
        buffers, and it counts the LM head separately even though this
        project ties it to the embedding (so it slightly over-estimates
        the tied model).

        Main components:
        - Embedding: vocab_size x d_model
        - Per block: attention (4 x d_model^2) + FFN (3 x d_model x d_ff)
        - LM head:   d_model x vocab_size (usually tied with embedding)
        """
        embed = self.vocab_size * self.d_model
        attn = self.n_layers * 4 * (self.d_model ** 2)
        ffn = self.n_layers * 3 * self.d_model * self.d_ff
        lm_head = self.d_model * self.vocab_size
        return embed + attn + ffn + lm_head
# ─────────────────────────────────────────────────────────────
# RMSNorm β€” Root Mean Square Layer Normalization
# ─────────────────────────────────────────────────────────────
class RMSNorm(nn.Module):
    """
    Root Mean Square Layer Normalization (Zhang & Sennrich, 2019).

    Unlike LayerNorm, no mean is subtracted and no bias is added; the
    input is simply rescaled by its root-mean-square:

        RMSNorm(x) = x / sqrt(mean(x^2) + eps) * gamma

    This avoids the mean computation (~15% faster than LayerNorm), drops
    the beta parameter, and matches LayerNorm quality in practice — it is
    the normalization used by LLaMA, Mistral and similar models.

    Args:
        d_model: Size of the last dimension to normalize over.
        eps: Small constant for numerical stability (avoids divide-by-zero).
    """

    def __init__(self, d_model: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # Learnable per-dimension scale (gamma), starts as the identity.
        self.weight = nn.Parameter(torch.ones(d_model))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Normalize x by its RMS along the last dimension and rescale.

        Args:
            x: Tensor of shape (..., d_model).

        Returns:
            Tensor of the same shape and dtype as x.
        """
        # Do the reduction in fp32 for stability even under bf16/fp16.
        x32 = x.float()
        denom = (x32.pow(2).mean(dim=-1, keepdim=True) + self.eps).sqrt()
        # Normalize, restore the caller's dtype, then apply gamma.
        normalized = (x32 / denom).to(x.dtype)
        return normalized * self.weight
# ─────────────────────────────────────────────────────────────
# RoPE β€” Rotary Position Embedding
# ─────────────────────────────────────────────────────────────
def precompute_rope_freqs(d_head: int, seq_len: int, base: float = 10000.0) -> torch.Tensor:
    """
    Pre-compute the complex rotation factors used by RoPE.

    RoPE (Su et al., 2021) encodes position by rotating query and key
    vectors: each pair of dimensions (2i, 2i+1) is treated as a 2D plane,
    and position m rotates that plane by m * theta_i, with
    theta_i = 1 / base^(2i / d_head). Because rotations compose, the dot
    product q·k ends up depending only on the position *difference* —
    attention becomes relative-position aware with no extra parameters,
    and extrapolates better to sequences longer than those seen in
    training than absolute positional embeddings do.

    Args:
        d_head: Dimension of each attention head.
        seq_len: Maximum sequence length.
        base: Frequency base (10000 is the original RoPE default).

    Returns:
        Complex tensor of shape (seq_len, d_head // 2): entry (m, i)
        is e^(i * m * theta_i).
    """
    # Per-pair inverse frequencies: theta_i = 1 / base^(2i / d_head).
    inv_freqs = 1.0 / (base ** (torch.arange(0, d_head, 2).float() / d_head))
    # Angle for every (position, pair) combination via broadcasting —
    # equivalent to torch.outer(positions, inv_freqs).
    positions = torch.arange(seq_len).float()
    angles = positions.unsqueeze(1) * inv_freqs.unsqueeze(0)
    # Unit-magnitude complex numbers: cos(angle) + i*sin(angle).
    return torch.polar(torch.ones_like(angles), angles)
def apply_rope(x: torch.Tensor, freqs_complex: torch.Tensor) -> torch.Tensor:
    """
    Rotate a query or key tensor by its positional phase (RoPE).

    Three steps: reinterpret consecutive feature pairs as complex
    numbers, multiply by the pre-computed unit rotations e^(i*m*theta),
    and convert back to a real tensor.

    Args:
        x: Tensor of shape (B, T, n_heads, d_head).
        freqs_complex: Rotations of shape (T, d_head // 2).

    Returns:
        Rotated tensor with the same shape and dtype as x.
    """
    batch, seq, heads, dim = x.shape
    # Pair up the last dimension and view it as complex numbers:
    # (..., d_head) -> (..., d_head//2, 2) -> complex (..., d_head//2).
    as_complex = torch.view_as_complex(
        x.float().reshape(batch, seq, heads, dim // 2, 2)
    )
    # Broadcast the rotations over batch and head axes: (1, T, 1, d/2).
    rotation = freqs_complex.unsqueeze(0).unsqueeze(2)
    # Complex multiplication performs the 2D rotation in each plane.
    rotated = as_complex * rotation
    # Back to interleaved real pairs, restoring the original layout.
    result = torch.view_as_real(rotated).reshape(batch, seq, heads, dim)
    return result.to(x.dtype)
# ─────────────────────────────────────────────────────────────
# Causal Self-Attention
# ─────────────────────────────────────────────────────────────
class CausalSelfAttention(nn.Module):
    """
    Causal (masked) multi-head attention with RoPE.

    Attention (Vaswani et al., 2017) computes:
        Attention(Q, K, V) = softmax(QK^T / sqrt(d_head)) x V

    "Causal" means a mask prevents each position from attending to
    future positions — essential for autoregressive training (predicting
    the next token). "Multi-head" means the process is repeated n_heads
    times in different subspaces, then concatenated:
        MultiHead(Q,K,V) = Concat(head_1, ..., head_h) x W_O
    Each head can learn to attend to different relationship types.

    Implementation outline:
        1. Project x into Q, K, V via one fused linear transformation
        2. Apply RoPE to Q and K (not V)
        3. Compute attention (Flash path or manual masked path)
        4. Project the concatenated heads back to d_model

    Args:
        config: Model configuration.
    """
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config
        self.n_heads = config.n_heads
        self.d_head = config.d_head
        self.d_model = config.d_model
        # Linear projections for Q, K, V — combined into a single matrix
        # for efficiency. Shape: (d_model) -> (3 x d_model),
        # then split into three equal parts in forward().
        self.qkv_proj = nn.Linear(config.d_model, 3 * config.d_model, bias=False)
        # Output projection: head concatenation -> d_model
        self.out_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        # Attention dropout (regularization); its .p is also reused as
        # dropout_p for the Flash Attention path in forward().
        self.attn_dropout = nn.Dropout(config.dropout)
        # Causal mask: lower triangular matrix of 1s. Only used by the
        # manual (non-Flash) path. Registered as a buffer (not a
        # parameter, but saved in state_dict).
        # Shape: (1, 1, seq_len, seq_len) for broadcast with (B, H, T, T)
        mask = torch.tril(torch.ones(config.seq_len, config.seq_len))
        self.register_buffer("causal_mask", mask.view(1, 1, config.seq_len, config.seq_len))
    def forward(
        self,
        x: torch.Tensor,
        freqs_complex: torch.Tensor,
    ) -> torch.Tensor:
        """
        Compute causal multi-head attention.

        Args:
            x: Input tensor, shape (B, T, d_model).
            freqs_complex: Pre-computed RoPE frequencies, shape (T, d_head//2).

        Returns:
            Output tensor, shape (B, T, d_model).
        """
        B, T, C = x.shape  # C = d_model
        # ── Step 1: Project into Q, K, V ─────────────────────────────────
        # qkv shape: (B, T, 3 x d_model)
        qkv = self.qkv_proj(x)
        # Split into Q, K, V: each has shape (B, T, d_model)
        q, k, v = qkv.split(self.d_model, dim=-1)
        # Reshape to (B, T, n_heads, d_head) to apply RoPE per head
        q = q.view(B, T, self.n_heads, self.d_head)
        k = k.view(B, T, self.n_heads, self.d_head)
        v = v.view(B, T, self.n_heads, self.d_head)
        # ── Step 2: Apply RoPE to Q and K ────────────────────────────────
        # V does not receive RoPE — position enters attention via Q·K only
        q = apply_rope(q, freqs_complex)
        k = apply_rope(k, freqs_complex)
        # Transpose to (B, n_heads, T, d_head) — format expected by attention
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        # ── Step 3: Compute attention ─────────────────────────────────────
        if self.config.use_flash and hasattr(F, "scaled_dot_product_attention"):
            # Flash Attention (PyTorch 2.0+): more memory- and
            # speed-efficient. Implements the same math, but without
            # materializing the full (B, H, T, T) attention matrix.
            # Dropout is only applied while training, matching nn.Dropout.
            y = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.attn_dropout.p if self.training else 0.0,
                is_causal=True, # applies the causal mask automatically
            )
        else:
            # Manual attention — more readable, useful for understanding the mechanism
            # scores shape: (B, n_heads, T, T)
            scale = 1.0 / math.sqrt(self.d_head)
            scores = torch.matmul(q, k.transpose(-2, -1)) * scale
            # Apply causal mask: future positions receive -inf,
            # so after softmax they contribute exactly 0 attention.
            mask = self.causal_mask[:, :, :T, :T]
            scores = scores.masked_fill(mask == 0, float("-inf"))
            # Softmax normalizes scores into a probability distribution
            attn_weights = F.softmax(scores, dim=-1)
            attn_weights = self.attn_dropout(attn_weights)
            # Weighted average of values
            y = torch.matmul(attn_weights, v)
        # ── Step 4: Regroup heads and project output ─────────────────────
        # (B, n_heads, T, d_head) -> (B, T, n_heads, d_head) -> (B, T, d_model)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        # Output projection
        return self.out_proj(y)
# ─────────────────────────────────────────────────────────────
# SwiGLU Feed-Forward Network
# ─────────────────────────────────────────────────────────────
class SwiGLUFFN(nn.Module):
    """
    Position-wise feed-forward network with SwiGLU gating (Shazeer, 2020).

        SwiGLU(x) = down_proj( silu(gate_proj(x)) ⊙ up_proj(x) )

    where ⊙ is element-wise multiplication and SiLU/Swish(x) = x * σ(x).
    The gate branch learns which components of the "up" (content) branch
    to let through, giving the model more expressive filtering than the
    classic ReLU FFN at similar cost.

    Why three matrices instead of two?
        SwiGLU uses gate, up and down projections. To keep the parameter
        count comparable to the original FFN (which uses d_ff = 4 x
        d_model), d_ff is chosen around 8/3 x d_model, rounded to a
        hardware-friendly multiple in practice.

    Args:
        config: Model configuration (uses d_model, d_ff, dropout).
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        # Two parallel input projections — one for the gate branch, one
        # for the content ("up") branch. Each maps d_model -> d_ff, no bias.
        self.gate_proj = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.up_proj = nn.Linear(config.d_model, config.d_ff, bias=False)
        # Output projection back to the model dimension: d_ff -> d_model.
        self.down_proj = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the gated feed-forward transformation.

        Args:
            x: Tensor of shape (B, T, d_model).

        Returns:
            Tensor of shape (B, T, d_model).
        """
        # Swish/SiLU on the gate branch, multiplied element-wise into
        # the content branch — the multiplication is the "gating".
        gated = F.silu(self.gate_proj(x)) * self.up_proj(x)
        # Regularize, then project back down to d_model.
        return self.down_proj(self.dropout(gated))
# ─────────────────────────────────────────────────────────────
# Transformer block
# ─────────────────────────────────────────────────────────────
class TransformerBlock(nn.Module):
    """
    One pre-norm Transformer decoder block.

    Layout (LLaMA-style):
        x = x + Attention(RMSNorm(x))   # causal MHA with RoPE
        x = x + SwiGLU_FFN(RMSNorm(x))

    Pre-norm vs post-norm: the original Transformer (Vaswani et al.,
    2017) applied LayerNorm *after* each sub-layer and residual add
    (post-norm). Modern LLMs normalize *before* each sub-layer
    (pre-norm), which keeps gradients better behaved in deep stacks and
    removes the need for long warm-up schedules.

    The residual additions (He et al., 2016) give gradients a direct
    shortcut through every layer — fundamental for training deep nets.

    Args:
        config: Model configuration.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.norm1 = RMSNorm(config.d_model)      # pre-attention norm
        self.attn = CausalSelfAttention(config)   # masked MHA with RoPE
        self.norm2 = RMSNorm(config.d_model)      # pre-FFN norm
        self.ffn = SwiGLUFFN(config)              # gated feed-forward

    def forward(
        self,
        x: torch.Tensor,
        freqs_complex: torch.Tensor,
    ) -> torch.Tensor:
        """
        Run x through the attention and FFN sub-blocks with residuals.

        Args:
            x: Tensor of shape (B, T, d_model).
            freqs_complex: RoPE frequencies of shape (T, d_head//2).

        Returns:
            Tensor of shape (B, T, d_model).
        """
        # Attention sub-block: normalize first, then add the residual.
        attn_out = self.attn(self.norm1(x), freqs_complex)
        x = x + attn_out
        # Feed-forward sub-block, same pre-norm + residual pattern.
        return x + self.ffn(self.norm2(x))
# ─────────────────────────────────────────────────────────────
# Full model
# ─────────────────────────────────────────────────────────────
class MiniLM(nn.Module):
    """
    Complete Transformer decoder-only language model (LLaMA-style).

    Components (forward pass order):
        1. Token embedding: maps token IDs to dense vectors
        2. N x TransformerBlock: attention + FFN with residuals
        3. Final RMSNorm: normalizes before the output projection
        4. LM head: projects d_model -> vocab_size (logits)

    Weight tying:
        The input embedding and the LM head share one weight tensor,
        reducing parameter count ~10-20% without quality loss — used in
        GPT-2 and LLaMA. Intuition: the embedding learns "what tokens
        look like" and the LM head learns "which tokens are likely" —
        similar information.

    Args:
        config: Full model configuration.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config
        # ── Token embedding ───────────────────────────────────────────────
        # Maps integer IDs (0..vocab_size-1) to d_model-dimensional vectors.
        # Weight shape: (vocab_size, d_model).
        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
        # ── Embedding dropout ─────────────────────────────────────────────
        self.emb_dropout = nn.Dropout(config.dropout)
        # ── Transformer block stack ───────────────────────────────────────
        self.blocks = nn.ModuleList([
            TransformerBlock(config)
            for _ in range(config.n_layers)
        ])
        # ── Final normalization ───────────────────────────────────────────
        self.norm_final = RMSNorm(config.d_model)
        # ── LM Head ───────────────────────────────────────────────────────
        # Projects d_model -> vocab_size to obtain logits (no bias).
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        # Weight tying: both are (vocab_size, d_model); after this
        # assignment they are literally the same Parameter.
        self.lm_head.weight = self.token_emb.weight
        # ── RoPE pre-computation ─────────────────────────────────────────
        # Rotation factors for every position, computed once.
        # Registered as buffer: saved in checkpoints, not trained.
        freqs = precompute_rope_freqs(config.d_head, config.seq_len)
        self.register_buffer("freqs_complex", freqs)
        # ── Weight initialization ─────────────────────────────────────────
        self.apply(self._init_weights)
        # GPT-2 trick: shrink the std of residual-path output projections
        # by sqrt(2 * n_layers) so the residual stream's variance does
        # not grow with depth.
        for name, param in self.named_parameters():
            if name.endswith(("out_proj.weight", "down_proj.weight")):
                nn.init.normal_(
                    param,
                    mean=0.0,
                    std=0.02 / math.sqrt(2 * config.n_layers)
                )

    def _init_weights(self, module: nn.Module) -> None:
        """
        Initialize one module's weights (called recursively by apply()).

        GPT-2 scheme: Linear and Embedding weights ~ Normal(0, 0.02);
        biases (when present) are zeroed. The small std keeps initial
        activations at a reasonable scale, avoiding gradient explosion
        or vanishing at the start of training.

        Args:
            module: Module to initialize.
        """
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(
        self,
        input_ids: torch.Tensor,
        targets: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Model forward pass.

        Training mode (targets provided): computes logits for every
        position AND the cross-entropy loss in a single pass.
        Inference mode (targets=None): computes logits only for the
        LAST position — intermediate logits are not needed when
        generating, so the output is (B, 1, vocab_size), not
        (B, T, vocab_size).

        Args:
            input_ids: Token ID tensor, shape (B, T).
            targets: Next-token IDs, shape (B, T) — the input shifted by
                one position. Positions equal to -1 are ignored by the
                loss (padding).

        Returns:
            Tuple (logits, loss):
                logits: (B, T, vocab_size) when targets are given,
                        (B, 1, vocab_size) otherwise.
                loss: scalar tensor if targets were given, else None.

        Raises:
            ValueError: If T exceeds config.seq_len.

        Training example:
            input_ids = [t1, t2, t3, t4]  <- input tokens
            targets   = [t2, t3, t4, t5]  <- next tokens (shift of 1)
        """
        B, T = input_ids.shape
        # Explicit check instead of `assert`: still enforced under -O.
        if T > self.config.seq_len:
            raise ValueError(
                f"Sequence of length {T} exceeds seq_len={self.config.seq_len}"
            )
        # ── Token embedding: (B, T) -> (B, T, d_model) ───────────────────
        x = self.emb_dropout(self.token_emb(input_ids))
        # ── RoPE factors for the first T positions only ──────────────────
        # (important for incremental generation where T < seq_len)
        freqs = self.freqs_complex[:T]
        # ── Transformer block stack ──────────────────────────────────────
        for block in self.blocks:
            x = block(x, freqs)
        # ── Final normalization ──────────────────────────────────────────
        x = self.norm_final(x)
        # ── LM Head ──────────────────────────────────────────────────────
        if targets is not None:
            # Training: logits for all positions, then flatten
            # (B, T, vocab_size) -> (B*T, vocab_size) and (B, T) -> (B*T,)
            # for cross-entropy.
            logits = self.lm_head(x)
            loss = F.cross_entropy(
                logits.view(-1, self.config.vocab_size),
                targets.view(-1),
                ignore_index=-1,  # -1 marks padding positions to skip
            )
            return logits, loss
        # Inference: project only the last position.
        logits = self.lm_head(x[:, -1:, :])
        return logits, None

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 100,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
    ) -> torch.Tensor:
        """
        Autoregressive text generation.

        Per step: forward pass -> last-token logits -> temperature ->
        optional top-k and/or top-p filtering -> multinomial sample ->
        append token and repeat.

        Temperature:
            - Low (e.g. 0.1): near-deterministic, sharp distribution
            - 1.0: the model's original distribution
            - > 1: more random / creative. Must be > 0 (0 would divide
              by zero).

        Top-k keeps only the k most probable tokens before sampling.
        Top-p / nucleus sampling (Holtzman et al., 2019) keeps the
        smallest token set whose cumulative probability >= p.

        Note: this implementation samples one sequence at a time —
        input_ids must have batch size 1.

        Args:
            input_ids: Initial context tokens, shape (1, T).
            max_new_tokens: How many new tokens to generate.
            temperature: Randomness control (> 0; e.g. 0.1 to 2.0).
            top_k: Filter to the top-k tokens (e.g. 50).
            top_p: Nucleus sampling threshold (e.g. 0.9).

        Returns:
            Tensor with full sequence (context + generated), shape (1, T+N).
        """
        # Bug fix: remember the caller's mode and restore it afterwards.
        # The previous version called self.eval() and never switched
        # back, silently disabling dropout for subsequent training steps
        # whenever sampling happened mid-training.
        was_training = self.training
        self.eval()
        for _ in range(max_new_tokens):
            # Keep only the most recent seq_len tokens of context.
            context = input_ids[:, -self.config.seq_len:]
            # Inference forward: logits for the last position only.
            logits, _ = self(context)
            # (1, 1, vocab_size) -> (vocab_size,); assumes batch size 1.
            logits = logits[:, -1, :].squeeze(0)
            # Temperature scaling.
            if temperature != 1.0:
                logits = logits / temperature
            # Top-k: mask everything below the k-th largest logit.
            if top_k is not None:
                k = min(top_k, logits.size(-1))
                values, _ = torch.topk(logits, k)
                logits = logits.masked_fill(logits < values[-1], float("-inf"))
            # Top-p (nucleus sampling): drop the low-probability tail.
            if top_p is not None:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift right so the token that crosses the threshold is
                # kept — guarantees at least one candidate survives.
                sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
                sorted_indices_to_remove[0] = False
                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[indices_to_remove] = float("-inf")
            # Convert logits to probabilities and sample the next token.
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).unsqueeze(0)
            input_ids = torch.cat([input_ids, next_token], dim=1)
        # Restore whatever mode the model was in before generation.
        if was_training:
            self.train()
        return input_ids

    def count_parameters(self) -> dict[str, int]:
        """
        Count model parameters by component.

        Useful for verifying the parameter distribution and seeing where
        model capacity is concentrated.

        Returns:
            Dictionary with parameter count per component. "lm_head" is
            reported as 0 because its weights are tied to the token
            embedding and would otherwise be counted twice.
        """
        def count(module):
            # Total number of scalar parameters in a sub-module.
            return sum(p.numel() for p in module.parameters())

        return {
            "token_embedding": count(self.token_emb),
            "attention_layers": sum(count(b.attn) for b in self.blocks),
            "ffn_layers": sum(count(b.ffn) for b in self.blocks),
            "norm_layers": sum(count(b.norm1) + count(b.norm2) for b in self.blocks),
            "lm_head": 0,  # tied weights — not counted twice
            "total": count(self),
        }

    def __repr__(self) -> str:
        """Compact human-readable summary of the architecture."""
        params = self.count_parameters()
        return (
            f"MiniLM(\n"
            f" vocab_size={self.config.vocab_size}, "
            f"seq_len={self.config.seq_len}\n"
            f" d_model={self.config.d_model}, "
            f"n_heads={self.config.n_heads}, "
            f"n_layers={self.config.n_layers}\n"
            f" d_ff={self.config.d_ff}, "
            f"d_head={self.config.d_head}\n"
            f" params={params['total'] / 1e6:.1f}M\n"
            f")"
        )
# ─────────────────────────────────────────────────────────────
# VRAM utilities
# ─────────────────────────────────────────────────────────────
def estimate_vram(config: ModelConfig, batch_size: int = 8, dtype_bytes: int = 2) -> dict:
    """
    Roughly estimate the VRAM needed to train a given configuration.

    Accounts for the four big consumers of training memory:
        1. Model parameters
        2. Gradients (same size and dtype as the parameters)
        3. Optimizer state (AdamW keeps two fp32 moments per parameter)
        4. Activations (coarse estimate, scales with batch and seq_len)

    This is a ballpark figure — actual usage may vary.

    Args:
        config: Model configuration.
        batch_size: Training batch size.
        dtype_bytes: Bytes per value (2 for bf16/fp16, 4 for fp32).

    Returns:
        Dictionary of human-readable GB strings per component.
    """
    total_params = config.n_params
    # Parameters, and gradients mirroring them in size and dtype.
    weights_gb = total_params * dtype_bytes / 1e9
    grads_gb = weights_gb
    # AdamW: two fp32 moments per parameter = 8 bytes each.
    optim_gb = total_params * 8 / 1e9
    # Coarse activation estimate: one (B, T, d_model) tensor per block.
    per_block_bytes = batch_size * config.seq_len * config.d_model * dtype_bytes
    acts_gb = config.n_layers * per_block_bytes / 1e9
    grand_total = weights_gb + grads_gb + optim_gb + acts_gb
    return {
        "parameters": f"{weights_gb:.2f} GB",
        "gradients": f"{grads_gb:.2f} GB",
        "optimizer": f"{optim_gb:.2f} GB",
        "activations": f"{acts_gb:.2f} GB",
        "total_estimate": f"{grand_total:.2f} GB",
        "n_params": f"{total_params / 1e6:.1f}M",
    }
# ─────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("=" * 60)
    print(" MiniLM Demo")
    print("=" * 60)
    # Small configuration (~85M parameters)
    config = ModelConfig(
        vocab_size=16384,
        seq_len=512,
        d_model=512,
        n_heads=8,
        n_layers=8,
        d_ff=1536,
        dropout=0.1,
    )
    print(f"\nAvailable configurations:")
    configs = {
        "Tiny (~15M)": ModelConfig(d_model=256, n_heads=4, n_layers=4, d_ff=768),
        "Small (~85M)": ModelConfig(d_model=512, n_heads=8, n_layers=8, d_ff=1536),
        "Base (~310M)": ModelConfig(d_model=768, n_heads=12, n_layers=12, d_ff=2304),
    }
    for label, preset in configs.items():
        print(f" {label}: {preset.n_params / 1e6:.0f}M params")
    print(f"\nInstantiating Small model...")
    model = MiniLM(config)
    print(model)
    # Detailed parameter count per component
    print("\nParameter distribution:")
    for part, n_part in model.count_parameters().items():
        if n_part > 0:
            print(f" {part:<20}: {n_part / 1e6:.2f}M")
    # VRAM estimate
    print("\nVRAM estimate (batch=8, bf16):")
    vram = estimate_vram(config, batch_size=8, dtype_bytes=2)
    for key, value in vram.items():
        print(f" {key:<20}: {value}")
    # Forward-pass smoke test
    print("\nForward pass test...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f" Device: {device}")
    model = model.to(device)
    batch, seq = 2, 64  # batch_size=2, seq_len=64
    input_ids = torch.randint(0, config.vocab_size, (batch, seq)).to(device)
    targets = torch.randint(0, config.vocab_size, (batch, seq)).to(device)
    logits, loss = model(input_ids, targets)
    print(f" Input shape : {input_ids.shape}")
    print(f" Logits shape : {logits.shape}")
    print(f" Initial loss : {loss.item():.4f}")
    print(f" Expected loss: {math.log(config.vocab_size):.4f} (maximum entropy)")
    # Generation smoke test
    print("\nGeneration test (10 tokens)...")
    prompt = torch.randint(0, config.vocab_size, (1, 5)).to(device)
    generated = model.generate(prompt, max_new_tokens=10, temperature=0.8, top_k=50)
    print(f" Prompt shape : {prompt.shape}")
    print(f" Generated shape: {generated.shape}")
    print(f" New tokens : {generated[0, 5:].tolist()}")
    print("\nForward pass and generation OK.")