# coding=utf-8
# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Gemmagain - Gemma3 text model with layer looping support.
This model allows running the same physical layers multiple times in sequence,
enabling parameter-efficient deep networks. Compatible with standard Gemma3 weights.
"""
import copy
from typing import Callable, Optional, Union
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, DynamicLayer
from transformers.generation import GenerationMixin
from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from transformers.utils.deprecation import deprecate_kwarg
from .configuration_gemmagain import GemmagainConfig
logger = logging.get_logger(__name__)
class Gemma3TextScaledWordEmbedding(nn.Embedding):
"""
This module overrides nn.Embedding's forward by scaling the output embeddings with `embed_scale`.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
super().__init__(num_embeddings, embedding_dim, padding_idx)
self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)
def forward(self, input_ids: torch.Tensor):
return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
class Gemma3MLP(nn.Module):
def __init__(self, config: GemmagainConfig):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_activation]
def forward(self, x):
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
class Gemma3RMSNorm(nn.Module):
def __init__(self, dim: int, eps: float = 1e-6):
super().__init__()
self.eps = eps
self.weight = nn.Parameter(torch.zeros(dim))
def _norm(self, x):
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
def forward(self, x):
output = self._norm(x.float())
# Gemma3 uses (x * w).to(dtype) instead of x.to(dtype) * w
output = output * (1.0 + self.weight.float())
return output.type_as(x)
def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.eps}"
class Gemma3RotaryEmbedding(nn.Module):
inv_freq: torch.Tensor
def __init__(self, config: GemmagainConfig, device=None):
super().__init__()
if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
else:
self.rope_type = "default"
self.max_seq_len_cached = config.max_position_embeddings
self.original_max_seq_len = config.max_position_embeddings
self.config = config
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.original_inv_freq = self.inv_freq
@torch.no_grad()
@dynamic_rope_update
def forward(self, x, position_ids):
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
position_ids_expanded = position_ids[:, None, :].float()
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos() * self.attention_scaling
sin = emb.sin() * self.attention_scaling
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors."""
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
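# Shape sketch for apply_rotary_pos_emb as used in this file: q/k arrive as
# (batch, num_heads, seq_len, head_dim) and cos/sin as (batch, seq_len, head_dim);
# unsqueeze_dim=1 inserts the head axis so cos/sin broadcast across all heads.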
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""Repeat KV heads for GQA."""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
def eager_attention_forward(
module: nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.Tensor],
dropout: float = 0.0,
scaling: Optional[float] = None,
softcap: Optional[float] = None,
**kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
if scaling is None:
scaling = module.head_dim**-0.5
key_states = repeat_kv(key, module.num_key_value_groups)
value_states = repeat_kv(value, module.num_key_value_groups)
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
if softcap is not None:
attn_weights = attn_weights / softcap
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * softcap
if attention_mask is not None:
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
attn_output = torch.matmul(attn_weights, value_states)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output, attn_weights
class Gemma3Attention(nn.Module):
"""Multi-headed attention with support for looping (cache_slot_idx)."""
def __init__(self, config: GemmagainConfig, layer_idx: int):
super().__init__()
self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
self.config = config
self.layer_idx = layer_idx
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
self.scaling = config.query_pre_attn_scalar**-0.5
self.attention_dropout = self.config.attention_dropout
self.is_causal = not self.config.use_bidirectional_attention
self.q_proj = nn.Linear(
config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
)
self.k_proj = nn.Linear(
config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
)
self.v_proj = nn.Linear(
config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
)
self.o_proj = nn.Linear(
config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
)
self.attn_logit_softcapping = self.config.attn_logit_softcapping
self.sliding_window = config.sliding_window if self.is_sliding else None
self.q_norm = Gemma3RMSNorm(dim=self.head_dim, eps=config.rms_norm_eps)
self.k_norm = Gemma3RMSNorm(dim=self.head_dim, eps=config.rms_norm_eps)
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: torch.Tensor,
attention_mask: Optional[torch.Tensor],
past_key_values: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
cache_slot_idx: Optional[int] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
query_states = self.q_norm(query_states)
key_states = self.k_norm(key_states)
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_values is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
# Use cache_slot_idx for looping support - each visit to a layer gets its own cache slot
slot_idx = cache_slot_idx if cache_slot_idx is not None else self.layer_idx
key_states, value_states = past_key_values.update(key_states, value_states, slot_idx, cache_kwargs)
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
query_states,
key_states,
value_states,
attention_mask,
dropout=self.attention_dropout if self.training else 0.0,
scaling=self.scaling,
sliding_window=self.sliding_window,
**kwargs,
)
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights
class Gemma3DecoderLayer(GradientCheckpointingLayer):
def __init__(self, config: GemmagainConfig, layer_idx: int):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.layer_idx = layer_idx
self.attention_type = config.layer_types[layer_idx]
self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx)
self.mlp = Gemma3MLP(config)
self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
@deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings_global: torch.Tensor,
position_embeddings_local: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
cache_slot_idx: Optional[int] = None,
**kwargs,
) -> torch.Tensor:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Apply global RoPE to non-sliding layers, local RoPE to sliding layers
if self.self_attn.is_sliding:
position_embeddings = position_embeddings_local
else:
position_embeddings = position_embeddings_global
hidden_states, _ = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
use_cache=use_cache,
cache_position=cache_position,
cache_slot_idx=cache_slot_idx,
**kwargs,
)
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.pre_feedforward_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = self.post_feedforward_layernorm(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
@auto_docstring
class GemmagainPreTrainedModel(PreTrainedModel):
config_class = GemmagainConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["Gemma3DecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
_supports_flash_attn = True
_supports_sdpa = True
_supports_flex_attn = True
_can_compile_fullgraph = True
_supports_attention_backend = True
_can_record_outputs = {
"hidden_states": Gemma3DecoderLayer,
"attentions": Gemma3Attention,
}
def _init_weights(self, module):
super()._init_weights(module)
# Initialize RMSNorm weights to 0 (Gemma3 uses 1 + weight)
if "RMSNorm" in module.__class__.__name__:
module.weight.data.zero_()
def _expand_layer_sequence(layer_sequence, num_hidden_layers):
"""Expand layer_sequence config into a flat list of layer indices."""
l_seq = []
for item in layer_sequence:
if isinstance(item, int):
l_seq.append(item)
elif isinstance(item, list):
if len(item) == 2:
start, end = item
l_seq += list(range(start, min(end, num_hidden_layers)))
elif len(item) == 3:
start, end, repeats = item
l_seq += list(range(start, min(end, num_hidden_layers))) * repeats
else:
raise ValueError(f"Invalid layer_sequence item: {item}")
else:
raise ValueError(f"Invalid layer_sequence item type: {type(item)}")
return l_seq
def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, int, int], bool]:
"""Enables a bidirectional mask within the sliding window."""
def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
return abs(q_idx - kv_idx) < sliding_window
return inner_mask
@auto_docstring
class GemmagainModel(GemmagainPreTrainedModel):
def __init__(self, config: GemmagainConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = Gemma3TextScaledWordEmbedding(
config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=config.hidden_size**0.5
)
self.layers = nn.ModuleList(
[Gemma3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = Gemma3RotaryEmbedding(config=config)
self.gradient_checkpointing = False
# Create local RoPE with different theta
local_config = copy.deepcopy(config)
local_config.rope_theta = config.rope_local_base_freq
local_config.rope_scaling = {"rope_type": "default"}
self.rotary_emb_local = Gemma3RotaryEmbedding(config=local_config)
# Pre-compute expanded layer sequence for looping
self._layer_sequence = _expand_layer_sequence(config.layer_sequence, config.num_hidden_layers)
self._num_cache_slots = len(self._layer_sequence)
self.post_init()
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPast:
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
if use_cache:
if past_key_values is None:
# Create cache with enough slots for the full layer sequence
cache_config = copy.copy(self.config)
cache_config.num_hidden_layers = self._num_cache_slots
past_key_values = DynamicCache(config=cache_config)
elif isinstance(past_key_values, DynamicCache) and len(past_key_values.layers) < self._num_cache_slots:
# Extend cache if created externally with fewer slots
while len(past_key_values.layers) < self._num_cache_slots:
past_key_values.layers.append(DynamicLayer())
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
# Prepare attention masks
if not isinstance(causal_mask_mapping := attention_mask, dict):
mask_kwargs = {
"config": self.config,
"input_embeds": inputs_embeds,
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"position_ids": position_ids,
}
sliding_mask_kwargs = mask_kwargs.copy()
if self.config.use_bidirectional_attention:
mask_kwargs["or_mask_function"] = lambda *args: torch.tensor(True, dtype=torch.bool)
sliding_mask_kwargs["or_mask_function"] = _bidirectional_window_overlay(self.config.sliding_window)
causal_mask_mapping = {
"full_attention": create_causal_mask(**mask_kwargs),
"sliding_attention": create_sliding_window_causal_mask(**sliding_mask_kwargs),
}
hidden_states = inputs_embeds
position_embeddings_global = self.rotary_emb(hidden_states, position_ids)
position_embeddings_local = self.rotary_emb_local(hidden_states, position_ids)
# Execute layers in the configured sequence with looping support
for cache_slot_idx, layer_idx in enumerate(self._layer_sequence):
decoder_layer = self.layers[layer_idx]
hidden_states = decoder_layer(
hidden_states,
position_embeddings_global=position_embeddings_global,
position_embeddings_local=position_embeddings_local,
attention_mask=causal_mask_mapping[decoder_layer.attention_type],
position_ids=position_ids,
past_key_values=past_key_values,
use_cache=use_cache,
cache_position=cache_position,
cache_slot_idx=cache_slot_idx,
**kwargs,
)
hidden_states = self.norm(hidden_states)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=past_key_values if use_cache else None,
)
@auto_docstring
class GemmagainForCausalLM(GemmagainPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
def __init__(self, config: GemmagainConfig):
super().__init__(config)
self.model = GemmagainModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[TransformersKwargs],
) -> CausalLMOutputWithPast:
outputs: BaseModelOutputWithPast = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
cache_position=cache_position,
**kwargs,
)
hidden_states = outputs.last_hidden_state
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
logits = self.lm_head(hidden_states[:, slice_indices, :])
if self.config.final_logit_softcapping is not None:
logits = logits / self.config.final_logit_softcapping
logits = torch.tanh(logits)
logits = logits * self.config.final_logit_softcapping
loss = None
if labels is not None:
# Standard loss calculation
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
__all__ = [
"GemmagainForCausalLM",
"GemmagainModel",
"GemmagainPreTrainedModel",
]
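# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; assumes this file and configuration_gemmagain.py
# ship inside a checkpoint directory whose config.json maps the auto classes to them
# via auto_map, and "path/to/gemmagain-checkpoint" is a hypothetical path or repo id):
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("path/to/gemmagain-checkpoint")
#   model = AutoModelForCausalLM.from_pretrained(
#       "path/to/gemmagain-checkpoint", trust_remote_code=True
#   )
#   inputs = tokenizer("Hello", return_tensors="pt")
#   output_ids = model.generate(**inputs, max_new_tokens=16)
#   print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
# ---------------------------------------------------------------------------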