Feature Extraction
Transformers
PyTorch
e2d2
custom_code
yairschiff committed on
Commit
6ba80c2
·
verified ·
1 Parent(s): 30e8556

Add model and code

Browse files
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ .hf_cache
3
+ .idea
4
+ .ipynb_checkpoints/
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ .DS_Store
8
+ outputs/
9
+ watch_folder
__init__.py ADDED
File without changes
backbone_automodel.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ import torch
4
+ from torch import nn
5
+ from transformers import (
6
+ AutoConfig,
7
+ AutoModel,
8
+ AutoModelForCausalLM,
9
+ AutoModelForMaskedLM,
10
+ DynamicCache,
11
+ )
12
+ from transformers.modeling_outputs import (
13
+ BaseModelOutputWithPast,
14
+ CausalLMOutputWithPast,
15
+ )
16
+
17
+ from .backbone_custom_modeling_qwen3 import CustomQwen3ForCausalLM
18
+
19
# `BlockMask` only exists in PyTorch builds that ship flex-attention;
# fall back to None so the annotations below still evaluate on older builds.
try:
    from torch.nn.attention.flex_attention import BlockMask
except ImportError:
    BlockMask = None

# Maps the string accepted by `AutoModelFromPreTrained.__init__` to the
# corresponding Hugging Face auto-model factory class.
AUTO_MODEL_CLS = {
    "AutoModel": AutoModel,
    "AutoModelForCausalLM": AutoModelForCausalLM,
    "AutoModelForMaskedLM": AutoModelForMaskedLM,
}
29
+
30
+
31
class AutoModelFromPreTrained(nn.Module):
    """Simple wrapper class that enables using AutoModel from pre-trained.

    Wraps an HF model (loaded from a checkpoint or freshly initialized from
    its config) and post-processes the forward pass so the KV cache length
    can be held fixed across calls (used by diffusion-style decoding).
    """

    def __init__(
        self,
        automodel_cls: Literal[
            "AutoModel",
            "AutoModelForCausalLM",
            "AutoModelForMaskedLM",
        ],
        pretrained_model_name_or_path: str,
        trust_remote_code: bool = True,
        num_layers: int = -1,
        keep_top_layers: bool = False,
        reinit_model: bool = False,
        use_causal_mask: bool = False,
        **automodel_init_kwargs,
    ):
        """Load (or re-initialize) the wrapped model.

        Args:
            automodel_cls: Auto-model factory used when loading pre-trained
                weights (ignored on the ``reinit_model`` path — see NOTE).
            pretrained_model_name_or_path: HF hub id or local checkpoint path.
            trust_remote_code: Forwarded to the HF loader.
            num_layers: Number of transformer layers to keep; -1 keeps all.
            keep_top_layers: Keep the last ``num_layers`` layers instead of
                the first ones when truncating a pre-trained stack.
            reinit_model: Build a randomly initialized model from the
                checkpoint's config instead of loading its weights.
            use_causal_mask: If True, ``forward`` discards any provided
                attention mask so the backbone applies its default causal mask.
            **automodel_init_kwargs: Extra kwargs for the config/model loader.
        """
        super().__init__()
        self.use_causal_mask = use_causal_mask
        if reinit_model:
            auto_config = AutoConfig.from_pretrained(
                pretrained_model_name_or_path,
                num_hidden_layers=num_layers,
                trust_remote_code=trust_remote_code,
                **automodel_init_kwargs,
            )
            # NOTE: the reinit path is hard-wired to the custom Qwen3 subclass
            # rather than the `automodel_cls` factory (see commented line).
            self.model = CustomQwen3ForCausalLM(auto_config)
            # self.model = AUTO_MODEL_CLS[automodel_cls].from_config(auto_config)
        else:
            self.model = AUTO_MODEL_CLS[automodel_cls].from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust_remote_code,
                **automodel_init_kwargs,
            )
            # Optionally truncate the loaded layer stack.
            num_layers = (
                len(self.model.model.layers) if num_layers == -1 else num_layers
            )
            if keep_top_layers:
                self.model.model.layers = self.model.model.layers[-num_layers:]
            else:
                self.model.model.layers = self.model.model.layers[:num_layers]

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: torch.FloatTensor | BlockMask | None = None,
        position_ids: torch.LongTensor | None = None,
        cache_position: torch.LongTensor | None = None,
        past_key_values: DynamicCache | None = None,
        fix_cache_length: bool = False,  # False for AR, True for diffusion models
        return_updated_cache=False,
        **kwargs,
    ) -> CausalLMOutputWithPast | BaseModelOutputWithPast:
        """Run the wrapped model.

        When ``fix_cache_length`` is True the per-layer KV cache is truncated
        back to its pre-call length after the forward pass (a DynamicCache
        otherwise grows in place). With ``return_updated_cache`` only the
        updated cache is returned, wrapped in a BaseModelOutputWithPast.
        """
        prev_cache_len = None
        if past_key_values is not None and fix_cache_length:
            # Snapshot each layer's cache length before the model extends it.
            prev_cache_len = [
                past_key_values[i][0].shape[-2]  # type: ignore
                for i in range(len(past_key_values))
            ]
        if self.use_causal_mask:
            attention_mask = None  # None --> enforces use of causal mask
        model_output = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            cache_position=cache_position,
            past_key_values=past_key_values,
            **kwargs,
        )
        if return_updated_cache:
            return BaseModelOutputWithPast(past_key_values=model_output.past_key_values)
        if (
            prev_cache_len is not None
            and model_output.get("past_key_values", None) is not None
        ):
            # DynamicCache extends along sequence dimension by default;
            # truncate back to original cache len
            for i, cache_len in enumerate(prev_cache_len):
                model_output.past_key_values.key_cache[i] = (
                    model_output.past_key_values.key_cache[i][..., :cache_len, :]
                )
                model_output.past_key_values.value_cache[i] = (
                    model_output.past_key_values.value_cache[i][..., :cache_len, :]
                )
        return model_output
backbone_custom_modeling_qwen3.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Optional, Tuple
2
+
3
+ import torch
4
+ from torch import nn
5
+ from transformers.models.qwen3.modeling_qwen3 import (
6
+ ALL_ATTENTION_FUNCTIONS,
7
+ Cache,
8
+ FlashAttentionKwargs,
9
+ Qwen3Attention,
10
+ Qwen3Config,
11
+ Qwen3DecoderLayer,
12
+ Qwen3ForCausalLM,
13
+ Qwen3Model,
14
+ eager_attention_forward,
15
+ rotate_half,
16
+ )
17
+ from transformers.processing_utils import Unpack
18
+ from transformers.utils import logging
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+
23
def custom_apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1, q_start_idx=0):
    """Apply Rotary Position Embedding (RoPE) to query and key tensors.

    Unlike the stock helper, ``q`` may cover only a suffix of the sequence
    (starting at ``q_start_idx``): the cos/sin tables are sliced to match the
    query, while ``k`` always uses the full tables.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    q_cos = cos_b[..., q_start_idx:, :]
    q_sin = sin_b[..., q_start_idx:, :]
    q_embed = q * q_cos + rotate_half(q) * q_sin
    k_embed = k * cos_b + rotate_half(k) * sin_b
    return q_embed, k_embed
32
+
33
+
34
class CustomQwen3Attention(Qwen3Attention):
    """Qwen3 attention where queries may cover only a suffix of the sequence.

    ``q_start_idx > 0`` marks the leading positions of ``hidden_states`` as
    encoder context: they contribute keys/values but no queries, so the
    attention output spans only the trailing ``seq_len - q_start_idx``
    positions.
    """

    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__(config, layer_idx=layer_idx)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        q_start_idx: int = 0,  # > 0: decoder pass w/encoder inputs in hidden_states
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
        # Queries are computed only for the non-context (suffix) positions.
        sa_hidden_sates = hidden_states[:, q_start_idx:, :]
        query_input_shape = sa_hidden_sates.shape[:-1]
        query_hidden_shape = (*query_input_shape, -1, self.head_dim)

        query_states = self.q_norm(
            self.q_proj(sa_hidden_sates).reshape(query_hidden_shape)
        ).transpose(1, 2)
        # Keys/values span the full input, including the encoder context.
        key_states = self.k_norm(
            self.k_proj(hidden_states).view(hidden_shape)
        ).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        # custom_apply_rotary_pos_emb slices the RoPE tables for the
        # (shorter) query tensor; keys use the full tables.
        query_states, key_states = custom_apply_rotary_pos_emb(
            query_states, key_states, cos, sin, q_start_idx=q_start_idx
        )

        if past_key_value is not None:
            # sin and cos are specific to RoPE models
            # cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # NOTE: downcast for flex-attention compatibility
        query_states, key_states = (
            query_states.to(value_states.dtype),
            key_states.to(value_states.dtype),
        )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # diff with Llama
            **kwargs,
        )

        # Output covers only the query (suffix) positions.
        attn_output = attn_output.reshape(*query_input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
104
+
105
+
106
class CustomQwen3DecoderLayer(Qwen3DecoderLayer):
    """Qwen3 decoder layer whose self-attention supports ``q_start_idx``.

    When ``q_start_idx > 0`` the first ``q_start_idx`` positions of
    ``hidden_states`` act only as keys/values (encoder context), and the
    layer's output covers just the remaining (query) positions.
    """

    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__(config, layer_idx=layer_idx)
        # Replace the stock attention module with the q_start_idx-aware one.
        self.self_attn = CustomQwen3Attention(config=config, layer_idx=layer_idx)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        q_start_idx: int = 0,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        # Residual covers only the query positions, matching the shape of
        # the self-attention output when q_start_idx > 0.
        residual = hidden_states[:, q_start_idx:, ...]

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            q_start_idx=q_start_idx,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs
157
+
158
+
159
class CustomQwen3Model(Qwen3Model):
    """Qwen3 backbone whose decoder stack uses :class:`CustomQwen3DecoderLayer`."""

    def __init__(self, config: Qwen3Config):
        super().__init__(config)
        custom_layers = [
            CustomQwen3DecoderLayer(config, idx)
            for idx in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(custom_layers)
        # Initialize weights and apply final processing
        self.post_init()
170
+
171
+
172
class CustomQwen3ForCausalLM(Qwen3ForCausalLM):
    """Causal-LM head on top of :class:`CustomQwen3Model`."""

    def __init__(self, config: Qwen3Config):
        super().__init__(config)
        # Swap the stock backbone for the custom one (its layers accept
        # q_start_idx), then re-run weight init / final processing.
        self.model = CustomQwen3Model(config)
        self.post_init()
backbone_encoder_decoder.py ADDED
@@ -0,0 +1,654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from functools import partial
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import torch
6
+ from torch import nn
7
+ from transformers import AutoConfig, AutoModelForCausalLM
8
+ from transformers.cache_utils import DynamicCache
9
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
10
+ from transformers.modeling_outputs import (
11
+ BaseModelOutputWithPast,
12
+ CausalLMOutputWithPast,
13
+ ModelOutput,
14
+ )
15
+ from transformers.processing_utils import Unpack
16
+ from transformers.utils import logging
17
+
18
+ from .backbone_custom_modeling_qwen3 import CustomQwen3ForCausalLM
19
+
20
+ try:
21
+ from torch.nn.attention.flex_attention import BlockMask
22
+ except ImportError:
23
+ BlockMask = None
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+
29
@dataclass
class EncoderBaseModelOutputWithPast(ModelOutput):
    """Custom (encoder) model output.
    Stores previous decoder and updated encoder cache and encoder last hidden state.
    """

    # Decoder-side cache, passed through unchanged by the encoder pass so
    # generation loops can round-trip both caches.
    past_key_values: Optional[Union[Tuple[Tuple[torch.FloatTensor]], DynamicCache]] = (
        None
    )
    # Last-layer hidden states for the clean (conditioning) tokens.
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    # Encoder KV cache updated with the latest encoder inputs.
    encoder_past_key_values: Optional[
        Union[Tuple[Tuple[torch.FloatTensor]], DynamicCache]
    ] = None
42
+
43
+
44
@dataclass
class DecoderCausalLMOutputWithPast(ModelOutput):
    """Custom (decoder) model output.
    Stores previous encoder and updated decoder cache and decoder logits.
    """

    # LM-head logits over the decoder (noisy-token) positions.
    logits: Optional[torch.FloatTensor] = None
    # Decoder KV cache (updated in place during the decoder pass).
    past_key_values: Optional[Union[Tuple[Tuple[torch.FloatTensor]], DynamicCache]] = (
        None
    )
    # Encoder cache, passed through unchanged for round-tripping.
    encoder_past_key_values: Optional[
        Union[Tuple[Tuple[torch.FloatTensor]], DynamicCache]
    ] = None
57
+
58
+
59
class LLMasEncoderDecoder(nn.Module):
    """Encoder-decoder model assembled from a pre-trained decoder-only LLM.

    The encoder processes "clean" (conditioning) tokens; the decoder prepends
    the encoder's last hidden states to its own input embeddings and attends
    to them via the ``q_start_idx`` mechanism of ``CustomQwen3ForCausalLM``.
    """

    def __init__(
        self,
        pretrained_model_name_or_path: str,
        max_length: int,
        attn_backend: str = "sdpa",
        freeze_encoder: bool = False,
        reinit_encoder: bool = False,
        reinit_decoder: bool = False,
        tie_encoder_decoder_weights: bool = False,
        use_encoder_causal_mask: bool = False,
        num_encoder_layers: int = -1,
        num_decoder_layers: int = -1,
        keep_top_encoder_layers: bool = False,
        keep_top_decoder_layers: bool = False,
        use_gradient_checkpointing: bool = False,
        **llm_init_kwargs,
    ):
        """Build the encoder and decoder stacks.

        Args:
            pretrained_model_name_or_path: HF hub id or local checkpoint path.
            max_length: Maximum sequence length, stored on the module.
            attn_backend: Attention implementation passed to the HF loader.
            freeze_encoder: Freeze encoder weights except the token embeddings.
            reinit_encoder / reinit_decoder: Random-init from config instead
                of loading pre-trained weights.
            tie_encoder_decoder_weights: Use the encoder object as the decoder
                (the top ``num_decoder_layers`` layers act as the decoder).
            use_encoder_causal_mask: Force the encoder's default causal mask.
            num_encoder_layers / num_decoder_layers: Layers to keep; -1 = all.
            keep_top_encoder_layers / keep_top_decoder_layers: Keep the last
                N layers instead of the first N when truncating.
            use_gradient_checkpointing: Enable HF gradient checkpointing.
            **llm_init_kwargs: Extra kwargs for the config/model loaders.
        """
        assert not (tie_encoder_decoder_weights and reinit_decoder), (
            "Cannot tie encoder-decoder weights and reinitialize decoder."
        )
        assert not (tie_encoder_decoder_weights and freeze_encoder), (
            "Cannot freeze encoder weights when tying encoder-decoder weights."
        )
        super().__init__()
        self.use_encoder_causal_mask = use_encoder_causal_mask
        self.tie_encoder_decoder_weights = tie_encoder_decoder_weights

        if reinit_encoder:
            assert num_encoder_layers > 0
            encoder_config = AutoConfig.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=True,
                num_hidden_layers=num_encoder_layers,
                attn_implementation=attn_backend,
                **llm_init_kwargs,
            )
            self.encoder = CustomQwen3ForCausalLM(encoder_config)
        else:
            self.encoder = CustomQwen3ForCausalLM.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=True,
                attn_implementation=attn_backend,
                **llm_init_kwargs,
            )
            assert num_encoder_layers <= len(self.encoder.model.layers), (
                f"Cannot keep {num_encoder_layers} layers. "
                f"Pre-trained model only has {len(self.encoder.model.layers)} layers."
            )
            num_encoder_layers = (
                len(self.encoder.model.layers)
                if num_encoder_layers == -1
                else num_encoder_layers
            )
            if keep_top_encoder_layers:
                self.encoder.model.layers = self.encoder.model.layers[
                    -num_encoder_layers:
                ]
            else:
                self.encoder.model.layers = self.encoder.model.layers[
                    :num_encoder_layers
                ]

        if freeze_encoder:
            # Token embeddings stay trainable even when the encoder is frozen.
            for name, param in self.encoder.named_parameters():
                if "embed_tokens" not in name:
                    param.requires_grad = False
        if use_gradient_checkpointing:
            self.encoder.gradient_checkpointing_enable()

        if tie_encoder_decoder_weights:
            self.decoder = self.encoder
            num_decoder_layers = (
                len(self.decoder.model.layers)
                if num_decoder_layers == -1
                else num_decoder_layers
            )
            assert num_decoder_layers <= len(self.decoder.model.layers), (
                f"Cannot keep {num_decoder_layers} layers. "
                f"Pre-trained model only has {len(self.decoder.model.layers)} layers."
            )
            # Keep **top** layers when tying weights
            self.decoder_layer_idxs = list(range(len(self.encoder.model.layers)))[
                -num_decoder_layers:
            ]

        else:
            if reinit_decoder:
                assert num_decoder_layers > 0
                decoder_config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path,
                    trust_remote_code=True,
                    num_hidden_layers=num_decoder_layers,
                    attn_implementation=attn_backend,
                    **llm_init_kwargs,
                )
                self.decoder = CustomQwen3ForCausalLM(decoder_config)
            else:
                self.decoder = CustomQwen3ForCausalLM.from_pretrained(
                    pretrained_model_name_or_path,
                    trust_remote_code=True,
                    attn_implementation=attn_backend,
                    **llm_init_kwargs,
                )
                # BUGFIX: the message previously read `self.decoder.layers`,
                # an attribute that does not exist (layers live on
                # `self.decoder.model`), so a failing assert raised
                # AttributeError instead of the intended message.
                assert num_decoder_layers <= len(self.decoder.model.layers), (
                    f"Cannot keep {num_decoder_layers} layers. "
                    f"Pre-trained model only has "
                    f"{len(self.decoder.model.layers)} layers."
                )
                if keep_top_decoder_layers:
                    self.decoder.model.layers = self.decoder.model.layers[
                        -num_decoder_layers:
                    ]
                else:
                    self.decoder.model.layers = self.decoder.model.layers[
                        :num_decoder_layers
                    ]
            # Decoder re-uses the encoder's embedding table (see `forward`,
            # which embeds via `self.encoder.model.embed_tokens`).
            del self.decoder.model.embed_tokens
            # if in the original LM, the lm_head is weight-tied to embedding,
            # point decoder lm_head to encoder's (instead of initializing separately)
            if (
                self.encoder.lm_head.weight.data_ptr()
                == self.encoder.model.embed_tokens.weight.data_ptr()
            ):
                self.decoder.lm_head = self.encoder.lm_head
            else:
                del self.encoder.lm_head
            if use_gradient_checkpointing:
                self.decoder.gradient_checkpointing_enable()
        self.max_length = max_length
188
+
189
+ def freeze_encoder(self):
190
+ for p in self.encoder.model.parameters():
191
+ p.requires_grad = False
192
+
193
+ def unfreeze_encoder(self):
194
+ for p in self.encoder.model.parameters():
195
+ p.requires_grad = True
196
+
197
    # noinspection PyUnusedLocal
    def forward(
        self,
        # Decoder inputs
        input_ids: torch.LongTensor,
        attention_mask: Optional[Union[torch.FloatTensor, BlockMask]] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        past_key_values: Optional[DynamicCache] = None,
        encoder_last_hidden_state: Optional[torch.FloatTensor] = None,
        # Encoder inputs
        encoder_input_ids: Optional[torch.LongTensor] = None,
        encoder_attention_mask: Optional[Union[torch.FloatTensor, BlockMask]] = None,
        encoder_position_ids: Optional[torch.LongTensor] = None,
        encoder_cache_position: Optional[torch.LongTensor] = None,
        encoder_past_key_values: Optional[DynamicCache] = None,
        # Additional args
        fix_cache_length: bool = True,  # Not used; compatibility with other backbones
        return_updated_cache: bool = False,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[DecoderCausalLMOutputWithPast, EncoderBaseModelOutputWithPast]:
        """Optionally encode clean tokens, then decode noisy tokens.

        Call patterns:
        1. ``encoder_input_ids`` + ``return_updated_cache=True``: run the
           encoder only; return its last hidden state and updated cache.
        2. ``encoder_input_ids`` (training/eval): encode, then decode with
           the fresh encoder states prepended to the decoder embeddings.
        3. ``encoder_last_hidden_state`` (generation): skip the encoder and
           decode against the provided states; their length is added to the
           retained decoder-cache length (``new_seen_tokens``).
        The decoder KV cache is updated in place by each layer and truncated
        afterwards so it keeps at most previous length + ``new_seen_tokens``.
        """
        # During training/eval encoder_last_hidden_state is None.
        # During generation encoder_last_hidden_state can be not None.
        # NOTE: new_seen_tokens reflects only a *passed-in*
        # encoder_last_hidden_state; states freshly computed below do not
        # count toward the retained cache length.
        new_seen_tokens = (
            0
            if encoder_last_hidden_state is None
            else encoder_last_hidden_state.shape[1]
        )
        # Encode clean tokens
        if encoder_input_ids is not None:
            if self.use_encoder_causal_mask:
                encoder_attention_mask = None  # None --> enforces use of causal mask
            if encoder_cache_position is None and encoder_position_ids is not None:
                encoder_cache_position = encoder_position_ids[0]
            encoder_output = self.encoder.model(
                input_ids=encoder_input_ids,
                attention_mask=encoder_attention_mask,
                position_ids=encoder_position_ids,
                use_cache=True,
                past_key_values=encoder_past_key_values,
                cache_position=encoder_cache_position,
            )
            if return_updated_cache:
                # encoder_output.past_key_values now contains latest encoder input
                return EncoderBaseModelOutputWithPast(
                    encoder_last_hidden_state=encoder_output.last_hidden_state,
                    encoder_past_key_values=encoder_output.past_key_values,
                    past_key_values=past_key_values,
                )
            encoder_last_hidden_state = encoder_output.last_hidden_state

        # Run decoder with xattn to clean token hidden states
        if encoder_last_hidden_state is None:  # No new encoder tokens
            q_start_idx = 0
            # Decoder shares the encoder's embedding table.
            decoder_hidden_states = self.encoder.model.embed_tokens(input_ids)
            if cache_position is None:
                if position_ids is not None:
                    cache_position = position_ids[0]
                else:
                    past_seen_tokens = (
                        past_key_values.get_seq_length()
                        if past_key_values is not None
                        else 0
                    )
                    cache_position = torch.arange(
                        past_seen_tokens,
                        past_seen_tokens + decoder_hidden_states.shape[1],
                        device=decoder_hidden_states.device,
                    )
            if position_ids is None:
                position_ids = cache_position.unsqueeze(0)
            decoder_position_embeddings = self.decoder.model.rotary_emb(
                decoder_hidden_states, position_ids
            )
        else:
            # Prepend encoder states; layers treat the first q_start_idx
            # positions as key/value-only context.
            q_start_idx = encoder_last_hidden_state.shape[1]
            decoder_hidden_states = self.encoder.model.embed_tokens(input_ids)
            decoder_hidden_states = torch.cat(
                [
                    encoder_last_hidden_state,
                    decoder_hidden_states,
                ],
                dim=1,
            )
            if cache_position is None:
                if position_ids is not None:
                    cache_position = position_ids[0]
                else:
                    past_seen_tokens = (
                        past_key_values.get_seq_length()
                        if past_key_values is not None
                        else 0
                    )
                    cache_position = torch.cat(
                        [
                            torch.arange(  # clean token position ids
                                past_seen_tokens,
                                past_seen_tokens + encoder_last_hidden_state.shape[1],
                                device=decoder_hidden_states.device,
                            ),
                            torch.arange(  # noisy position ids
                                past_seen_tokens + new_seen_tokens,
                                past_seen_tokens + new_seen_tokens + input_ids.shape[1],
                                device=decoder_hidden_states.device,
                            ),
                        ],
                        dim=-1,
                    )
            if position_ids is None:
                position_ids = cache_position.unsqueeze(0)
            decoder_position_embeddings = self.decoder.model.rotary_emb(
                decoder_hidden_states, position_ids
            )

        if hasattr(self.decoder.model, "_update_causal_mask"):  # bc on transformers
            # noinspection PyProtectedMember
            attention_mask = self.decoder.model._update_causal_mask(
                attention_mask=attention_mask,
                input_tensor=decoder_hidden_states,
                cache_position=cache_position,
                past_key_values=past_key_values,
                output_attentions=False,
            )
        for decoder_layer in self.decoder.model.layers:
            layer_idx = decoder_layer.self_attn.layer_idx
            # With tied weights only the designated top layers act as decoder.
            if (
                self.tie_encoder_decoder_weights
                and layer_idx not in self.decoder_layer_idxs
            ):
                continue
            # past_key_values gets updated in-place.
            # Record previous length to re-truncate after each layer forward
            if past_key_values is not None and len(past_key_values) > layer_idx:
                prev_cache_len = past_key_values[layer_idx][0].shape[-2]  # type: ignore
            else:
                prev_cache_len = 0
            cache_len = prev_cache_len + new_seen_tokens

            if self.decoder.model.gradient_checkpointing and self.training:
                # Positional args must match CustomQwen3DecoderLayer.forward.
                # noinspection PyProtectedMember
                decoder_hidden_states = self.decoder._gradient_checkpointing_func(
                    partial(decoder_layer.__call__, **flash_attn_kwargs),
                    decoder_hidden_states,  # hidden_states=,
                    attention_mask,  # attention_mask=,
                    position_ids,  # position_ids=,
                    past_key_values,  # past_key_value=,
                    False,  # output_attentions=,
                    True,  # use_cache=,
                    cache_position,  # cache_position=,
                    decoder_position_embeddings,  # position_embeddings=,
                    q_start_idx,  # q_start_idx=
                )[0]  # Shape: (input_ids.shape[0], input_ids.shape[1], hidden_dim)
            else:
                decoder_hidden_states = decoder_layer(
                    hidden_states=decoder_hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=False,
                    use_cache=True,
                    cache_position=cache_position,
                    position_embeddings=decoder_position_embeddings,
                    q_start_idx=q_start_idx,  # Indicates where to slice output
                    **flash_attn_kwargs,
                )[0]  # Shape: (input_ids.shape[0], input_ids.shape[1], hidden_dim)
            # Update decoder_hidden_states
            if q_start_idx > 0:
                # Layer output covers only the suffix; re-prepend the encoder
                # states so the next layer sees the full sequence again.
                decoder_hidden_states = torch.cat(
                    [
                        encoder_last_hidden_state,
                        decoder_hidden_states,
                    ],
                    dim=1,
                )

            if past_key_values is not None:
                # DynamicCache extends along sequence dimension by default;
                # truncate back to original cache len + encoder output length
                past_key_values.key_cache[layer_idx] = past_key_values.key_cache[
                    layer_idx
                ][..., :cache_len, :]
                past_key_values.value_cache[layer_idx] = past_key_values.value_cache[
                    layer_idx
                ][..., :cache_len, :]
        decoder_hidden_states = self.decoder.model.norm(
            decoder_hidden_states[:, q_start_idx:, :]
        )
        logits = self.decoder.lm_head(decoder_hidden_states)
        return DecoderCausalLMOutputWithPast(
            logits=logits,
            past_key_values=past_key_values,
            encoder_past_key_values=encoder_past_key_values,
            # Do not need to store encoder_last_hidden_state.
            # If it was passed in, then it has become part of the past_key_values cache.
        )
392
+
393
+
394
class LLMasEncoderDecoderShareKV(nn.Module):
    """Encoder-decoder built from a decoder-only LLM, sharing the KV cache.

    Variant of ``LLMasEncoderDecoder`` where the decoder consumes the KV
    cache produced by the encoder pass directly (see ``forward``), instead of
    attending to encoder hidden states prepended to its input.
    """

    def __init__(
        self,
        pretrained_model_name_or_path: str,
        max_length: int,
        attn_backend: str = "sdpa",
        freeze_encoder: bool = False,
        reinit_encoder: bool = False,
        reinit_decoder: bool = False,
        tie_encoder_decoder_weights: bool = False,
        use_encoder_causal_mask: bool = False,
        num_encoder_layers: int = -1,
        num_decoder_layers: int = -1,
        keep_top_encoder_layers: bool = False,
        keep_top_decoder_layers: bool = False,
        use_gradient_checkpointing: bool = False,
        **llm_init_kwargs,
    ):
        """Build the encoder and decoder stacks (stock HF causal-LM models).

        Args mirror ``LLMasEncoderDecoder.__init__``; both models are created
        through ``AutoModelForCausalLM`` rather than the custom Qwen3 class.
        """
        assert not (tie_encoder_decoder_weights and reinit_decoder), (
            "Cannot tie encoder-decoder weights and reinitialize decoder."
        )
        assert not (tie_encoder_decoder_weights and freeze_encoder), (
            "Cannot freeze encoder weights when tying encoder-decoder weights."
        )
        super().__init__()
        self.use_encoder_causal_mask = use_encoder_causal_mask
        self.tie_encoder_decoder_weights = tie_encoder_decoder_weights

        if reinit_encoder:
            assert num_encoder_layers > 0
            encoder_config = AutoConfig.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=True,
                num_hidden_layers=num_encoder_layers,
                attn_implementation=attn_backend,
                **llm_init_kwargs,
            )
            self.encoder = AutoModelForCausalLM.from_config(encoder_config)
        else:
            self.encoder = AutoModelForCausalLM.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=True,
                attn_implementation=attn_backend,
                **llm_init_kwargs,
            )
            assert num_encoder_layers <= len(self.encoder.model.layers), (
                f"Cannot keep {num_encoder_layers} layers. "
                f"Pre-trained model only has {len(self.encoder.model.layers)} layers."
            )
            num_encoder_layers = (
                len(self.encoder.model.layers)
                if num_encoder_layers == -1
                else num_encoder_layers
            )
            if keep_top_encoder_layers:
                self.encoder.model.layers = self.encoder.model.layers[
                    -num_encoder_layers:
                ]
            else:
                self.encoder.model.layers = self.encoder.model.layers[
                    :num_encoder_layers
                ]

        if freeze_encoder:
            # Token embeddings stay trainable even when the encoder is frozen.
            for name, param in self.encoder.named_parameters():
                if "embed_tokens" not in name:
                    param.requires_grad = False
        if use_gradient_checkpointing:
            self.encoder.gradient_checkpointing_enable()

        if tie_encoder_decoder_weights:
            self.decoder = self.encoder
            num_decoder_layers = (
                len(self.decoder.model.layers)
                if num_decoder_layers == -1
                else num_decoder_layers
            )
            assert num_decoder_layers <= len(self.decoder.model.layers), (
                f"Cannot keep {num_decoder_layers} layers. "
                f"Pre-trained model only has {len(self.decoder.model.layers)} layers."
            )
            # Keep **top** layers when tying weights
            self.decoder_layer_idxs = list(range(len(self.encoder.model.layers)))[
                -num_decoder_layers:
            ]

        else:
            if reinit_decoder:
                assert num_decoder_layers > 0
                decoder_config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path,
                    trust_remote_code=True,
                    num_hidden_layers=num_decoder_layers,
                    attn_implementation=attn_backend,
                    **llm_init_kwargs,
                )
                # BUGFIX: was `AutoModelForCausalLM(decoder_config)`, which
                # raises EnvironmentError — HF auto classes must be built via
                # `from_config` (as done for the encoder above).
                self.decoder = AutoModelForCausalLM.from_config(decoder_config)
            else:
                self.decoder = AutoModelForCausalLM.from_pretrained(
                    pretrained_model_name_or_path,
                    trust_remote_code=True,
                    attn_implementation=attn_backend,
                    **llm_init_kwargs,
                )
                # BUGFIX: the message previously read `self.decoder.layers`,
                # which does not exist (layers live on `self.decoder.model`),
                # so a failing assert raised AttributeError instead.
                assert num_decoder_layers <= len(self.decoder.model.layers), (
                    f"Cannot keep {num_decoder_layers} layers. "
                    f"Pre-trained model only has "
                    f"{len(self.decoder.model.layers)} layers."
                )
                if keep_top_decoder_layers:
                    self.decoder.model.layers = self.decoder.model.layers[
                        -num_decoder_layers:
                    ]
                else:
                    self.decoder.model.layers = self.decoder.model.layers[
                        :num_decoder_layers
                    ]
            del self.decoder.model.embed_tokens
            # Even for frozen encoder, ensure embedding tokens are trainable
            self.encoder.model.embed_tokens.requires_grad_(True)
            # Freeze the last encoder layer's modules that (per the
            # `unused_*` naming) the share-KV path does not exercise.
            unused_self_attn_params = ["o_proj", "q_norm", "q_proj"]
            unused_layernorm_params = ["input_layernorm", "post_attention_layernorm"]
            for unused_param in unused_self_attn_params:
                if hasattr(self.encoder.model.layers[-1].self_attn, unused_param):
                    getattr(
                        self.encoder.model.layers[-1].self_attn, unused_param
                    ).requires_grad_(False)
            self.encoder.model.layers[-1].mlp.requires_grad_(False)
            self.encoder.model.norm.requires_grad_(False)
            for unused_param in unused_layernorm_params:
                if hasattr(self.encoder.model.layers[-1], unused_param):
                    getattr(self.encoder.model.layers[-1], unused_param).requires_grad_(
                        False
                    )
            # if in the original LM, the lm_head is weight-tied to embedding,
            # point decoder lm_head to encoder's (instead of initializing separately)
            if (
                self.encoder.lm_head.weight.data_ptr()
                == self.encoder.model.embed_tokens.weight.data_ptr()
            ):
                self.decoder.lm_head = self.encoder.lm_head
            else:
                del self.encoder.lm_head
            if use_gradient_checkpointing:
                self.decoder.gradient_checkpointing_enable()
        self.max_length = max_length
539
+
540
+ def freeze_encoder(self):
541
+ for p in self.encoder.model.parameters():
542
+ p.requires_grad = False
543
+
544
+ def unfreeze_encoder(self):
545
+ for p in self.encoder.model.parameters():
546
+ p.requires_grad = True
547
+
548
    # noinspection PyUnusedLocal
    def forward(
        self,
        # Decoder inputs
        input_ids: torch.LongTensor,
        attention_mask: Optional[Union[torch.FloatTensor, BlockMask]] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        past_key_values: Optional[DynamicCache] = None,
        encoder_last_hidden_state: Optional[torch.FloatTensor] = None,  # Not used
        # Encoder inputs
        encoder_input_ids: Optional[torch.LongTensor] = None,
        encoder_attention_mask: Optional[Union[torch.FloatTensor, BlockMask]] = None,
        encoder_position_ids: Optional[torch.LongTensor] = None,
        encoder_cache_position: Optional[torch.LongTensor] = None,
        encoder_past_key_values: Optional[DynamicCache] = None,  # Not used
        # Additional args
        fix_cache_length: bool = True,  # Not used; compatibility with other backbones
        return_updated_cache: bool = False,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[CausalLMOutputWithPast, BaseModelOutputWithPast]:
        """Run the encoder (optionally) to extend the KV cache, then run the
        decoder layers with cross-attention into that cache and return logits.

        When `return_updated_cache` is True, only the refreshed cache is
        returned (no decoder pass output) wrapped in a BaseModelOutputWithPast.
        """
        # Encode clean tokens: the encoder writes its keys/values into
        # `past_key_values`, which the decoder layers then attend over.
        if encoder_input_ids is not None:
            if self.use_encoder_causal_mask:
                encoder_attention_mask = None  # None --> enforces use of causal mask
            if encoder_cache_position is None and encoder_position_ids is not None:
                encoder_cache_position = encoder_position_ids[0]
            past_key_values = self.encoder.model(
                input_ids=encoder_input_ids,
                attention_mask=encoder_attention_mask,
                position_ids=encoder_position_ids,
                use_cache=True,
                past_key_values=past_key_values,
                cache_position=encoder_cache_position,
            ).past_key_values
        if return_updated_cache:
            # past_key_values now contains latest encoder input
            # NOTE(review): reconstructed indentation places this check at method
            # level (so it also short-circuits when no encoder input was given) —
            # confirm against the original source.
            return BaseModelOutputWithPast(
                past_key_values=past_key_values,
            )

        # Run decoder with xattn to clean token hidden states.
        # Decoder shares the encoder's input embedding table.
        decoder_hidden_states = self.encoder.model.embed_tokens(input_ids)
        if cache_position is None:
            if position_ids is not None:
                cache_position = position_ids[0]
            else:  # During training / validation position_ids are not provided
                cache_position = torch.arange(
                    decoder_hidden_states.shape[1],
                    device=decoder_hidden_states.device,
                )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)
        decoder_position_embeddings = self.decoder.model.rotary_emb(
            decoder_hidden_states, position_ids
        )

        if hasattr(self.decoder.model, "_update_causal_mask"):  # bc on transformers
            # noinspection PyProtectedMember
            attention_mask = self.decoder.model._update_causal_mask(
                attention_mask=attention_mask,
                input_tensor=decoder_hidden_states,
                cache_position=cache_position,
                past_key_values=past_key_values,
                output_attentions=False,
            )
        for decoder_layer in self.decoder.model.layers:
            layer_idx = decoder_layer.self_attn.layer_idx
            if (
                self.tie_encoder_decoder_weights
                and layer_idx not in self.decoder_layer_idxs
            ):
                # When weights are tied, only a subset of layers acts as decoder.
                continue
            # past_key_values gets updated in-place.
            # Record previous length to truncate after each layer forward
            # (prev_cache_len is measured after the encoder pass, so it already
            # includes the encoder's cache entries).
            if past_key_values is not None and len(past_key_values) > layer_idx:
                prev_cache_len = past_key_values[layer_idx][0].shape[-2]  # type: ignore
            else:
                prev_cache_len = 0

            decoder_hidden_states = decoder_layer(
                hidden_states=decoder_hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=False,
                use_cache=True,
                cache_position=position_ids[0],
                position_embeddings=decoder_position_embeddings,
                **flash_attn_kwargs,
            )[0]  # Shape: (input_ids.shape[0], input_ids.shape[1], hidden_dim)

            if past_key_values is not None:
                # DynamicCache extends along sequence dimension by default;
                # truncate back to the length recorded before this layer so the
                # decoder's own (noisy) tokens are never persisted in the cache.
                past_key_values.key_cache[layer_idx] = past_key_values.key_cache[
                    layer_idx
                ][..., :prev_cache_len, :]
                past_key_values.value_cache[layer_idx] = past_key_values.value_cache[
                    layer_idx
                ][..., :prev_cache_len, :]
        decoder_hidden_states = self.decoder.model.norm(decoder_hidden_states)
        logits = self.decoder.lm_head(decoder_hidden_states)
        return CausalLMOutputWithPast(
            logits=logits,
            past_key_values=past_key_values,
        )
denoiser_base.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import inspect
3
+ import sys
4
+ from abc import ABC, abstractmethod
5
+ from collections import OrderedDict
6
+ from dataclasses import dataclass, field
7
+ from typing import Any, Dict, Optional, Tuple, Union
8
+
9
+ import hydra.utils
10
+ import torch
11
+ from hydra.errors import InstantiationException
12
+ from transformers import (
13
+ AutoTokenizer,
14
+ DynamicCache,
15
+ GenerationConfig,
16
+ LogitsProcessorList,
17
+ PretrainedConfig,
18
+ PreTrainedModel,
19
+ StoppingCriteriaList,
20
+ )
21
+ from transformers.cache_utils import Cache
22
+ from transformers.generation.utils import GenerateOutput
23
+ from transformers.modeling_outputs import ModelOutput
24
+
25
+ # Local imports not used, but added here so that HF push_to_hub adds them to model repo
26
+ # noinspection PyUnresolvedReferences
27
+ from .backbone_automodel import AutoModelFromPreTrained # noqa: F401
28
+ from .backbone_encoder_decoder import ( # noqa: F401
29
+ LLMasEncoderDecoder,
30
+ LLMasEncoderDecoderShareKV,
31
+ )
32
+ from .noise_schedule_noise_schedules import ( # noqa: F401
33
+ CosineNoise,
34
+ ExponentialNoise,
35
+ LinearNoise,
36
+ LogarithmicNoise,
37
+ )
38
+
39
+
40
@dataclass
class DenoiserInput(OrderedDict):
    """Input to the denoiser model."""

    xt: torch.LongTensor  # (B, L) token_ids (noised sequence)
    x0: Optional[torch.LongTensor] = None  # (B, L) token_ids (not used in gen.)
    attention_mask: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Union[torch.FloatTensor, Cache]] = None  # KV cache
    context_mask: Optional[torch.FloatTensor] = None  # indicator of context tokens
    tokens_mask: Optional[torch.FloatTensor] = None  # (B, L)
    t: Optional[torch.FloatTensor] = None  # (B,) | # (B, L)
    alpha_t: Optional[torch.FloatTensor] = None  # (B,) | (B, 1|L) | (B, 1|L, 1)
    alpha_t_prime: Optional[torch.FloatTensor] = None  # (B,) | (B, 1|L) | (B, 1|L, 1)
    # Extra keyword arguments forwarded verbatim to the backbone's forward.
    backbone_kwargs: dict[str, Any] = field(default_factory=dict)
54
+
55
+
56
@dataclass
class LossAndNllOutput(OrderedDict):
    """Loss output for denoiser models."""

    loss: torch.FloatTensor  # scalar training loss
    nlls: torch.FloatTensor  # per-token negative log-likelihoods
    # Any auxiliary loss terms a subclass wants to log/track.
    other_loss_terms: dict = field(default_factory=dict)
63
+
64
+
65
@dataclass
class DenoiserOutput(ModelOutput):
    """Output of the denoiser model."""

    denoiser_output: Optional[torch.FloatTensor] = None  # log-probs from _forward
    logits: Optional[torch.FloatTensor] = None  # raw backbone logits
    tokens_mask: Optional[torch.FloatTensor] = None  # Which tokens contribute to loss
    past_key_values: Optional[Cache] = None  # updated KV cache, if any
    loss: Optional[torch.FloatTensor] = None  # None when compute_loss=False
    nlls: Optional[torch.FloatTensor] = None  # None when compute_loss=False
    other_loss_terms: Optional[dict[str, Any]] = None
76
+
77
+
78
class DenoiserConfig(PretrainedConfig):
    """Configuration class for Denoiser models.

    This class is used to initialize the model and contains all the necessary
    parameters for the model's architecture.
    """

    model_type = "denoiser"

    def __init__(
        self,
        length: Optional[int] = None,
        backbone_config: Optional[Dict[str, Any]] = None,
        noise_config: Optional[Dict[str, Any]] = None,
        tokenization_config: Optional[Dict[str, Any]] = None,
        time_conditioned_backbone: Optional[bool] = None,
        attn_backend: str = "sdpa",  # "sdpa", "flash_attention_2", "flex_attention"
        train_on_context: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Mirror tokenization-derived attributes onto the config. A value from
        # tokenization_config wins when the attribute is unset or the key exists.
        for v in [
            "vocab_size",
            "mask_token_id",
            "pad_token_id",
            "bos_token_id",
            "eos_token_id",
            "pad_vocab_size_multiple",
        ]:
            if tokenization_config is not None and (
                getattr(self, v, None) is None or v in tokenization_config
            ):
                setattr(self, v, tokenization_config.get(v, None))
            else:
                # NOTE(review): this branch also runs when the attribute was
                # already set (e.g. via **kwargs through super().__init__) but
                # the key is absent from tokenization_config, overwriting the
                # existing value with None — confirm this clobbering is intended.
                setattr(self, v, None)
        self.backbone_config = backbone_config
        self.noise_config = noise_config
        self.tokenization_config = tokenization_config
        self.length = length
        self.time_conditioned_backbone = time_conditioned_backbone
        self.attn_backend = attn_backend
        self.train_on_context = train_on_context
120
+
121
+
122
class Denoiser(ABC, PreTrainedModel):
    """Abstract base class for denoising models.

    This class defines the interface for AR, Diffusion, and Flow-based
    parametrizations. Subclasses must implement `_prepare_inputs`,
    `_compute_loss`, and `generate`.
    """

    config_class = DenoiserConfig

    def __init__(
        self,
        config: DenoiserConfig,
        **kwargs,
    ):
        """
        Initialize the Denoiser with a configuration and optional dataset type.

        Parameters:
            config (DenoiserConfig): Configuration object for the model.
        """
        super().__init__(config)
        self.config = config
        # Mirror tokenization ids from the config for convenient access.
        self.vocab_size = config.vocab_size
        self.mask_token_id = config.mask_token_id
        self.pad_token_id = config.pad_token_id
        self.bos_token_id = config.bos_token_id
        self.eos_token_id = config.eos_token_id
        try:
            self.backbone = hydra.utils.instantiate(config.backbone_config)
        except InstantiationException:
            # When using HF and `from_pretrained`, the modules specified in `_target_`
            # fields in our configs are already being imported under a name with the
            # following format: transformers_modules.<repo_id>.<commit_id>.
            # When hydra attempts to instantiate and calls importlib under the hood, the
            # desired module is not found.
            # The snippet below aliases the desired module, enabling seamless use of
            # `hydra.utils.instantiate`.
            sys_modules = copy.deepcopy(list(sys.modules.keys()))
            repo_root_module = ".".join(__name__.split(".")[:-1])
            for name in sys_modules:
                if name.startswith(repo_root_module):
                    short = name.split(".")[-1]
                    if short not in sys.modules:
                        sys.modules[short] = sys.modules[name]
            del sys_modules
            self.backbone = hydra.utils.instantiate(config.backbone_config)
        # NOTE(review): `tokenizer_name` is not declared by DenoiserConfig.__init__;
        # presumably it arrives via **kwargs on the config — confirm.
        self.tokenizer = AutoTokenizer.from_pretrained(
            config.tokenizer_name,
            trust_remote_code=True,
        )
        self.noise_schedule = (
            hydra.utils.instantiate(config.noise_config)
            if config.noise_config is not None
            else None
        )
        # If not explicitly configured, infer time conditioning from whether the
        # backbone's forward signature accepts a `noise` argument.
        self.time_conditioned_backbone = (
            config.time_conditioned_backbone
            if config.time_conditioned_backbone is not None
            else "noise" in inspect.getfullargspec(self.backbone.forward).args
        )
        # List that can contain any parameters that should not be pushed to HF,
        # e.g., registered buffers for static attention masks
        self.skip_params_for_push = []

    @abstractmethod
    def _prepare_inputs(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        context_mask: Optional[torch.FloatTensor] = None,
        t: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
    ) -> DenoiserInput:
        """
        Prepare inputs for the model.

        Parameters:
            input_ids (LongTensor): Input tensor to the model.
            attention_mask (Optional[FloatTensor]): Attention mask for the model.
            context_mask (Optional[FloatTensor]): Indicator for context tokens.
            t (Optional[FloatTensor]): Time step for the model.
            past_key_values (Optional[Cache]): Past key values for the model.
        Returns:
            Denoiser inputs.
        """
        raise NotImplementedError("Denoiser subclasses must implement _prepare_inputs")

    def _prepare_inputs_inference(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        context: Optional[torch.LongTensor] = None,
        context_mask: Optional[torch.FloatTensor] = None,
        cache: Optional[Dict[str, Any]] = None,
        **backbone_kwargs: Any,
    ) -> Tuple[DenoiserInput, Dict[str, Any]]:
        """Prepare inputs at inference time (subclass responsibility).

        The commented-out reference implementation below sketches the intended
        default behavior for subclasses.
        """
        raise NotImplementedError(
            "Denoiser subclasses must implement _prepare_inputs_inference"
        )
        # assert input_ids is not None or context is not None, (
        #     "Must provide either input_ids or context."
        # )
        # cache = cache if cache is not None else {}
        # past_key_values = cache.pop("past_key_values", DynamicCache())
        # if context is not None:
        #     if input_ids is not None:
        #         if context_mask is None:
        #             context_mask = torch.cat(
        #                 [torch.ones_like(context), torch.zeros_like(input_ids)], dim=-1
        #             )
        #         input_ids = torch.cat([context, input_ids], dim=-1)
        #     else:
        #         input_ids = context
        #         context_mask = torch.ones_like(input_ids)
        # if attention_mask is None:
        #     cache_length = self._get_past_key_values_seq_length(past_key_values)
        #     full_seq_length = cache_length + input_ids.shape[-1]
        #     attention_mask = torch.ones(
        #         (input_ids.shape[0], 1, input_ids.shape[1], full_seq_length),
        #         device=input_ids.device,
        #     )  # Make attention mask 4D
        # attention_mask = self._preprocess_attention_mask(
        #     attention_mask, dtype=torch.float
        # )
        # return DenoiserInput(
        #     xt=input_ids,
        #     attention_mask=attention_mask,
        #     past_key_values=past_key_values,
        #     context_mask=context_mask,
        #     backbone_kwargs=backbone_kwargs,
        # ), cache

    @abstractmethod
    def _compute_loss(
        self,
        model_output: torch.FloatTensor,
        denoiser_inputs: DenoiserInput,
        **kwargs: Any,
    ) -> LossAndNllOutput:
        """
        Compute the loss for the denoising model.

        Parameters:
            model_output (FloatTensor): Output tensor from self.forward.
            denoiser_inputs (DenoiserInput): Inputs passed to the denoiser model.

        Returns:
            LossAndNllOutput: loss (FloatTensor) and nlls (FloatTensor).
        """
        raise NotImplementedError("Denoiser subclasses must implement _compute_loss")

    def _forward(
        self,
        backbone_output: torch.FloatTensor,
        denoiser_inputs: DenoiserInput,
        **kwargs: Any,
    ) -> torch.FloatTensor:
        """
        Forward pass for the denoiser model returns probabilities over denoised
        sequence.

        Some classes may need to override this method.

        Parameters:
            backbone_output (FloatTensor): Output tensor from the backbone model.
            denoiser_inputs (DenoiserInput): Inputs passed to the denoiser model.

        Returns:
            Model outputs (FloatTensor): log-probabilities over the vocabulary.
        """
        return torch.log_softmax(backbone_output, dim=-1)  # type: ignore

    def _backbone_forward(
        self,
        denoiser_inputs: DenoiserInput,
        **backbone_kwargs: Any,
    ) -> ModelOutput:
        """Forward pass for the backbone model (should return logits).

        Some classes may need to override this method.

        Parameters:
            denoiser_inputs (DenoiserInput): Inputs passed to the denoiser model.
            return_updated_cache (bool): If True, return past_key_values instead of
                logits.

        Returns:
            Backbone output (ModelOutput instance).
        """
        # Time-conditioned backbones additionally receive alpha_t as `noise`.
        if self.time_conditioned_backbone:
            return self.backbone(
                denoiser_inputs.xt,
                attention_mask=denoiser_inputs.attention_mask,
                past_key_values=denoiser_inputs.past_key_values,
                noise=denoiser_inputs.alpha_t,
                **denoiser_inputs.backbone_kwargs,
                **backbone_kwargs,
            )
        return self.backbone(
            denoiser_inputs.xt,
            attention_mask=denoiser_inputs.attention_mask,
            past_key_values=denoiser_inputs.past_key_values,
            **denoiser_inputs.backbone_kwargs,
            **backbone_kwargs,
        )

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        context_mask: Optional[torch.FloatTensor] = None,
        t: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        compute_loss: Optional[bool] = True,
        **kwargs,
    ) -> DenoiserOutput:
        """
        Perform a forward pass through the denoising model and
        (optionally) compute the loss.

        Parameters:
            input_ids (LongTensor): Input tensor to the model.
            attention_mask (Optional[FloatTensor]): Attention mask for the model.
            context_mask (Optional[FloatTensor]): Indicator for context tokens.
            t (Optional[FloatTensor]): Denoising time step for the model.
            past_key_values (Optional[Cache]): KV cache.
            compute_loss (Optional[bool]): Flag to compute loss.

        Returns:
            DenoiserOutput
        """
        denoiser_inputs = self._prepare_inputs(
            input_ids=input_ids,
            attention_mask=attention_mask,
            context_mask=context_mask,
            past_key_values=past_key_values,
            t=t,
        )

        backbone_output = self._backbone_forward(denoiser_inputs, **kwargs)
        # Extract cache and logits robustly from either a ModelOutput or tuple.
        new_past_key_values = getattr(backbone_output, "past_key_values", None)
        backbone_output = getattr(backbone_output, "logits", backbone_output[0])
        denoiser_output = self._forward(
            backbone_output,
            denoiser_inputs,
            **kwargs,
        )

        if compute_loss:
            loss_and_nll = self._compute_loss(
                model_output=denoiser_output, denoiser_inputs=denoiser_inputs, **kwargs
            )
            loss = loss_and_nll.loss
            nlls = loss_and_nll.nlls
            other_loss_terms = loss_and_nll.other_loss_terms
        else:
            loss, nlls = None, None
            other_loss_terms = {}

        return DenoiserOutput(
            denoiser_output=denoiser_output,
            logits=backbone_output,
            past_key_values=new_past_key_values,
            tokens_mask=denoiser_inputs.tokens_mask,
            loss=loss,
            nlls=nlls,
            other_loss_terms=other_loss_terms,
        )

    @staticmethod
    def _sample_categorical(categorical_probs, do_sample=True):
        """Helper function to sample from a categorical distribution.

        Uses the Gumbel-max trick: dividing probabilities by exponential noise
        and taking argmax is equivalent to sampling from the distribution.
        When do_sample is False, returns the argmax (greedy) instead.
        """
        categorical_probs = categorical_probs.to(torch.float64)
        if not do_sample:
            return categorical_probs.argmax(dim=-1)
        gumbel_norm = (1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log()).to(
            categorical_probs.dtype
        )
        return (categorical_probs / gumbel_norm).argmax(dim=-1)

    @staticmethod
    def _preprocess_attention_mask(attention_mask, dtype):
        """Convert a 0/1 attention mask into an additive mask in `dtype`:
        masked (0) positions become the dtype's most-negative value, kept
        positions become 0.0."""
        min_dtype = torch.finfo(dtype).min
        attention_mask = torch.where(
            (attention_mask == 0.0).bool(),  # type: ignore
            min_dtype,
            0.0,
        ).to(dtype)
        return attention_mask

    @staticmethod
    def _get_past_key_values_seq_length(past_key_values: DynamicCache):
        """Return the maximum cached sequence length across all layers
        (0 when the cache is empty)."""
        seq_length = 0
        for i in range(len(past_key_values)):
            if past_key_values[i][0].shape[0] > 0:  # type: ignore
                seq_length = max(
                    past_key_values[i][0].shape[-2],  # type: ignore
                    seq_length,
                )
        return seq_length

    def update_cache(
        self,
        inputs: torch.LongTensor,
        cache: Optional[Dict[str, Any]] = None,
        **backbone_kwargs: Any,
    ) -> Dict[str, Any]:
        """
        Cache the key-value pairs for the context.
        Args:
            inputs (torch.LongTensor): The context tensor.
            cache (Dict[str, Any] | None): Cache objects, e.g., past_key_values.
        Returns:
            Dict: Updated cache objects, e.g., past_key_values.
        """
        context_input, cache = self._prepare_inputs_inference(
            input_ids=inputs, cache=cache, return_updated_cache=True, **backbone_kwargs
        )
        backbone_output = self._backbone_forward(
            context_input,
            return_updated_cache=True,  # Will get absorbed in backbone_kwargs
            **cache,
        )
        backbone_output = {k: v for k, v in backbone_output.items()}
        backbone_output.pop("logits", None)  # Do not store logits in cache
        # Merge new cache entries over the existing ones (dict union).
        cache = cache | backbone_output
        return cache

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.LongTensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        max_length: Optional[int] = None,
        max_new_tokens: Optional[int] = None,
        batch_size: Optional[int] = None,
        device: Optional[str] = None,
        **kwargs: Any,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generates sample from denoising model.
        Follows signature of transformers.GenerationMixin.
        """
        raise NotImplementedError("Denoiser subclasses must implement generate")
diffusion.py CHANGED
@@ -21,7 +21,7 @@ except ImportError:
21
  BlockMask, and_masks, create_block_mask = None, None, None
22
 
23
 
24
- from src.denoiser.base import (
25
  Denoiser,
26
  DenoiserConfig,
27
  DenoiserInput,
 
21
  BlockMask, and_masks, create_block_mask = None, None, None
22
 
23
 
24
+ from .denoiser_base import (
25
  Denoiser,
26
  DenoiserConfig,
27
  DenoiserInput,
noise_schedule_noise_schedules.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+ import torch
4
+
5
+
6
class Noise(ABC):
    """
    Baseline forward method to get noise parameters at a timestep.

    Subclasses implement `__call__(t)` returning a pair; the concrete
    schedules in this module return `(alpha_t, alpha_t_prime)` where
    `alpha_t = 1 - move_chance`.
    """

    def __call__(
        self, t: torch.Tensor | float
    ) -> tuple[torch.Tensor | float, torch.Tensor | float]:
        # Assume time goes from 0 to 1
        pass

    @abstractmethod
    def inverse(self, alpha_t: torch.Tensor) -> torch.Tensor:
        """
        Inverse function to compute the timestep t from the noise schedule param.

        NOTE(review): because this is abstract, any subclass that does not
        override it cannot be instantiated (ABC rules) — confirm every
        schedule used in configs implements it.
        """
        raise NotImplementedError("Inverse function not implemented")
23
+
24
+
25
class CosineNoise(Noise):
    """Cosine noise schedule: alpha_t = (1 - eps) * cos(t * pi / 2).

    Fix: the abstract `inverse` was not implemented, so instantiating this
    class raised TypeError (ABC with unimplemented abstract method). The
    analytic inverse of the schedule is added below.
    """

    def __init__(self, eps=1e-3):
        super().__init__()
        self.eps = eps  # keeps alpha_t strictly below 1 at t=0
        self.name = "cosine"

    def inverse(self, alpha_t):
        """Return t such that alpha_t = (1 - eps) * cos(t * pi / 2).

        The ratio is clamped to [-1, 1] to guard arccos against values of
        alpha_t marginally outside the schedule's range.
        """
        ratio = torch.clamp(alpha_t / (1 - self.eps), -1.0, 1.0)
        return torch.arccos(ratio) * 2 / torch.pi

    def __call__(self, t):
        """Return (alpha_t, alpha_t_prime) at timestep t in [0, 1]."""
        t = t.to(torch.float32)
        cos = -(1 - self.eps) * torch.cos(t * torch.pi / 2)
        sin = -(1 - self.eps) * torch.sin(t * torch.pi / 2)
        move_chance = cos + 1
        # d/dt alpha_t = -(1 - eps) * sin(t * pi / 2) * pi / 2
        alpha_t_prime = sin * torch.pi / 2
        return 1 - move_chance, alpha_t_prime
38
+
39
+
40
class ExponentialNoise(Noise):
    """Polynomial ("exponential") schedule: alpha_t = 1 - clamp(t**exp, min=eps).

    Fixes:
    - `__call__` returned `(alpha_t_prime, alpha_t)`, swapped relative to every
      sibling schedule (Cosine/Logarithmic/Linear all return
      `(1 - move_chance, alpha_t_prime)`); the order is corrected.
    - The abstract `inverse` was not implemented, so instantiation raised
      TypeError; the analytic inverse is added.
    """

    def __init__(self, exp=2, eps=1e-3):
        super().__init__()
        self.eps = eps  # lower clamp on move_chance
        self.exp = exp  # polynomial degree of the schedule
        self.name = f"exp_{exp}"

    def inverse(self, alpha_t):
        """Return t such that alpha_t = 1 - t**exp (ignoring the eps clamp)."""
        return torch.pow(1 - alpha_t, 1.0 / self.exp)

    def __call__(self, t):
        """Return (alpha_t, alpha_t_prime) at timestep t in [0, 1]."""
        t = t.to(torch.float32)
        move_chance = torch.pow(t, self.exp)
        move_chance = torch.clamp(move_chance, min=self.eps)
        # d/dt alpha_t = -exp * t**(exp - 1)
        alpha_t_prime = -self.exp * torch.pow(t, self.exp - 1)
        return 1 - move_chance, alpha_t_prime
53
+
54
+
55
class LogarithmicNoise(Noise):
    """Logarithmic schedule: alpha_t = 1 - log2(1 + t).

    Fix: the abstract `inverse` was not implemented, so instantiating this
    class raised TypeError (ABC with unimplemented abstract method). The
    analytic inverse is added below.
    """

    def __init__(self, eps=1e-3):
        super().__init__()
        self.eps = eps  # NOTE(review): unused by this schedule — kept for API parity
        self.name = "logarithmic"

    def inverse(self, alpha_t):
        """Return t such that alpha_t = 1 - log2(1 + t), i.e. t = 2**(1 - alpha_t) - 1."""
        return torch.exp2(1 - alpha_t) - 1

    def __call__(self, t):
        """Return (alpha_t, alpha_t_prime) at timestep t in [0, 1]."""
        t = t.to(torch.float32)
        move_chance = torch.log1p(t) / torch.log(torch.tensor(2.0))
        # d/dt alpha_t = -1 / (ln(2) * (1 + t))
        alpha_t_prime = -1 / (torch.log(torch.tensor(2.0)) * (1 + t))
        return 1 - move_chance, alpha_t_prime
66
+
67
+
68
class LinearNoise(Noise):
    """Linear schedule: alpha_t = 1 - t, with constant derivative -1."""

    def __init__(self):
        super().__init__()
        self.name = "linear"

    def inverse(self, alpha_t):
        """Return t such that alpha_t = 1 - t."""
        return 1 - alpha_t

    def __call__(self, t):
        """Return (alpha_t, alpha_t_prime) at timestep t in [0, 1]."""
        t = t.to(torch.float32)
        return 1 - t, -torch.ones_like(t)