init

Browse files

Files changed (6) hide show

__init__.py +0 -0
config.json +22 -0
configuration_aria.py +57 -0
modeling_aria.py +748 -0
tokenization_aria.py +195 -0
tokenizer_config.json +11 -0

__init__.py ADDED Viewed

File without changes

config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "architectures": [
+    "AriaForCausalLM"
+  ],
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "hidden_size": 1536,
+  "intermediate_size": 6144,
+  "max_position_embeddings": 8192,
+  "model_type": "aria",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 16,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.0",
+  "use_cache": true,
+  "vocab_size": 17727,
+  "auto_map": {
+    "AutoConfig": "configuration_aria.AriaConfig",
+    "AutoModel": "modeling_aria.AriaModel",
+    "AutoModelForCausalLM": "modeling_aria.AriaForCausalLM"
+  }
+}

configuration_aria.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from transformers import PretrainedConfig
+class AriaConfig(PretrainedConfig):
+    model_type = "aria"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size: int = 17727,
+        hidden_size: int = 1536,
+        embedding_size: int | None = None,
+        num_hidden_layers: int = 16,
+        num_attention_heads: int = 64,
+        intermediate_size: int = 6144,
+        max_position_embeddings: int = 8192,
+        use_cache: bool = True,
+        bos_token_id: int = 0,
+        eos_token_id: int = 1,
+        tie_word_embeddings: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,
+        **kwargs,
+    ):
+        super().__init__(
+            bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
+        )
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.embedding_size = embedding_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+        self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+        self.return_dict = return_dict
+        if self.intermediate_size % self.hidden_size != 0:
+            raise ValueError(
+                "The intermediate size needs to be divisible by hidden size."
+            )
+        if self.hidden_size % self.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size needs to be divisible by the number of attention heads."
+            )
+    @property
+    def ff_mult(self):
+        return self.intermediate_size // self.hidden_size
+__all__ = ["AriaConfig"]

modeling_aria.py ADDED Viewed

	@@ -0,0 +1,748 @@

+# This is lightly adapted from https://github.com/EleutherAI/aria/blob/main/aria/model.py
+from typing import Optional, Union, Tuple
+import torch
+import torch.utils.checkpoint
+from torch import nn as nn
+from torch.nn import functional as F, CrossEntropyLoss
+from transformers import Cache, DynamicCache, StaticCache
+from transformers.utils import logging
+from transformers.generation import GenerationMixin
+from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    BaseModelOutputWithPoolingAndProjection,
+)
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from .configuration_aria import AriaConfig
+logger = logging.get_logger(__name__)
+class AriaPreTrainedModel(PreTrainedModel):
+    config_class = AriaConfig
+    base_model_prefix = "aria"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["AriaBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = False
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    _supports_sdpa = True
+    _supports_flex_attn = False
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range
+            )
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range
+            )
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+class TransformerBlock(nn.Module):
+    def __init__(self, model_config: AriaConfig, layer_idx: int):
+        super().__init__()
+        self.drop_p = 0.0
+        self.n_heads = model_config.num_attention_heads
+        self.d_model = model_config.hidden_size
+        self.d_head = (
+            model_config.hidden_size // model_config.num_attention_heads
+        )
+        self.max_seq_len = model_config.max_position_embeddings
+        self.layer_idx = layer_idx
+        # Attention
+        self.mixed_qkv = nn.Linear(
+            in_features=self.d_model,
+            out_features=3 * self.d_model,
+            bias=False,
+        )
+        self.att_proj_linear = nn.Linear(
+            in_features=self.d_model,
+            out_features=self.d_model,
+            bias=False,
+        )
+        # FF Layer
+        self.ff_gate_proj = nn.Linear(
+            in_features=self.d_model,
+            out_features=self.d_model * model_config.ff_mult,
+            bias=False,
+        )
+        self.ff_up_proj = nn.Linear(
+            in_features=self.d_model,
+            out_features=self.d_model * model_config.ff_mult,
+            bias=False,
+        )
+        self.ff_down_proj = nn.Linear(
+            in_features=self.d_model * model_config.ff_mult,
+            out_features=self.d_model,
+            bias=False,
+        )
+        # Pre layer norms
+        self.norm1 = nn.LayerNorm(self.d_model)
+        self.norm2 = nn.LayerNorm(self.d_model)
+    def forward(
+        self,
+        x: torch.Tensor,
+        attention_mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[
+            Union[Cache, Tuple[Tuple[torch.FloatTensor]]]
+        ] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ):
+        attn_output, attn_weights, present = self._att_block(
+            self.norm1(x),
+            attention_mask,
+            freqs_cis,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        x = x + attn_output
+        x = x + self._ff_block(self.norm2(x))
+        outputs = (x, present)
+        if use_cache:
+            outputs = (x, present, attn_weights)
+        else:
+            outputs = (x, attn_weights)
+        return outputs
+    def _att_block(
+        self,
+        x: torch.Tensor,
+        attention_mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        past_key_values: Optional[
+            Union[Cache, Tuple[Tuple[torch.FloatTensor]]]
+        ] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ):
+        batch_size, seq_len, _ = x.shape
+        mixed_qkv = self.mixed_qkv(x)
+        xq, xk, xv = mixed_qkv.chunk(3, -1)
+        # Reshape for rotary embeddings
+        # Need contiguous for q, k since in-place RoPE cannot be applied on a view
+        xq = xq.reshape(
+            batch_size, seq_len, self.n_heads, self.d_head
+        ).contiguous()
+        xk = xk.reshape(
+            batch_size, seq_len, self.n_heads, self.d_head
+        ).contiguous()
+        xv = xv.view(batch_size, seq_len, self.n_heads, self.d_head)
+        # apply_rotary_post_emb expects: (b_sz, s_len, n_head, d_head)
+        xq = apply_rotary_emb(xq, freqs_cis)
+        xk = apply_rotary_emb(xk, freqs_cis)
+        xq, xk, xv = map(lambda t: t.transpose(1, 2), (xq, xk, xv))
+        if past_key_values is not None:
+            cache_kwargs = {
+                # "sin": sin,
+                # "cos": cos,
+                # "partial_rotation_size": self.rotary_ndims,
+                "cache_position": cache_position,
+            }
+            xk, xv = past_key_values.update(
+                xk, xv, self.layer_idx, cache_kwargs
+            )
+        # scaled_dot_product_attention expects: (b_sz, n_head, s_len, d_head)
+        att = F.scaled_dot_product_attention(
+            query=xq,
+            key=xk,
+            value=xv,
+            attn_mask=attention_mask,
+            is_causal=True,
+        )
+        # Reshape for out: (b_sz, s_len, n_head, d_head)
+        out = att.transpose(1, 2).contiguous()
+        out = out.view(batch_size, seq_len, self.n_heads * self.d_head)
+        if not output_attentions:
+            att = None
+        return self.att_proj_linear(out), att, past_key_values
+    def _ff_block(self, x: torch.Tensor):
+        return self.ff_down_proj(
+            F.silu(self.ff_gate_proj(x)) * self.ff_up_proj(x)
+        )
+class AriaModel(AriaPreTrainedModel):
+    """Transformer decoder with no language model head.
+    Args:
+        model_config (ModelConfig): Model config settings.
+    """
+    def __init__(self, model_config: AriaConfig):
+        super().__init__(model_config)
+        self.model_config = model_config
+        self.freqs_cis = None
+        self.tok_embeddings = nn.Embedding(
+            num_embeddings=model_config.vocab_size,
+            embedding_dim=model_config.hidden_size,
+        )
+        self.out_layer_norm = nn.LayerNorm(model_config.hidden_size)
+        self.encode_layers = nn.ModuleList()
+        for i in range(model_config.num_hidden_layers):
+            self.encode_layers.append(TransformerBlock(model_config, i))
+        self.gradient_checkpointing = False
+        self.post_init()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: Optional[
+            Union[Cache, Tuple[Tuple[torch.FloatTensor]]]
+        ] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ):
+        """Forward pass of Transformer.
+        Args:
+            src (torch.tensor): Input to encoder block, of shape (batch_size,
+                seq_len, d_model).
+            attn_mask (Optional[torch.tensor]): Attention mask of shape
+                (batch_size, seq_len). Defaults to None.
+            past_kv (Optional[list[KVCache]]): a list of kv caches. The list index
+                corresponds to the layer index.
+        Returns:
+            torch.tensor: Model outputs with shape (batch_size, seq_len,
+                d_model).
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.model_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.model_config.output_hidden_states
+        )
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.model_config.use_return_dict
+        )
+        use_cache = (
+            use_cache if use_cache is not None else self.model_config.use_cache
+        )
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        if inputs_embeds is None:
+            inputs_embeds = self.tok_embeddings(input_ids)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(
+                    past_key_values
+                )
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+        seq_length = inputs_embeds.shape[1]
+        if cache_position is None:
+            past_seen_tokens = (
+                past_key_values.get_seq_length()
+                if past_key_values is not None
+                else 0
+            )
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + seq_length,
+                device=inputs_embeds.device,
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        hidden_states = inputs_embeds
+        causal_mask = self._update_causal_mask(
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            past_key_values,
+            output_attentions,
+        )
+        if self.freqs_cis is None:
+            self.freqs_cis = precompute_freqs_cis(
+                seq_len=self.model_config.max_position_embeddings,
+                n_elem=self.model_config.hidden_size
+                // self.model_config.num_attention_heads,
+                base=500000,
+                dtype=hidden_states.dtype,
+            ).to(input_ids.device)
+        freqs_cis = self.freqs_cis[: input_ids.shape[1]]
+        kwargs = {
+            "position_ids": position_ids,
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+            "output_attentions": output_attentions,
+            "output_hidden_states": output_hidden_states,
+            "return_dict": return_dict,
+            "cache_position": cache_position,
+        }
+        next_decoder_cache = None
+        if self.gradient_checkpointing:
+            for layer in self.encode_layers:
+                def create_custom_forward(module):
+                    def custom_forward(*args):
+                        return module(*args)[0]
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer),
+                    hidden_states,
+                    causal_mask,
+                    freqs_cis,
+                    **kwargs,
+                    preserve_rng_state=True,
+                    use_reentrant=True,
+                )
+        else:
+            all_attentions = () if output_attentions else None
+            all_hidden_states = () if output_hidden_states else None
+            for layer in self.encode_layers:
+                if output_hidden_states:
+                    all_hidden_states = all_hidden_states + (hidden_states,)
+                outputs = layer(
+                    hidden_states, causal_mask, freqs_cis=freqs_cis, **kwargs
+                )
+                hidden_states = outputs[0]
+                if use_cache is True:
+                    next_decoder_cache = outputs[1]
+                if output_attentions:
+                    all_attentions = all_attentions + (
+                        outputs[2 if use_cache else 1],
+                    )
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+        hidden_states = self.out_layer_norm(hidden_states)
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_cache,
+                    all_hidden_states,
+                    all_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        if self.model_config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = (
+            past_key_values.get_seq_length()
+            if past_key_values is not None
+            else 0
+        )
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if (
+            self.model_config._attn_implementation == "sdpa"
+            and not using_static_cache
+            and not output_attentions
+        ):
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+        dtype, device = input_tensor.dtype, input_tensor.device
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = (
+            self._prepare_4d_causal_attention_mask_with_cache_position(
+                attention_mask,
+                sequence_length=sequence_length,
+                target_length=target_length,
+                dtype=dtype,
+                device=device,
+                cache_position=cache_position,
+                batch_size=input_tensor.shape[0],
+            )
+        )
+        if (
+            self.model_config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(
+                causal_mask, min_dtype
+            )
+        return causal_mask
+    @staticmethod
+    # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length),
+                fill_value=min_dtype,
+                dtype=dtype,
+                device=device,
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(
+                target_length, device=device
+            ) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(
+                batch_size, 1, -1, -1
+            )
+            if attention_mask is not None:
+                causal_mask = (
+                    causal_mask.clone()
+                )  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = (
+                    causal_mask[:, :, :, :mask_length]
+                    + attention_mask[:, None, None, :]
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[
+                    :, :, :, :mask_length
+                ].masked_fill(padding_mask, min_dtype)
+        return causal_mask
+class AriaForCausalLM(AriaPreTrainedModel, GenerationMixin):
+    """Transformer decoder with head for language modelling.
+    Args:
+        model_config (ModelConfig): Model config settings.
+    """
+    def __init__(self, model_config: AriaConfig):
+        super().__init__(model_config)
+        self.model_config = model_config
+        self.max_seq_len = model_config.max_position_embeddings
+        self.model = AriaModel(model_config)
+        self.lm_head = nn.Linear(
+            model_config.hidden_size, model_config.vocab_size, bias=False
+        )
+        self.post_init()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: Optional[
+            Union[Cache, Tuple[Tuple[torch.FloatTensor]]]
+        ] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ):
+        """Forward pass of Transformer decoder with LM head."""
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.model_config.use_return_dict
+        )
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden = outputs[0]
+        lm_logits = self.lm_head(hidden)
+        lm_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shift_logits = lm_logits[:, :-1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)
+            )
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+class AriaForSequenceEmbeddings(AriaPreTrainedModel):
+    """Transformer decoder embedding head for contrastive learning.
+    Args:
+        model_config (ModelConfig): Model config settings.
+    """
+    def __init__(self, model_config: AriaConfig):
+        super().__init__(model_config)
+        assert model_config.embedding_size
+        self.model_config = model_config
+        self.max_seq_len = model_config.max_position_embeddings
+        self.model = AriaModel(model_config)
+        self.emb_head = nn.Linear(
+            model_config.hidden_size, model_config.embedding_size, bias=False
+        )
+        self.post_init()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: Optional[
+            Union[Cache, Tuple[Tuple[torch.FloatTensor]]]
+        ] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ):
+        """Forward pass of Transformer decoder with embedding head. Pooled
+        embedding is extracted from EOS token."""
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.model_config.use_return_dict
+        )
+        if (
+            position_ids is not None
+            or inputs_embeds is not None
+            or past_key_values is not None
+            or labels is not None
+            or cache_position is not None
+            or use_cache
+        ):
+            raise ValueError("Provided args unsupported for embedding head")
+        _batch_size = input_ids.shape[0]
+        eos_mask = input_ids == self.config.eos_token_id
+        if not eos_mask.any(dim=1).all():
+            raise ValueError(
+                "Each sequence must contain at least one EOS token"
+            )
+        eos_pos = eos_mask.int().argmax(dim=1)
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            use_cache=False,
+        )
+        hidden = outputs[0]
+        embedding = self.emb_head(hidden)
+        pooled_embedding = embedding[
+            torch.arange(_batch_size, device=input_ids.device), eos_pos
+        ]
+        if not return_dict:
+            output = (pooled_embedding,) + outputs[1:]
+            return output
+        return BaseModelOutputWithPoolingAndProjection(
+            last_hidden_state=embedding,
+            pooler_output=pooled_embedding,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+def precompute_freqs_cis(
+    seq_len: int,
+    n_elem: int,
+    base: int = 500000,
+    dtype: torch.dtype = torch.bfloat16,
+):
+    freqs = 1.0 / (
+        base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
+    )
+    t = torch.arange(seq_len, device=freqs.device)
+    freqs = torch.outer(t, freqs)
+    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+    cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
+    return cache.to(dtype=dtype)
+@torch.jit.script
+def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
+    """
+    In-place RoPE. Credits to Katherine Crowson:
+    x shape (b_sz, s_len, n_head, d_head).
+    cos, sin shape (s_len, d_head // 2).
+    freqs_cis carries cos at index 0 and sin at index 1 of its last axis.
+    The two halves of the last axis of x are rotated as a pair; x itself is
+    modified and returned, so callers must not rely on its previous values.
+    """
+    d = x.shape[-1] // 2
+    # Insert batch and head axes so cos/sin broadcast against x.
+    cos = freqs_cis[..., 0][None, :, None]
+    sin = freqs_cis[..., 1][None, :, None]
+    x1, x2 = x[..., :d], x[..., d : d * 2]
+    # x1 is overwritten first, so keep its original values for the x2 update.
+    tmp = x1.clone()
+    x1.mul_(cos).addcmul_(x2, sin, value=-1)
+    x2.mul_(cos).addcmul_(tmp, sin, value=1)
+    return x
+__all__ = [
+    "AriaPreTrainedModel",
+    "AriaModel",
+    "TransformerBlock",
+    "AriaForCausalLM",
+    "AriaForSequenceEmbeddings",
+]

tokenization_aria.py ADDED Viewed

	@@ -0,0 +1,195 @@

+from typing import TYPE_CHECKING, List, Optional, Tuple
+from transformers.tokenization_utils import PreTrainedTokenizer, BatchEncoding
+from transformers.utils import logging, TensorType, to_py_obj
+try:
+    from ariautils.midi import MidiDict
+    from ariautils.tokenizer import AbsTokenizer
+    from ariautils.tokenizer._base import Token
+except ImportError:
+    raise ImportError(
+        "ariautils is not installed. Please try `pip install git+https://github.com/EleutherAI/aria-utils.git`."
+    )
+if TYPE_CHECKING:
+    pass
+logger = logging.get_logger(__name__)
+class AriaTokenizer(PreTrainedTokenizer):
+    """
+    Aria Tokenizer is NOT a BPE tokenizer. A midi file will be converted to a MidiDict (note: in fact, a MidiDict is not a single dict. It is more about a list of "notes") which represents a sequence of notes, stops, etc. And then, aria tokenizer is simply a dictionary that maps MidiDict to discrete indices according to a hard-coded rule.
+    For a FIM finetuned model, we also follow a simple FIM format to guide a piece of music to a (possibly very different) suffix according to the prompts:
+    <GUIDANCE-START> ... <GUIDANCE-END> <S> <PROMPT-START> ... <PROMPT-END>
+    This way, we expect a continuation that connects PROMPT and GUIDANCE.
+    """
+    vocab_files_names = {}
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        add_bos_token=True,
+        add_eos_token=True,
+        add_dim_token=True,
+        clean_up_tokenization_spaces=False,
+        use_default_system_prompt=False,
+        **kwargs,
+    ):
+        self._tokenizer = AbsTokenizer()
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.add_dim_token = add_dim_token
+        self.use_default_system_prompt = use_default_system_prompt
+        bos_token = self._tokenizer.bos_tok
+        eos_token = self._tokenizer.eos_tok
+        pad_token = self._tokenizer.pad_tok
+        unk_token = self._tokenizer.unk_tok
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            use_default_system_prompt=use_default_system_prompt,
+            **kwargs,
+        )
+    def __getstate__(self):
+        return {}
+    def __setstate__(self, d):
+        raise NotImplementedError()
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self._tokenizer.vocab_size
+    def get_vocab(self):
+        return self._tokenizer.tok_to_id
+    def tokenize(
+        self,
+        midi_dict: MidiDict,
+        add_dim_tok: Optional[bool] = None,
+        add_eos_tok: Optional[bool] = None,
+        **kwargs,
+    ) -> List[Token]:
+        return self._tokenizer.tokenize(
+            midi_dict=midi_dict,
+            add_dim_tok=(
+                add_dim_tok if add_dim_tok is not None else self.add_dim_token
+            ),
+            add_eos_tok=(
+                add_eos_tok if add_eos_tok is not None else self.add_eos_token
+            ),
+        )
+    def _tokenize(
+        self,
+        midi_dict: MidiDict,
+        add_dim_tok: Optional[bool] = None,
+        add_eos_tok: Optional[bool] = None,
+        **kwargs,
+    ) -> List[Token]:
+        return self._tokenizer.tokenize(
+            midi_dict=midi_dict,
+            add_dim_tok=(
+                add_dim_tok if add_dim_tok is not None else self.add_dim_token
+            ),
+            add_eos_tok=(
+                add_eos_tok if add_eos_tok is not None else self.add_eos_token
+            ),
+        )
+    def __call__(
+        self,
+        midi_dicts: MidiDict | list[MidiDict],
+        padding: bool = False,
+        max_length: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        return_tensors: str | TensorType | None = None,
+        return_attention_mask: bool | None = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """It is impossible to rely on the parent method because the inputs are MidiDict(s) instead of strings. I do not like the idea of going hacky so that two entirely different types of inputs can marry. So here I reimplement __call__ with limited support of certain useful arguments. I do not expect any conflict with other "string-in-ids-out" tokenizers. If you have to mix up the API of string-based tokenizers and our midi-based tokenizer, there must be a problem with your design."""
+        if isinstance(midi_dicts, MidiDict):
+            midi_dicts = [midi_dicts]
+        all_tokens: list[list[int]] = []
+        all_attn_masks: list[list[int]] = []
+        max_len_encoded = 0
+        for md in midi_dicts:
+            tokens = self._tokenizer.encode(self._tokenizer.tokenize(md))
+            if max_length is not None:
+                tokens = tokens[:max_length]
+            max_len_encoded = max(max_len_encoded, len(tokens))
+            all_tokens.append(tokens)
+            all_attn_masks.append([True] * len(tokens))
+        if pad_to_multiple_of is not None:
+            max_len_encoded = (
+                (max_len_encoded + pad_to_multiple_of) // pad_to_multiple_of
+            ) * pad_to_multiple_of
+        if padding:
+            for tokens, attn_mask in zip(all_tokens, all_attn_masks):
+                tokens.extend(
+                    [self._tokenizer.pad_id] * (max_len_encoded - len(tokens))
+                )
+                attn_mask.extend([False] * (max_len_encoded - len(tokens)))
+        return BatchEncoding(
+            {
+                "input_ids": all_tokens,
+                "attention_masks": all_attn_masks,
+            },
+            tensor_type=return_tensors,
+        )
+    def decode(self, token_ids: List[int], **kwargs) -> MidiDict:
+        token_ids = to_py_obj(token_ids)
+        return self._tokenizer.detokenize(self._tokenizer.decode(token_ids))
+    def batch_decode(
+        self, token_ids_list: List[List[Token]], **kwargs
+    ) -> List[MidiDict]:
+        results = []
+        for token_ids in token_ids_list:
+            results.append(self.decode(token_ids))
+        return results
+    def encode_from_file(self, filename: str, **kwargs) -> BatchEncoding:
+        midi_dict = MidiDict.from_midi(filename)
+        return self(midi_dict, **kwargs)
+    def encode_from_files(
+        self, filenames: list[str], **kwargs
+    ) -> BatchEncoding:
+        midi_dicts = [MidiDict.from_midi(file) for file in filenames]
+        return self(midi_dicts, **kwargs)
+    def _convert_token_to_id(self, token: Token):
+        """Converts a token (tuple or str) into an id."""
+        return self._tokenizer.tok_to_id.get(
+            token, self._tokenizer.tok_to_id[self.unk_token]
+        )
+    def _convert_id_to_token(self, index: int):
+        """Converts an index (integer) in a token (tuple or str)."""
+        return self._tokenizer.id_to_tok.get(index, self.unk_token)
+    def convert_tokens_to_string(self, tokens: List[Token]) -> MidiDict:
+        """Converts a sequence of tokens into a single MidiDict."""
+        return self._tokenizer.detokenize(tokens)
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        raise NotImplementedError()

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_aria.AriaTokenizer",
+      null
+    ]
+  },
+  "tokenizer_class": "AriaTokenizer"
+}