Add files using upload-large-folder tool

Browse files

Files changed (9) hide show

README.md +50 -3
config.json +119 -0
configuration_glmasr.py +43 -0
inference.py +182 -0
model.safetensors +3 -0
modeling_audio.py +415 -0
modeling_glmasr.py +149 -0
tokenizer.json +0 -0
tokenizer_config.json +172 -0

README.md CHANGED Viewed

@@ -1,3 +1,50 @@
----
-license: mit
----

+# GLM-ASR-Nano-2512
+<div align="center">
+<img src="https://raw.githubusercontent.com/zai-org/GLM-ASR/refs/heads/main/resources/logo.svg" width="20%"/>
+</div>
+<p align="center">
+    👋 Join our <a href="#" target="_blank">Discord</a> community.
+    <br>
+    🚀 Experience the demo on <a href="#" target="_blank">Hugging Face Spaces</a>.
+    <br>
+    📦 Download model weights on <a href="#" target="_blank">Hugging Face</a> or <a href="#" target="_blank">ModelScope</a>.
+</p>
+## Model Introduction
+**GLM-ASR-Nano-2512** is a robust, open-source speech recognition model with **1.5B parameters**. Designed for
+real-world complexity, it outperforms OpenAI Whisper V3 on multiple benchmarks while maintaining a compact size.
+Key capabilities include:
+* **Exceptional Dialect Support:**
+  Beyond standard Mandarin and English, the model is highly optimized for **Cantonese (粤语)** and other dialects,
+  effectively bridging the gap in dialectal speech recognition.
+* **Low-Volume Speech Robustness:**
+  Specifically trained for **"Whisper/Quiet Speech"** scenarios. It captures and accurately transcribes extremely
+  low-volume audio that traditional models often miss.
+* **SOTA Performance:**
+  Achieves the **lowest average error rate (4.10)** among comparable open-source models, showing significant advantages
+  in Chinese benchmarks (Wenet Meeting, Aishell-1, etc.).
+## Benchmark
+We evaluated GLM-ASR-Nano against leading open-source and closed-source models. The results demonstrate that
+**GLM-ASR-Nano (1.5B)** achieves superior performance, particularly in challenging acoustic environments.
+![Benchmark results](https://raw.githubusercontent.com/zai-org/GLM-ASR/refs/heads/main/resources/bench.png)
+Notes:
+- Wenet Meeting reflects real-world meeting scenarios with noise and overlapping speech.
+- Aishell-1 is a standard Mandarin benchmark.
+## Inference
+`GLM-ASR-Nano-2512` can be easily integrated using the `transformers` library.
+We will support `transformers 5.x` as well as inference frameworks such as `vLLM` and `SGLang`.
+You can find more code on [GitHub](https://github.com/zai-org/GLM-ASR).

config.json ADDED Viewed

	@@ -0,0 +1,119 @@

+{
+  "_name_or_path": "zai-org/GLM-ASR-Nano-2512",
+  "model_type": "glmasr",
+  "architectures": [
+    "GlmasrModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_glmasr.GlmasrConfig",
+    "AutoModelForCausalLM": "modeling_glmasr.GlmasrModel"
+  },
+  "torch_dtype": "bfloat16",
+  "attn_implementation": "flash_attention_2",
+  "lm_config": {
+    "architectures": [
+      "LlamaForCausalLM"
+    ],
+    "do_sample": false,
+    "eos_token_id": [
+      59246,
+      59253,
+      59255
+    ],
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 8192,
+    "min_length": 0,
+    "model_type": "llama",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "num_return_sequences": 1,
+    "pad_token_id": 59260,
+    "return_dict": true,
+    "rms_norm_eps": 1e-05,
+    "rope_dim": 128,
+    "rope_theta": 10000.0,
+    "torch_dtype": "float16",
+    "typical_p": 1.0,
+    "vocab_size": 59264
+  },
+  "whisper_config": {
+    "activation_function": "gelu",
+    "architectures": [
+      "WhisperForConditionalGeneration"
+    ],
+    "begin_suppress_tokens": [
+      220,
+      50257
+    ],
+    "bos_token_id": 50257,
+    "chunk_size_feed_forward": 0,
+    "classifier_proj_size": 256,
+    "d_model": 1280,
+    "decoder_attention_heads": 20,
+    "decoder_ffn_dim": 5120,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 32,
+    "decoder_start_token_id": 50258,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_attention_heads": 20,
+    "encoder_ffn_dim": 5120,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 32,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 50257,
+    "init_std": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": true,
+    "length_penalty": 1.0,
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_prob": 0.05,
+    "max_length": 448,
+    "max_source_positions": 1500,
+    "max_target_positions": 448,
+    "median_filter_width": 7,
+    "min_length": 0,
+    "model_type": "whisper",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 32,
+    "num_mel_bins": 128,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 50256,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_cache": true,
+    "use_weighted_layer_sum": false,
+    "vocab_size": 51866
+  },
+  "adapter_type": "mlp",
+  "merge_factor": 4,
+  "use_rope": true,
+  "max_whisper_length": 1500,
+  "max_length": 65536,
+  "mlp_adapter_act": "gelu",
+  "transformers_version": "4.51.3"
+}

configuration_glmasr.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from typing import Any, Dict, List, Optional
+from transformers import LlamaConfig, PretrainedConfig, WhisperConfig
+class GlmasrConfig(PretrainedConfig):
+    model_type = "Glmasr"
+    is_composition = True
+    def __init__(
+        self,
+        lm_config: Optional[Dict[str, Any] | LlamaConfig] = None,
+        whisper_config: Optional[Dict[str, Any] | WhisperConfig] = None,
+        adapter_type: str = "mlp",
+        merge_factor: int = 2,
+        spec_aug: bool = False,
+        use_rope: bool = False,
+        max_whisper_length: int = 1500,
+        max_length: int = 1024,
+        mlp_adapter_act: str = "gelu",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if isinstance(lm_config, LlamaConfig):
+            self.lm_config = lm_config
+        else:
+            self.lm_config = LlamaConfig.from_dict(lm_config or {})
+        if isinstance(whisper_config, WhisperConfig):
+            self.whisper_config = whisper_config
+        else:
+            self.whisper_config = WhisperConfig.from_dict(whisper_config or {})
+        self.adapter_type = adapter_type
+        self.merge_factor = merge_factor
+        self.spec_aug = spec_aug
+        self.use_rope = use_rope
+        self.max_whisper_length = max_whisper_length
+        self.max_length = max_length
+        self.mlp_adapter_act = mlp_adapter_act
+__all__ = ["GlmasrConfig"]

inference.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import argparse
+from pathlib import Path
+import torch
+import torchaudio
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    WhisperFeatureExtractor,
+)
+# Keyword arguments used to construct the `WhisperFeatureExtractor` in `transcribe`.
+# hop_length 160 at sampling_rate 16000 gives 100 mel frames per second of audio.
+WHISPER_FEAT_CFG = {
+    "chunk_length": 30,
+    "feature_extractor_type": "WhisperFeatureExtractor",
+    "feature_size": 128,
+    "hop_length": 160,
+    "n_fft": 400,
+    "n_samples": 480000,
+    "nb_max_frames": 3000,
+    "padding_side": "right",
+    "padding_value": 0.0,
+    "processor_class": "WhisperProcessor",
+    "return_attention_mask": False,
+    "sampling_rate": 16000,
+}
+def get_audio_token_length(seconds, merge_factor=2):
+    def get_T_after_cnn(L_in, dilation=1):
+        for padding, kernel_size, stride in eval("[(1,3,1)] + [(1,3,2)] "):
+            L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
+            L_out = 1 + L_out // stride
+            L_in = L_out
+        return L_out
+    mel_len = int(seconds * 100)
+    audio_len_after_cnn = get_T_after_cnn(mel_len)
+    audio_token_num = (audio_len_after_cnn - merge_factor) // merge_factor + 1
+    # TODO: current whisper model can't process longer sequence, maybe cut chunk in the future
+    audio_token_num = min(audio_token_num, 1500 // merge_factor)
+    return audio_token_num
+def build_prompt(
+    audio_path: Path,
+    tokenizer,
+    feature_extractor: WhisperFeatureExtractor,
+    merge_factor: int,
+    chunk_seconds: int = 30,
+) -> dict:
+    audio_path = Path(audio_path)
+    wav, sr = torchaudio.load(str(audio_path))
+    wav = wav[:1, :]
+    if sr != feature_extractor.sampling_rate:
+        wav = torchaudio.transforms.Resample(sr, feature_extractor.sampling_rate)(wav)
+    tokens = []
+    tokens += tokenizer.encode("<|user|>")
+    tokens += tokenizer.encode("\n")
+    audios = []
+    audio_offsets = []
+    audio_length = []
+    chunk_size = chunk_seconds * feature_extractor.sampling_rate
+    for start in range(0, wav.shape[1], chunk_size):
+        chunk = wav[:, start : start + chunk_size]
+        mel = feature_extractor(
+            chunk.numpy(),
+            sampling_rate=feature_extractor.sampling_rate,
+            return_tensors="pt",
+            padding="max_length",
+        )["input_features"]
+        audios.append(mel)
+        seconds = chunk.shape[1] / feature_extractor.sampling_rate
+        num_tokens = get_audio_token_length(seconds, merge_factor)
+        tokens += tokenizer.encode("<|begin_of_audio|>")
+        audio_offsets.append(len(tokens))
+        tokens += [0] * num_tokens
+        tokens += tokenizer.encode("<|end_of_audio|>")
+        audio_length.append(num_tokens)
+    if not audios:
+        raise ValueError("音频内容为空或加载失败。")
+    tokens += tokenizer.encode("<|user|>")
+    tokens += tokenizer.encode("\nPlease transcribe this audio into text")
+    tokens += tokenizer.encode("<|assistant|>")
+    tokens += tokenizer.encode("\n")
+    batch = {
+        "input_ids": torch.tensor([tokens], dtype=torch.long),
+        "audios": torch.cat(audios, dim=0),
+        "audio_offsets": [audio_offsets],
+        "audio_length": [audio_length],
+        "attention_mask": torch.ones(1, len(tokens), dtype=torch.long),
+    }
+    return batch
+def prepare_inputs(batch: dict, device: torch.device) -> tuple[dict, int]:
+    tokens = batch["input_ids"].to(device)
+    attention_mask = batch["attention_mask"].to(device)
+    audios = batch["audios"].to(device)
+    model_inputs = {
+        "inputs": tokens,
+        "attention_mask": attention_mask,
+        "audios": audios.to(torch.bfloat16),
+        "audio_offsets": batch["audio_offsets"],
+        "audio_length": batch["audio_length"],
+    }
+    return model_inputs, tokens.size(1)
+def transcribe(
+    checkpoint_dir: Path,
+    audio_path: Path,
+    tokenizer_path: str | None,
+    max_new_tokens: int,
+    device: str,
+):
+    tokenizer_source = tokenizer_path if tokenizer_path else checkpoint_dir
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
+    feature_extractor = WhisperFeatureExtractor(**WHISPER_FEAT_CFG)
+    config = AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        checkpoint_dir,
+        config=config,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+    ).to(device)
+    model.eval()
+    batch = build_prompt(
+        audio_path,
+        tokenizer,
+        feature_extractor,
+        merge_factor=config.merge_factor,
+    )
+    model_inputs, prompt_len = prepare_inputs(batch, device)
+    with torch.inference_mode():
+        generated = model.generate(
+            **model_inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=False,
+        )
+    transcript_ids = generated[0, prompt_len:].cpu().tolist()
+    transcript = tokenizer.decode(transcript_ids, skip_special_tokens=True).strip()
+    print("----------")
+    print(transcript or "[Empty transcription]")
+def main():
+    parser = argparse.ArgumentParser(description="Minimal ASR transcription demo.")
+    parser.add_argument("--checkpoint_dir", type=str, default=str(Path(__file__).parent))
+    parser.add_argument("--audio", type=str, required=True, help="Path to audio file.")
+    parser.add_argument(
+        "--tokenizer_path",
+        type=str,
+        default=None,
+        help="Tokenizer directory (defaults to checkpoint dir when omitted).",
+    )
+    parser.add_argument("--max_new_tokens", type=int, default=128)
+    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
+    args = parser.parse_args()
+    transcribe(
+        checkpoint_dir=Path(args.checkpoint_dir),
+        audio_path=Path(args.audio),
+        tokenizer_path=args.tokenizer_path,
+        max_new_tokens=args.max_new_tokens,
+        device=args.device,
+    )
+if __name__ == "__main__":
+    main()

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1a7e953150d134cce1a1199d6f18060cb99ee8a9d8e13673ff3bd840da0c096
+size 4524872840

modeling_audio.py ADDED Viewed

	@@ -0,0 +1,415 @@

+from typing import Any, Optional, Tuple
+import torch
+from torch import Tensor, nn
+from transformers import WhisperConfig
+from transformers.modeling_flash_attention_utils import _flash_attention_forward
+from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
+from transformers.models.whisper.modeling_whisper import WhisperEncoder, WhisperEncoderLayer, WhisperFlashAttention2
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class RotaryEmbedding:
+    def __init__(self, dim, rope_ratio=1, original_impl=False):
+        super().__init__()
+        self.dim = dim
+        self.original_impl = original_impl
+        self.rope_ratio = rope_ratio
+    def forward_impl(
+        self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
+    ):
+        """Enhanced Transformer with Rotary Position Embedding.
+        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
+        transformers/rope/__init__.py. MIT License:
+        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
+        """
+        # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+        base = base * self.rope_ratio
+        theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))
+        # Create position indexes `[0, 1, ..., seq_len - 1]`
+        seq_idx = torch.arange(seq_len, dtype=torch.float, device=device)
+        # Calculate the product of position index and $\theta_i$
+        idx_theta = torch.outer(seq_idx, theta).float()
+        cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
+        # this is to mimic the behaviour of complex32, else we will get different results
+        if dtype in (torch.float16, torch.bfloat16, torch.int8):
+            cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
+        return cache
+    @torch.no_grad()
+    def get_emb(self, max_seq_len, dtype, device):
+        return self.forward_impl(
+            max_seq_len, self.dim, dtype=dtype, device=device,
+        )
+def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
+    # x: [b, np, sq, hn]
+    b, np, sq, hn = x.size(0), x.size(1), x.size(2), x.size(3)
+    rot_dim = rope_cache.shape[-2] * 2
+    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
+    # truncate to support variable sizes
+    rope_cache = rope_cache[:, :sq]
+    xshaped = x.reshape(b, np, sq, rot_dim // 2, 2)
+    rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2)
+    x_out2 = torch.stack(
+        [
+            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
+            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
+        ],
+        -1,
+    )
+    x_out2 = x_out2.flatten(3)
+    return torch.cat((x_out2, x_pass), dim=-1)
+class WhisperRoPEFlashAttn(WhisperFlashAttention2):
+    """Flash-attention self-attention that can apply rotary position embeddings to Q and K.
+    Cross-attention is rejected by an assertion in `forward`.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """View `tensor` as `(bsz, seq_len, num_heads, head_dim)`."""
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Run self-attention over `hidden_states`.
+        Args:
+            hidden_states: Input with three dimensions, unpacked as `(bsz, q_len, _)`.
+            key_value_states: Must be `None`; a non-None value trips the cross-attention assert.
+            past_key_value: Optional `(key, value)` pair concatenated in front of the new
+                keys/values along the sequence axis.
+            attention_mask: Passed through to `_flash_attention_forward`.
+            layer_head_mask: Accepted for interface compatibility; not read in this method.
+            output_attentions: When true, attention weights are recomputed manually after
+                the flash-attention call and returned.
+            rotary_pos_emb: Optional cos/sin cache applied to queries and keys.
+            position_ids: Passed through to `_flash_attention_forward`.
+        Returns:
+            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is `None` unless
+            `output_attentions` is set, and `past_key_value` is only rebuilt when
+            `self.is_decoder` is true.
+        """
+        # WhisperFlashAttention2 attention does not support output_attentions
+        if output_attentions:
+            # raise ValueError("WhisperFlashAttention2 attention does not support output_attentions")
+            logger.warning_once("WhisperFlashAttention2 attention does not support output_attentions, "
+                                "manually calculating attention weights.")
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, q_len, _ = hidden_states.size()
+        # get query proj
+        assert not is_cross_attention, "Cross-attention not supported"
+        key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
+        query_states = self._reshape(self.q_proj(hidden_states), -1, bsz)
+        if rotary_pos_emb is not None:
+            logger.warning_once("Using Rotary Position Embedding in WhisperRoPEFlashAttn. ")
+            # apply_rotary_pos_emb expects [b, heads, seq, head_dim], so transpose in and back out.
+            query_states, key_states = [apply_rotary_pos_emb(
+                i.transpose(1, 2),
+                rotary_pos_emb,
+            ).transpose(1, 2) for i in (query_states, key_states)]
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
+            value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (LlamaRMSNorm handles it correctly)
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            query_length=q_len,
+            is_causal=self.is_causal,
+            dropout=self.dropout,
+            position_ids=position_ids,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+        # Merge the head and head_dim axes back into a single feature axis.
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        attn_output = self.out_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        else:
+            # Manual recomputation: scaled Q @ K^T with heads moved ahead of the sequence axis.
+            attn_weights = (query_states.transpose(1, 2) * self.scaling) @ key_states.permute(0, 2, 3, 1)
+            if self.is_causal:
+                # Large negative values above the diagonal mask out later positions before softmax.
+                causal_mask = torch.triu(
+                    torch.ones(q_len, q_len, device=attn_weights.device), diagonal=1,
+                ).unsqueeze(0).unsqueeze(0) * -1e9
+                attn_weights = attn_weights + causal_mask
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        return attn_output, attn_weights, past_key_value
+class WhisperSpecialEncoderLayer(WhisperEncoderLayer):
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+        self.self_attn = WhisperRoPEFlashAttn(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+            config=config,
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ) -> tuple[Tensor, Any]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights, kv_cache = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            rotary_pos_emb=rotary_pos_emb,
+            position_ids=position_ids,
+        )
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.activation_dropout, training=self.training
+        )
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )
+        hidden_states = residual + hidden_states
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value
+            )
+        outputs = (hidden_states, kv_cache)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+class WhisperSpecialEncoder(WhisperEncoder):
+    def __init__(
+        self,
+        config: WhisperConfig,
+        use_rope=False,
+        rope_ratio=1,
+    ):
+        super().__init__(config)
+        self.use_rope = use_rope
+        self.layers = nn.ModuleList(
+            [WhisperSpecialEncoderLayer(config) for _ in range(config.encoder_layers)]
+        )
+        if use_rope:
+            self.rotary_embedding = RotaryEmbedding(
+                config.hidden_size // config.encoder_attention_heads // 2,
+                rope_ratio,
+            )
+    def forward(
+        self,
+        input_features,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        position_ids=None,
+    ):
+        r"""
+        Args:
+            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
+                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
+                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
+                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
+                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+            attention_mask (`torch.Tensor`)`, *optional*):
+                Whisper does not support masking of the `input_features`, this argument is preserved for compatibility,
+                but it is not used. By default the silence in the input log mel spectrogram are ignored.
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        # use_cache = use_cache if use_cache is not None else self.config.use_cache
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+        if self.use_rope:
+            rotary_embs = self.rotary_embedding.get_emb(
+                inputs_embeds.shape[1],
+                inputs_embeds.dtype,
+                inputs_embeds.device,
+            )
+            if position_ids is not None:
+                rotary_embs = rotary_embs[position_ids]
+            else:
+                rotary_embs = rotary_embs[None]
+            hidden_states = inputs_embeds
+        else:
+            rotary_embs = None
+            if position_ids is not None:
+                # wrap tail, those are usually paddings to avoid inter-sample conv interfering
+                max_l = self.embed_positions.weight.shape[0]
+                if position_ids.max() >= max_l:
+                    print("Pos id max", position_ids.max(), "wrapping")
+                embed_pos = self.embed_positions.weight[position_ids % max_l]
+            else:
+                embed_pos = self.embed_positions.weight[:inputs_embeds.shape[1]]
+            hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (
+                len(self.layers)
+            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+                    layer_outputs = self._gradient_checkpointing_func(
+                        encoder_layer.__call__,
+                        hidden_states,
+                        None,
+                        (head_mask[idx] if head_mask is not None else None),
+                        output_attentions,
+                        rotary_embs,
+                        position_ids,
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        None,
+                        layer_head_mask=(
+                            head_mask[idx] if head_mask is not None else None
+                        ),
+                        output_attentions=output_attentions,
+                        rotary_pos_emb=rotary_embs,
+                        position_ids=position_ids,
+                    )
+                hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[2],)
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, encoder_states, all_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+        )

modeling_glmasr.py ADDED Viewed

	@@ -0,0 +1,149 @@

+from typing import Optional
+import torch
+from torch import Tensor, nn
+from transformers import LlamaForCausalLM
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from .configuration_glmasr import GlmasrConfig
+from .modeling_audio import WhisperSpecialEncoder
+class AudioMLPAdapter(nn.Module):
+    def __init__(self, config: GlmasrConfig):
+        super().__init__()
+        whisper_config = config.whisper_config
+        self.merge_factor = config.merge_factor
+        self.whisper = WhisperSpecialEncoder(
+            whisper_config,
+            use_rope=config.use_rope,
+        )
+        self.whisper.layer_norm = nn.Identity()
+        self.layer_norm = nn.LayerNorm(whisper_config.hidden_size)
+        act = {
+            "gelu": nn.GELU(),
+            "relu": nn.ReLU(),
+            "selu": nn.SELU(),
+        }[config.mlp_adapter_act]
+        hidden = whisper_config.hidden_size * self.merge_factor
+        output_dim = config.lm_config.hidden_size
+        self.adapting = nn.Sequential(
+            nn.Linear(hidden, output_dim * 2),
+            act,
+            nn.Linear(output_dim * 2, output_dim),
+        )
+        self.audio_bos_eos_token = nn.Embedding(2, output_dim)
+    def forward(self, audios: Tensor) -> tuple[Tensor, Tensor, Tensor]:
+        bsz = audios.size(0)
+        encoded = self.whisper(audios)[0]
+        encoded = self.layer_norm(encoded)
+        encoded = encoded.reshape(bsz, -1, encoded.size(-1) * self.merge_factor)
+        adapted = self.adapting(encoded)
+        boa = self.audio_bos_eos_token.weight[0][None, :]
+        eoa = self.audio_bos_eos_token.weight[1][None, :]
+        return adapted, boa, eoa
+class GlmasrModel(LlamaForCausalLM):
+    config_class = GlmasrConfig
+    def __init__(self, config: GlmasrConfig):
+        super().__init__(config.lm_config)
+        self.audio_encoder = AudioMLPAdapter(config)
+        self.all_config = config
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        audios: Optional[Tensor] = None,
+        audio_offsets: Optional[list[list[int]]] = None,
+        audio_length: Optional[list[list[int]]] = None,
+        attention_mask: Optional[Tensor] = None,
+        position_ids: Optional[Tensor] = None,
+        past_key_values: Optional[tuple] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        tokens = input_ids
+        vocab_size = self.config.vocab_size
+        tokens = torch.clamp(tokens, 0, vocab_size - 1)
+        language_embs = self.model.embed_tokens(tokens)
+        have_audio = audios is not None and (
+            kwargs.get("past_key_values") is None or len(kwargs["past_key_values"]) == 0
+        )
+        if have_audio:
+            if audio_length is None:
+                raise ValueError("audio_length is required when audio_offsets are provided")
+            audio_embs, boa, eoa = self.audio_encoder(audios)
+            index = 0
+            for batch, (offsets, lengths) in enumerate(zip(audio_offsets, audio_length)):
+                for offset, length in zip(offsets, lengths):
+                    language_embs[batch, offset : offset + length] = audio_embs[index, :length]
+                    language_embs[batch, offset - 1] = boa
+                    language_embs[batch, offset + length] = eoa
+                    index += 1
+        kwargs.pop("inputs_embeds", None)
+        kwargs.pop("is_first_forward", None)
+        outputs = self.model(
+            inputs_embeds=language_embs,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        logits = self.lm_head(outputs[0])
+        return CausalLMOutputWithPast(
+            loss=None,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def _update_model_kwargs_for_generation(self, *args, **kwargs):
+        model_kwargs = super()._update_model_kwargs_for_generation(*args, **kwargs)
+        model_kwargs["is_first_forward"] = False
+        position_ids = model_kwargs.get("position_ids")
+        if position_ids is not None:
+            next_pos = position_ids[..., -1:].clone() + 1
+            model_kwargs["position_ids"] = torch.cat([position_ids, next_pos], dim=-1)
+        return model_kwargs
+    def prepare_inputs_for_generation(
+        self,
+        *args,
+        past_key_values: Optional[tuple] = None,
+        attention_mask: Optional[Tensor] = None,
+        position_ids: Optional[Tensor] = None,
+        use_cache: Optional[bool] = None,
+        is_first_forward: bool = True,
+        **kwargs,
+    ):
+        prepared = super().prepare_inputs_for_generation(
+            *args,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            is_first_forward=is_first_forward,
+            **kwargs,
+        )
+        for key, value in kwargs.items():
+            if key not in prepared and key.startswith("audio"):
+                prepared[key] = value
+        if is_first_forward and past_key_values is not None and len(past_key_values) > 0:
+            cached_len = past_key_values[0][0].shape[2]
+            prepared["input_ids"] = prepared["input_ids"][:, cached_len:]
+            if "position_ids" in prepared:
+                prepared["position_ids"] = prepared["position_ids"][:, cached_len:]
+        if not is_first_forward:
+            prepared["audios"] = None
+        return prepared
+# Names exported by ``from <module> import *``; only the model class is public.
+__all__ = ["GlmasrModel"]

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,172 @@

+{
+  "added_tokens_decoder": {
+    "59246": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59247": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59248": {
+      "content": "[gMASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59249": {
+      "content": "[sMASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59250": {
+      "content": "<sop>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59251": {
+      "content": "<eop>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59252": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59253": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59254": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59255": {
+      "content": "<|observation|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59256": {
+      "content": "<|begin_of_image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59257": {
+      "content": "<|end_of_image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59258": {
+      "content": "<|begin_of_video|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59259": {
+      "content": "<|end_of_video|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59260": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59261": {
+      "content": "<|begin_of_audio|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59262": {
+      "content": "<|end_of_audio|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "[MASK]",
+    "[gMASK]",
+    "[sMASK]",
+    "<sop>",
+    "<eop>",
+    "<|system|>",
+    "<|user|>",
+    "<|assistant|>",
+    "<|observation|>",
+    "<|begin_of_image|>",
+    "<|end_of_image|>",
+    "<|begin_of_video|>",
+    "<|end_of_video|>",
+    "<|pad|>",
+    "<|begin_of_audio|>",
+    "<|end_of_audio|>"
+  ],
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 65536,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "remove_space": false,
+  "tokenizer_class": "PreTrainedTokenizer"
+}