Spaces:

OpenMOSS-Team
/

MOSS-Audio-8B-Thinking

Running

App Files Files Community

kiiic committed on Apr 16

Commit

15d6fac

1 Parent(s): ea7cb64

Add application file

Browse files

Files changed (10) hide show

.gitignore +2 -0
app.py +240 -0
packages.txt +1 -0
requirements.txt +15 -0
src/__init__.py +1 -0
src/audio_io.py +14 -0
src/configuration_moss_audio.py +129 -0
src/hf_inference.py +102 -0
src/modeling_moss_audio.py +472 -0
src/processing_moss_audio.py +408 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__/
2	+ *.py[cod]

app.py ADDED Viewed

	@@ -0,0 +1,240 @@

+from __future__ import annotations
+import os
+import subprocess
+import tempfile
+import time
+from functools import lru_cache
+from pathlib import Path
+import gradio as gr
+# Optional dependency: when `spaces` cannot be imported, install a minimal
+# stand-in so the `@spaces.GPU` decorator used below still resolves.
+try:
+    import spaces  # type: ignore[import-not-found]
+except ImportError:
+    class _SpacesFallback:
+        # `GPU` is a pass-through decorator: it returns the function unchanged.
+        @staticmethod
+        def GPU(func):
+            return func
+    spaces = _SpacesFallback()
+from src.hf_inference import MossAudioHFInference, read_env_model_id, resolve_device
+# Page title, also rendered as the top-level Markdown heading.
+TITLE = "MOSS-Audio-8B-Thinking Demo"
+# Prompt used when the user submits an empty question; also the textbox default.
+DEFAULT_QUESTION = "Describe this audio."
+# Initial values of the "Advanced Settings" sliders.
+DEFAULT_MAX_NEW_TOKENS = 1024
+DEFAULT_TEMPERATURE = 1.0
+DEFAULT_TOP_P = 1.0
+DEFAULT_TOP_K = 50
+# File suffixes (lower-case) whose audio track is extracted with ffmpeg before inference.
+VIDEO_EXTENSIONS = {".mp4"}
+@lru_cache(maxsize=2)
+def get_inference(model_name_or_path: str, device: str) -> MossAudioHFInference:
+    return MossAudioHFInference(
+        model_name_or_path=model_name_or_path,
+        device=device,
+        torch_dtype="auto",
+        enable_time_marker=True,
+    )
+def format_status(model_name_or_path: str, device: str, elapsed_seconds: float) -> str:
+    return (
+        f"Model: `{model_name_or_path}`  \n"
+        f"Device: `{device}`  \n"
+        f"Elapsed: `{elapsed_seconds:.2f}s`"
+    )
+def convert_media_to_mp3(media_path: str, output_path: str) -> None:
+    command = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        media_path,
+        "-vn",
+        "-acodec",
+        "libmp3lame",
+        output_path,
+    ]
+    try:
+        subprocess.run(
+            command,
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+    except subprocess.CalledProcessError as exc:
+        raise gr.Error(
+            f"Failed to extract audio from the uploaded media. Please make sure the mp4 file is valid and decodable.\n{exc.stderr}"
+        ) from exc
+def resolve_media_path(audio_path: str | None, video_path: str | None) -> str | None:
+    if video_path:
+        return video_path
+    return audio_path
+@spaces.GPU
+def run_inference(
+    audio_path: str | None,
+    video_path: str | None,
+    question: str,
+    max_new_tokens: int,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+):
+    prompt = (question or "").strip() or DEFAULT_QUESTION
+    model_name_or_path = read_env_model_id()
+    device = resolve_device()
+    try:
+        inference = get_inference(model_name_or_path, device)
+    except Exception as exc:  # pragma: no cover - runtime environment dependent
+        raise gr.Error(
+            f"Failed to load the model. Please check the weights path or Hugging Face download status.\n{exc}"
+        ) from exc
+    media_path = resolve_media_path(audio_path, video_path)
+    try:
+        started_at = time.perf_counter()
+        with tempfile.TemporaryDirectory(prefix="moss-audio-") as temp_dir:
+            prepared_audio_path = media_path
+            if media_path and Path(media_path).suffix.lower() in VIDEO_EXTENSIONS:
+                prepared_audio_path = os.path.join(temp_dir, "input.mp3")
+                convert_media_to_mp3(media_path, prepared_audio_path)
+            answer = inference.generate(
+                question=prompt,
+                audio_path=prepared_audio_path,
+                max_new_tokens=max_new_tokens,
+                do_sample=temperature > 0,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+            )
+        elapsed_seconds = time.perf_counter() - started_at
+    except Exception as exc:  # pragma: no cover - runtime environment dependent
+        raise gr.Error(
+            f"Inference failed. Please make sure the uploaded file is readable and the format is supported.\n{exc}"
+        ) from exc
+    return answer, format_status(model_name_or_path, device, elapsed_seconds)
+# Build the Gradio page: inputs in the left column, outputs in the right one.
+with gr.Blocks(title=TITLE) as demo:
+    gr.Markdown(f"# {TITLE}")
+    with gr.Row():
+        with gr.Column(scale=5):
+            # Audio comes in as a file path (upload or microphone recording).
+            audio_input = gr.Audio(
+                label="Audio",
+                sources=["upload", "microphone"],
+                type="filepath",
+            )
+            with gr.Accordion("Optional Video Input (.mp4)", open=False):
+                gr.Markdown(
+                    "Upload an mp4 only when needed. If a video is provided, its audio track will be extracted and used for inference."
+                )
+                video_input = gr.File(
+                    label="Video File",
+                    file_types=[".mp4"],
+                    type="filepath",
+                )
+            question_input = gr.Textbox(
+                label="Prompt",
+                lines=4,
+                value=DEFAULT_QUESTION,
+                placeholder="For example: Please transcribe this audio. Describe the sounds in this clip. What emotion does the speaker convey?",
+            )
+            # Generation parameters forwarded to run_inference.
+            with gr.Accordion("Advanced Settings", open=False):
+                max_new_tokens_input = gr.Slider(
+                    minimum=64,
+                    maximum=2048,
+                    value=DEFAULT_MAX_NEW_TOKENS,
+                    step=32,
+                    label="Max New Tokens",
+                )
+                # run_inference disables sampling when the temperature is 0.
+                temperature_input = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.5,
+                    value=DEFAULT_TEMPERATURE,
+                    step=0.1,
+                    label="Temperature",
+                )
+                top_p_input = gr.Slider(
+                    minimum=0.1,
+                    maximum=1.0,
+                    value=DEFAULT_TOP_P,
+                    step=0.05,
+                    label="Top-p",
+                )
+                top_k_input = gr.Slider(
+                    minimum=1,
+                    maximum=100,
+                    value=DEFAULT_TOP_K,
+                    step=1,
+                    label="Top-k",
+                )
+            with gr.Row():
+                submit_btn = gr.Button("Generate", variant="primary")
+                # The clear button is wired to every input component.
+                gr.ClearButton(
+                    [
+                        audio_input,
+                        video_input,
+                        question_input,
+                        max_new_tokens_input,
+                        temperature_input,
+                        top_p_input,
+                        top_k_input,
+                    ],
+                    value="Clear",
+                )
+        with gr.Column(scale=5):
+            output_text = gr.Textbox(label="Output", lines=16)
+            status_text = gr.Markdown("Waiting for input.")
+    # Clickable sample prompts that fill the prompt textbox.
+    gr.Examples(
+        examples=[
+            ["Describe this audio."],
+            ["Please transcribe this audio."],
+            ["What is happening in this audio clip?"],
+            ["Describe the speaker's voice characteristics in detail."],
+            ["What emotion does the speaker convey?"],
+        ],
+        inputs=[question_input],
+        label="Prompt Examples",
+    )
+    # The input order must match run_inference's positional parameters.
+    submit_btn.click(
+        fn=run_inference,
+        inputs=[
+            audio_input,
+            video_input,
+            question_input,
+            max_new_tokens_input,
+            temperature_input,
+            top_p_input,
+            top_k_input,
+        ],
+        outputs=[output_text, status_text],
+    )
+# Script entry point: bind address and port can be overridden via environment variables.
+if __name__ == "__main__":
+    server_name = os.environ.get("MOSS_AUDIO_SERVER_NAME", "0.0.0.0")
+    server_port = int(os.environ.get("MOSS_AUDIO_SERVER_PORT", "7860"))
+    # Enable the request queue (max_size=8) before launching the server.
+    demo.queue(max_size=8).launch(
+        server_name=server_name,
+        server_port=server_port,
+    )

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+--extra-index-url https://download.pytorch.org/whl/cu128
+accelerate
+einops>=0.8.0
+gradio
+numpy>=2.0
+packaging
+requests
+safetensors>=0.4.0
+scipy>=1.12.0
+soundfile>=0.12.0
+spaces
+tiktoken>=0.12.0
+torch==2.9.1
+torchaudio==2.9.1
+transformers==4.57.1

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """MOSS-Audio source package."""

src/audio_io.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from __future__ import annotations
+import torchaudio
+def load_audio(path: str, sample_rate: int):
+    waveform, original_sample_rate = torchaudio.load(path)
+    if waveform.size(0) > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+    if original_sample_rate != sample_rate:
+        waveform = torchaudio.functional.resample(
+            waveform, orig_freq=original_sample_rate, new_freq=sample_rate
+        )
+    return waveform.squeeze(0).cpu().numpy()

src/configuration_moss_audio.py ADDED Viewed

	@@ -0,0 +1,129 @@

+from dataclasses import dataclass, field
+from typing import List, Optional
+from transformers import PretrainedConfig, Qwen3Config
+@dataclass
+class MossAudioEncoderConfig:
+    d_model: int = 1280
+    output_dim: int = 1280
+    num_mel_bins: int = 128
+    encoder_layers: int = 32
+    encoder_attention_heads: int = 20
+    encoder_ffn_dim: int = 5120
+    downsample_rate: int = 8
+    downsample_hidden_size: int = 480
+    encoder_attention_window_size: int = 100
+    max_source_positions: int = 1500
+    dropout: float = 0.1
+    attention_dropout: float = 0.1
+    activation_dropout: float = 0.0
+    activation_function: str = "gelu"
+    layer_norm_eps: float = 1e-5
+    _attn_implementation: str = "eager"
+    pretrained_path: str = ""
+    deepstack_encoder_layer_indexes: List[int] = field(
+        default_factory=lambda: [8, 16, 24]
+    )
+    @classmethod
+    def from_dict(cls, config_dict):
+        if config_dict is None:
+            return cls()
+        allowed_keys = set(cls.__dataclass_fields__.keys())
+        filtered = {k: v for k, v in config_dict.items() if k in allowed_keys}
+        return cls(**filtered)
+    def to_dict(self):
+        return {
+            "d_model": self.d_model,
+            "output_dim": self.output_dim,
+            "num_mel_bins": self.num_mel_bins,
+            "encoder_layers": self.encoder_layers,
+            "encoder_attention_heads": self.encoder_attention_heads,
+            "encoder_ffn_dim": self.encoder_ffn_dim,
+            "downsample_rate": self.downsample_rate,
+            "downsample_hidden_size": self.downsample_hidden_size,
+            "encoder_attention_window_size": self.encoder_attention_window_size,
+            "max_source_positions": self.max_source_positions,
+            "dropout": self.dropout,
+            "attention_dropout": self.attention_dropout,
+            "activation_dropout": self.activation_dropout,
+            "activation_function": self.activation_function,
+            "layer_norm_eps": self.layer_norm_eps,
+            "_attn_implementation": self._attn_implementation,
+            "pretrained_path": self.pretrained_path,
+            "deepstack_encoder_layer_indexes": list(
+                self.deepstack_encoder_layer_indexes or []
+            ),
+        }
+class MossAudioConfig(PretrainedConfig):
+    """Composite configuration holding an audio-encoder config and a Qwen3 language config."""
+    model_type = "moss_audio"
+    is_composition = True
+    def __init__(
+        self,
+        audio_config=None,
+        language_config=None,
+        adapter_hidden_size=8192,
+        ignore_index=-100,
+        deepstack_num_inject_layers: Optional[int] = None,
+        **kwargs,
+    ):
+        """Create the config.
+
+        Args:
+            audio_config: ``MossAudioEncoderConfig``, a dict for it, or ``None`` for defaults.
+            language_config: ``Qwen3Config``, a dict of its kwargs, or ``None`` for defaults.
+            adapter_hidden_size: Stored as-is on the config.
+            ignore_index: Stored as-is on the config.
+            deepstack_num_inject_layers: Stored as-is on the config; may be ``None``.
+            **kwargs: Forwarded to ``PretrainedConfig`` after the filtering below.
+        """
+        # Accept dicts (e.g. from a serialized config) as well as config objects.
+        if isinstance(audio_config, dict):
+            audio_config = MossAudioEncoderConfig.from_dict(audio_config)
+        elif audio_config is None:
+            audio_config = MossAudioEncoderConfig()
+        if isinstance(language_config, dict):
+            language_config = Qwen3Config(**language_config)
+        elif language_config is None:
+            language_config = Qwen3Config()
+        self.audio_config = audio_config
+        self.language_config = language_config
+        self.adapter_hidden_size = adapter_hidden_size
+        self.ignore_index = ignore_index
+        self.deepstack_num_inject_layers = deepstack_num_inject_layers
+        # Keys allowed to stay at the top level even though they also exist
+        # on the language config.
+        propagate_keys = {
+            "num_hidden_layers",
+            "eos_token_id",
+            "bos_token_id",
+            "vocab_size",
+            "tie_word_embeddings",
+        }
+        # Default these top-level values from the language config unless the
+        # caller supplied them explicitly.
+        for key in ("num_hidden_layers", "eos_token_id", "bos_token_id", "vocab_size"):
+            kwargs.setdefault(key, getattr(language_config, key, None))
+        kwargs.setdefault("tie_word_embeddings", False)
+        # Drop every other kwarg that duplicates a language-config key before
+        # handing the rest to PretrainedConfig.
+        if hasattr(language_config, "to_dict"):
+            language_keys = set(language_config.to_dict().keys())
+            for key in list(kwargs.keys()):
+                if key in language_keys and key not in propagate_keys:
+                    kwargs.pop(key)
+        super().__init__(**kwargs)
+    def to_dict(self):
+        """Serialize to a dict, converting both sub-configs via their own ``to_dict`` when available."""
+        output = super().to_dict()
+        output["audio_config"] = (
+            self.audio_config.to_dict()
+            if hasattr(self.audio_config, "to_dict")
+            else self.audio_config
+        )
+        output["language_config"] = (
+            self.language_config.to_dict()
+            if hasattr(self.language_config, "to_dict")
+            else self.language_config
+        )
+        output["adapter_hidden_size"] = self.adapter_hidden_size
+        output["ignore_index"] = self.ignore_index
+        output["deepstack_num_inject_layers"] = self.deepstack_num_inject_layers
+        return output
+__all__ = ["MossAudioEncoderConfig", "MossAudioConfig"]

src/hf_inference.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""HuggingFace inference wrapper for MOSS-Audio."""
+from __future__ import annotations
+import os
+from typing import Optional
+import torch
+from src.audio_io import load_audio
+from src.modeling_moss_audio import MossAudioModel
+from src.processing_moss_audio import MossAudioProcessor
+DEFAULT_MODEL_ID = "OpenMOSS-Team/MOSS-Audio-8B-Thinking"
+def read_env_model_id() -> str:
+    return os.environ.get("MOSS_AUDIO_MODEL_ID", DEFAULT_MODEL_ID)
+def resolve_device() -> str:
+    if torch.cuda.is_available():
+        return "cuda:0"
+    return "cpu"
+class MossAudioHFInference:
+    """Thin wrapper that loads model + processor and exposes a single
+    ``generate`` method for both audio-grounded and text-only queries."""
+    def __init__(
+        self,
+        model_name_or_path: str = DEFAULT_MODEL_ID,
+        device: str = "cuda:0",
+        torch_dtype: str = "auto",
+        enable_time_marker: bool = True,
+    ):
+        self.device = device
+        load_kwargs = {
+            "trust_remote_code": True,
+            "torch_dtype": torch_dtype,
+            "low_cpu_mem_usage": True,
+        }
+        load_kwargs["device_map"] = {"": device}
+        self.model = MossAudioModel.from_pretrained(
+            model_name_or_path,
+            **load_kwargs,
+        )
+        self.model.eval()
+        self.processor = MossAudioProcessor.from_pretrained(
+            model_name_or_path,
+            trust_remote_code=True,
+            enable_time_marker=enable_time_marker,
+        )
+    @torch.no_grad()
+    def generate(
+        self,
+        question: str,
+        audio_path: Optional[str] = None,
+        max_new_tokens: int = 1024,
+        num_beams: int = 1,
+        do_sample: bool = True,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = 50,
+    ) -> str:
+        if audio_path is not None:
+            raw_audio = load_audio(audio_path, sample_rate=self.processor.config.mel_sr)
+            inputs = self.processor(text=question, audios=[raw_audio], return_tensors="pt")
+        else:
+            inputs = self.processor(text=question, return_tensors="pt")
+        inputs = inputs.to(self.model.device)
+        if inputs.get("audio_data") is not None:
+            inputs["audio_data"] = inputs["audio_data"].to(self.model.dtype)
+        audio_input_mask = inputs["input_ids"] == self.processor.audio_token_id
+        inputs["audio_input_mask"] = audio_input_mask
+        gen_kwargs = dict(
+            max_new_tokens=max_new_tokens,
+            num_beams=num_beams,
+            use_cache=True,
+        )
+        if do_sample:
+            gen_kwargs.update(
+                do_sample=True, temperature=temperature, top_p=top_p, top_k=top_k
+            )
+        else:
+            gen_kwargs["do_sample"] = False
+        generated_ids = self.model.generate(**inputs, **gen_kwargs)
+        input_len = inputs["input_ids"].shape[1]
+        return self.processor.decode(
+            generated_ids[0, input_len:], skip_special_tokens=True
+        )
+__all__ = ["MossAudioHFInference", "read_env_model_id", "resolve_device"]

src/modeling_moss_audio.py ADDED Viewed

	@@ -0,0 +1,472 @@

+from typing import Any, List, Optional, Tuple, Union
+import math
+import torch
+import torch.nn as nn
+from transformers.generation.utils import GenerationMixin
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3Model
+from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer
+from transformers.utils.auto_docstring import auto_docstring
+from src.configuration_moss_audio import MossAudioConfig, MossAudioEncoderConfig
+class SinusoidsPositionEmbedding(nn.Module):
+    def __init__(self, num_positions: int, embedding_dim: int):
+        super().__init__()
+        max_timescale = 10000.0
+        log_timescale_increment = math.log(max_timescale) / (embedding_dim // 2 - 1)
+        inv_timescales = torch.exp(
+            -log_timescale_increment * torch.arange(embedding_dim // 2).float()
+        )
+        self.register_buffer("inv_timescales", inv_timescales, persistent=False)
+    def forward(self, seq_len: int, device: torch.device):
+        scaled_time = torch.arange(
+            seq_len, device=device, dtype=self.inv_timescales.dtype
+        ).unsqueeze(1) * self.inv_timescales.unsqueeze(0)
+        sin_emb = torch.sin(scaled_time)
+        cos_emb = torch.cos(scaled_time)
+        pos_emb = torch.cat([sin_emb, cos_emb], dim=1)
+        return pos_emb.unsqueeze(0)
+class MossAudioEncoder(nn.Module):
+    """Audio encoder with conv-stem downsampling and Whisper transformer layers."""
+    def __init__(self, config: MossAudioEncoderConfig):
+        """Build the conv stem, position embedding, encoder layers and output projection."""
+        super().__init__()
+        self.config = config
+        self.gelu = nn.GELU()
+        # Three 3x3 stride-2 convolutions: each one halves both spatial axes.
+        self.conv1 = nn.Conv2d(
+            1,
+            config.downsample_hidden_size,
+            kernel_size=(3, 3),
+            stride=(2, 2),
+            padding=(1, 1),
+        )
+        self.conv2 = nn.Conv2d(
+            config.downsample_hidden_size,
+            config.downsample_hidden_size,
+            kernel_size=(3, 3),
+            stride=(2, 2),
+            padding=(1, 1),
+        )
+        self.conv3 = nn.Conv2d(
+            config.downsample_hidden_size,
+            config.downsample_hidden_size,
+            kernel_size=(3, 3),
+            stride=(2, 2),
+            padding=(1, 1),
+        )
+        # NOTE(review): the factor 16 is hard-coded; presumably num_mel_bins (128)
+        # divided by the 8x stem downsampling - confirm before changing num_mel_bins.
+        self.stem_proj = nn.Linear(config.downsample_hidden_size * 16, config.d_model)
+        self.embed_positions = SinusoidsPositionEmbedding(
+            config.max_source_positions, config.d_model
+        )
+        self.layers = nn.ModuleList(
+            [WhisperEncoderLayer(config) for _ in range(config.encoder_layers)]
+        )
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+        # Only project when the output width differs from the model width.
+        self.out_proj = (
+            nn.Linear(config.d_model, config.output_dim, bias=False)
+            if config.output_dim != config.d_model
+            else nn.Identity()
+        )
+        # Set for O(1) membership tests inside the layer loop.
+        self._deepstack_indexes_set = set(config.deepstack_encoder_layer_indexes or [])
+    def _compute_downsampled_length(self, lengths: torch.Tensor) -> torch.Tensor:
+        """Map input lengths to lengths after the three stride-2 convolutions."""
+        def conv_out_len(length):
+            # Output length of one kernel-3, stride-2, padding-1 convolution.
+            return (length - 1) // 2 + 1
+        length1 = conv_out_len(lengths)
+        length2 = conv_out_len(length1)
+        length3 = conv_out_len(length2)
+        return length3
+    def forward(
+        self,
+        input_features: torch.Tensor,
+        feature_lens: Optional[torch.Tensor] = None,
+        output_deepstack_hidden_states: bool = True,
+    ):
+        """Encode input features.
+
+        Args:
+            input_features: 2-D input gets a leading batch dimension added.
+                Presumably laid out as (batch, mel, time) - TODO confirm.
+            feature_lens: Valid length per item along the last axis; defaults
+                to the full last-axis size for every item.
+            output_deepstack_hidden_states: Whether to collect the outputs of
+                the configured DeepStack layers.
+
+        Returns:
+            ``BaseModelOutputWithPast`` whose ``last_hidden_state`` is the
+            projected encoder output and whose ``hidden_states`` is the tuple
+            of collected DeepStack states (``None`` when not requested).
+        """
+        if input_features.dim() == 2:
+            input_features = input_features.unsqueeze(0)
+        if feature_lens is None:
+            feature_lens = torch.full(
+                (input_features.size(0),),
+                input_features.size(-1),
+                device=input_features.device,
+                dtype=torch.long,
+            )
+        downsampled_lengths = self._compute_downsampled_length(feature_lens)
+        # Add the single input channel expected by Conv2d.
+        x = input_features.unsqueeze(1)
+        x = self.gelu(self.conv1(x))
+        x = self.gelu(self.conv2(x))
+        x = self.gelu(self.conv3(x))
+        # Move the last axis to position 1 and merge the remaining two into features.
+        x = x.permute(0, 3, 1, 2).contiguous().flatten(2)
+        x = self.stem_proj(x)
+        # Trim frames beyond the longest valid sequence in the batch.
+        max_len = int(downsampled_lengths.max().item())
+        if x.size(1) > max_len:
+            x = x[:, :max_len, :]
+        positions = self.embed_positions(x.shape[1], x.device)
+        x = x + positions.to(x.dtype)
+        # True at positions past each item's valid length.
+        padding_mask = (
+            torch.arange(x.size(1), device=x.device)[None, :]
+            >= downsampled_lengths[:, None]
+        )
+        # Additive mask: dtype minimum at padded positions, zero elsewhere.
+        attention_mask = (1.0 - (~padding_mask).to(dtype=x.dtype)) * torch.finfo(
+            x.dtype
+        ).min
+        attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
+        deepstack_states: List[torch.Tensor] = []
+        for layer_idx, layer in enumerate(self.layers):
+            layer_outputs = layer(
+                x,
+                attention_mask,
+                layer_head_mask=None,
+                output_attentions=False,
+            )
+            x = layer_outputs[0]
+            # Keep the output of selected layers (taken before the final layer norm).
+            if output_deepstack_hidden_states and layer_idx in self._deepstack_indexes_set:
+                deepstack_states.append(x)
+        x = self.layer_norm(x)
+        x = self.out_proj(x)
+        return BaseModelOutputWithPast(
+            last_hidden_state=x,
+            hidden_states=tuple(deepstack_states) if output_deepstack_hidden_states else None,
+        )
+class GatedMLP(nn.Module):
+    def __init__(self, input_size, hidden_size, output_size):
+        super().__init__()
+        self.gate_proj = nn.Linear(input_size, hidden_size, bias=False)
+        self.up_proj = nn.Linear(input_size, hidden_size, bias=False)
+        self.down_proj = nn.Linear(hidden_size, output_size, bias=False)
+        self.act_fn = nn.SiLU()
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+@auto_docstring
+class MossAudioPreTrainedModel(PreTrainedModel):
+    # Base class that binds MossAudioConfig to the model and declares
+    # class-level settings read by the transformers PreTrainedModel machinery.
+    config_class = MossAudioConfig
+    config: MossAudioConfig
+    # Empty prefix: no sub-module is designated as the base model.
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    # Decoder layers are listed as modules that must not be split.
+    _no_split_modules = ["Qwen3DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    # Declared attention backend support.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+    # Hidden states are recorded from Qwen3DecoderLayer outputs.
+    _can_record_outputs = {"hidden_states": Qwen3DecoderLayer}
+class MossAudioModel(MossAudioPreTrainedModel, GenerationMixin):
+    config_class = MossAudioConfig
+    _tied_weights_keys: List[str] = []
+    def __init__(self, config: MossAudioConfig):
+        """Assemble the audio encoder, Qwen3 language model, adapters and LM head."""
+        super().__init__(config)
+        self.audio_encoder = MossAudioEncoder(config.audio_config)
+        self.language_model = Qwen3Model(config.language_config)
+        # Maps encoder outputs into the language model's hidden size.
+        self.audio_adapter = GatedMLP(
+            input_size=config.audio_config.output_dim,
+            hidden_size=config.adapter_hidden_size,
+            output_size=config.language_config.hidden_size,
+        )
+        # One DeepStack merger per configured encoder layer index, optionally
+        # capped by `deepstack_num_inject_layers`.
+        deepstack_k = len(
+            getattr(config.audio_config, "deepstack_encoder_layer_indexes", []) or []
+        )
+        if config.deepstack_num_inject_layers is not None:
+            deepstack_k = min(deepstack_k, int(config.deepstack_num_inject_layers))
+        self.deepstack_audio_merger_list = nn.ModuleList(
+            [
+                GatedMLP(
+                    input_size=config.audio_config.output_dim,
+                    hidden_size=config.adapter_hidden_size,
+                    output_size=config.language_config.hidden_size,
+                )
+                for _ in range(deepstack_k)
+            ]
+        )
+        self.vocab_size = config.language_config.vocab_size
+        # Separate output head; Qwen3Model itself returns hidden states only.
+        self.lm_head = nn.Linear(
+            config.language_config.hidden_size, self.vocab_size, bias=False
+        )
+        self.post_init()
+    def get_input_embeddings(self):
+        """Return the language model's input embedding module."""
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        """Replace the language model's input embedding module with ``value``."""
+        self.language_model.set_input_embeddings(value)
+    def get_output_embeddings(self):
+        """Return the LM head used to produce logits."""
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the LM head with ``new_embeddings``."""
+        self.lm_head = new_embeddings
+    def get_audio_features(self, input_features, feature_lens):
+        audio_outputs = self.audio_encoder(
+            input_features=input_features,
+            feature_lens=feature_lens,
+            output_deepstack_hidden_states=True,
+        )
+        deepstack = (
+            list(audio_outputs.hidden_states)
+            if audio_outputs.hidden_states is not None
+            else None
+        )
+        return audio_outputs.last_hidden_state, deepstack
+    def _apply_deepstack_to_hidden_states(
+        self,
+        hidden_states: torch.Tensor,
+        audio_input_mask: torch.Tensor,
+        deepstack_embeds: torch.Tensor,
+    ) -> torch.Tensor:
+        audio_input_mask = audio_input_mask.to(hidden_states.device)
+        deepstack_embeds = deepstack_embeds.to(hidden_states.device, hidden_states.dtype)
+        flat = deepstack_embeds.reshape(-1, deepstack_embeds.shape[-1])
+        updated_hidden_states = hidden_states.clone()
+        updated_hidden_states[audio_input_mask] = (
+            updated_hidden_states[audio_input_mask] + flat
+        )
+        return updated_hidden_states
+    def _register_llm_deepstack_hooks(
+        self,
+        audio_input_mask: torch.Tensor,
+        deepstack_audio_embeds: List[torch.Tensor],
+    ):
+        if deepstack_audio_embeds is None or len(deepstack_audio_embeds) == 0:
+            return []
+        layers = getattr(self.language_model, "layers", None)
+        if layers is None:
+            raise RuntimeError(
+                "Qwen3Model does not expose `.layers`; cannot register DeepStack hooks."
+            )
+        num_inject = len(deepstack_audio_embeds)
+        handles = []
+        for layer_idx, layer in enumerate(layers):
+            if layer_idx >= num_inject:
+                break
+            def _make_llm_hook(k: int):
+                def _hook(_module, _inputs, _output):
+                    if isinstance(_output, (tuple, list)):
+                        hidden_states = _output[0]
+                        new_hidden_states = self._apply_deepstack_to_hidden_states(
+                            hidden_states, audio_input_mask, deepstack_audio_embeds[k]
+                        )
+                        return (new_hidden_states,) + tuple(_output[1:])
+                    return self._apply_deepstack_to_hidden_states(
+                        _output, audio_input_mask, deepstack_audio_embeds[k]
+                    )
+                return _hook
+            handles.append(layer.register_forward_hook(_make_llm_hook(layer_idx)))
+        return handles
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        audio_data: Optional[torch.FloatTensor] = None,
+        audio_data_seqlens: Optional[torch.Tensor] = None,
+        audio_input_mask: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Any,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """Run the language model, optionally fusing audio features into the prompt.
+
+        When ``audio_data`` is given, the adapted audio embeddings overwrite
+        the token embeddings at ``audio_input_mask`` positions, and DeepStack
+        embeddings are added to the outputs of the first decoder layers via
+        temporary forward hooks.
+
+        Returns:
+            ``CausalLMOutputWithPast`` (or a tuple when ``return_dict`` is
+            false), including the shifted cross-entropy loss when ``labels``
+            is provided.
+
+        Raises:
+            ValueError: If ``audio_data`` is given without ``audio_input_mask``,
+                or if the number of masked positions does not match the audio
+                sequence length.
+        """
+        # Fall back to config defaults for unspecified output options.
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        hook_handles = []
+        if audio_data is not None:
+            if audio_input_mask is None:
+                raise ValueError("audio_input_mask is required when audio_data is provided.")
+            audio_embeds, deepstack = self.get_audio_features(
+                audio_data, audio_data_seqlens
+            )
+            audio_embeds = self.audio_adapter(audio_embeds)
+            # NOTE(review): the total mask count is compared with the audio
+            # sequence length (dim 1), which looks like it assumes a single
+            # audio item per call - confirm for batched inputs.
+            audio_token_count = int(audio_input_mask.to(torch.int32).sum().item())
+            if audio_token_count != int(audio_embeds.shape[1]):
+                raise ValueError(
+                    f"Audio token count mismatch: audio_input_mask has {audio_token_count} audio tokens, "
+                    f"but audio_embeds has length {int(audio_embeds.shape[1])}."
+                )
+            # Overwrite placeholder-token embeddings with audio embeddings
+            # (on a clone, so the embedding output itself is not modified).
+            mask_expanded = audio_input_mask.unsqueeze(-1).expand_as(inputs_embeds)
+            inputs_embeds = inputs_embeds.clone()
+            inputs_embeds.masked_scatter_(mask_expanded, audio_embeds)
+            if deepstack is not None and len(self.deepstack_audio_merger_list) > 0:
+                deepstack_audio_embeds = []
+                # Use at most as many DeepStack states as there are mergers.
+                for index, one_hidden_state in enumerate(
+                    deepstack[: len(self.deepstack_audio_merger_list)]
+                ):
+                    deepstack_embed = self.deepstack_audio_merger_list[index](
+                        one_hidden_state
+                    )
+                    if int(deepstack_embed.shape[1]) != audio_token_count:
+                        raise ValueError(
+                            f"DeepStack audio seq_len mismatch at index {index}: "
+                            f"expected {audio_token_count}, got {int(deepstack_embed.shape[1])}."
+                        )
+                    deepstack_audio_embeds.append(deepstack_embed)
+                try:
+                    hook_handles = self._register_llm_deepstack_hooks(
+                        audio_input_mask, deepstack_audio_embeds
+                    )
+                except Exception:
+                    # NOTE(review): if registration raises, `hook_handles` is
+                    # still the empty list assigned above, so this loop removes nothing.
+                    for handle in hook_handles:
+                        handle.remove()
+                    raise
+        try:
+            outputs = self.language_model(
+                input_ids=None,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                cache_position=cache_position,
+                **kwargs,
+            )
+        finally:
+            # Hooks are per-call: always detach them, even on failure.
+            for handle in hook_handles:
+                handle.remove()
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            # Next-token objective: position t predicts label t + 1.
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss_fct = nn.CrossEntropyLoss(ignore_index=self.config.ignore_index)
+            shift_logits = shift_logits.view(
+                -1, self.config.language_config.vocab_size
+            )
+            shift_labels = shift_labels.view(-1)
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        **kwargs,
+    ):
+        position_ids = kwargs.get("position_ids", None)
+        if cache_position is not None and cache_position[0] > 0:
+            input_ids = input_ids[:, -1:]
+            if position_ids is not None:
+                position_ids = position_ids[:, -1:]
+            audio_data = None
+            audio_input_mask = None
+            audio_data_seqlens = None
+        else:
+            audio_data = kwargs.get("audio_data", None)
+            audio_input_mask = kwargs.get("audio_input_mask", None)
+            audio_data_seqlens = kwargs.get("audio_data_seqlens", None)
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "position_ids": position_ids,
+                "audio_data": audio_data,
+                "audio_input_mask": audio_input_mask,
+                "audio_data_seqlens": audio_data_seqlens,
+            }
+        )
+        return model_inputs
+# Names exported when this module is star-imported.
+__all__ = [
+    "MossAudioEncoderConfig",
+    "MossAudioConfig",
+    "MossAudioModel",
+]

src/processing_moss_audio.py ADDED Viewed

	@@ -0,0 +1,408 @@

+import importlib.util
+import os
+import re
+import sys
+import types
+from dataclasses import dataclass
+from typing import List, Optional, Sequence, Union
+import numpy as np
+import torch
+from transformers import AutoTokenizer, BatchEncoding
+@dataclass
+class MelConfig:
+    """Settings for turning waveforms into mel features in MossAudioProcessor."""
+
+    # Passed as ``sampling_rate`` to the Whisper feature extractor.
+    mel_sr: int = 16000
+    # Passed as ``feature_size``; also the middle dimension of the padded
+    # audio batch built by the processor.
+    mel_dim: int = 128
+    # Passed as ``n_fft`` to the Whisper feature extractor.
+    mel_n_fft: int = 400
+    # Passed as ``hop_length`` to the Whisper feature extractor.
+    mel_hop_length: int = 160
+    # dtype of the extracted mel tensors and of the padded audio batch.
+    mel_dtype: torch.dtype = torch.bfloat16
+    # Selects the Whisper feature-extractor path in ``_extract_mel``.
+    use_whisper_feature_extractor: bool = True
+def load_chat_template(template_path: str, mossflux_path: str = None) -> List:
+    if mossflux_path is None:
+        template_dir = os.path.dirname(os.path.abspath(template_path))
+        current = template_dir
+        while current and os.path.basename(current) != "mossLite":
+            parent = os.path.dirname(current)
+            if parent == current:
+                break
+            current = parent
+        if os.path.basename(current) == "mossLite":
+            mossflux_path = os.path.join(current, "mossflux")
+    if mossflux_path and mossflux_path not in sys.path:
+        sys.path.insert(0, mossflux_path)
+    spec = importlib.util.spec_from_file_location("chat_template_module", template_path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules["chat_template_module"] = module
+    spec.loader.exec_module(module)
+    return module.chat_template
+class MossAudioProcessor:
+    _AUDIO_SPAN_RE = re.compile(r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>")
+    _auto_class = None
+    @classmethod
+    def register_for_auto_class(cls, auto_class="AutoProcessor"):
+        if not isinstance(auto_class, str):
+            auto_class = auto_class.__name__
+        cls._auto_class = auto_class
+    def __init__(
+        self,
+        tokenizer,
+        *,
+        mel_config: Optional[MelConfig] = None,
+        template_path: Optional[str] = None,
+        enable_time_marker: bool = True,
+        audio_token_id: int = 151654,
+        audio_start_id: int = 151669,
+        audio_end_id: int = 151670,
+    ):
+        self._base_tokenizer = tokenizer
+        self.tokenizer = tokenizer
+        self.audio_token_id = int(audio_token_id)
+        self.audio_start_id = int(audio_start_id)
+        self.audio_end_id = int(audio_end_id)
+        self.chat_template = (
+            None if template_path is None else load_chat_template(template_path)
+        )
+        self.custom_texts = {}
+        self.enable_time_marker = bool(enable_time_marker)
+        self.config = mel_config or MelConfig()
+        self._whisper_feature_extractor = None
+        alias_map = {
+            "<|AUDIO|>": self.audio_token_id,
+            "<|audio_bos|>": self.audio_start_id,
+            "<|audio_eos|>": self.audio_end_id,
+        }
+        orig_convert_tokens_to_ids = self.tokenizer.convert_tokens_to_ids
+        def _patched_convert_tokens_to_ids(tokenizer_self, tokens):
+            if isinstance(tokens, (list, tuple)):
+                converted = [
+                    _patched_convert_tokens_to_ids(tokenizer_self, token)
+                    for token in tokens
+                ]
+                return converted if isinstance(tokens, list) else tuple(converted)
+            if isinstance(tokens, str) and tokens in alias_map:
+                return alias_map[tokens]
+            return orig_convert_tokens_to_ids(tokens)
+        self.tokenizer.convert_tokens_to_ids = types.MethodType(
+            _patched_convert_tokens_to_ids, self.tokenizer
+        )
+        self._digit_token_ids = {
+            "0": 15,
+            "1": 16,
+            "2": 17,
+            "3": 18,
+            "4": 19,
+            "5": 20,
+            "6": 21,
+            "7": 22,
+            "8": 23,
+            "9": 24,
+        }
+        self.audio_tokens_per_second = 12.5
+        self.time_marker_every_seconds = 2
+        self.time_marker_every_audio_tokens = int(
+            self.audio_tokens_per_second * self.time_marker_every_seconds
+        )
+        self.model_input_names = [
+            "input_ids",
+            "attention_mask",
+            "audio_data",
+            "audio_data_seqlens",
+        ]
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        tokenizer_kwargs = {}
+        for key in ["cache_dir", "revision", "token", "local_files_only"]:
+            if key in kwargs:
+                tokenizer_kwargs[key] = kwargs[key]
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            use_fast=False,
+            **tokenizer_kwargs,
+        )
+        mel_config = kwargs.pop("mel_config", None)
+        template_path = kwargs.pop("template_path", None)
+        enable_time_marker = kwargs.pop("enable_time_marker", False)
+        audio_token_id = kwargs.pop("audio_token_id", 151654)
+        audio_start_id = kwargs.pop("audio_start_id", 151669)
+        audio_end_id = kwargs.pop("audio_end_id", 151670)
+        return cls(
+            tokenizer,
+            mel_config=mel_config,
+            template_path=template_path,
+            enable_time_marker=enable_time_marker,
+            audio_token_id=audio_token_id,
+            audio_start_id=audio_start_id,
+            audio_end_id=audio_end_id,
+        )
+    def load_template(self, template_path: str):
+        self.chat_template = load_chat_template(template_path)
+        return self
+    def set_custom_text(self, key: str, text: str):
+        self.custom_texts[key] = text
+        return self
+    def clear_custom_text(self, key: Optional[str] = None):
+        if key is None:
+            self.custom_texts.clear()
+        else:
+            self.custom_texts.pop(key, None)
+        return self
+    def _template_requires_audio(self) -> bool:
+        if self.chat_template is None:
+            return False
+        for segment in self.chat_template:
+            if segment.type in {"audio_contiguous", "audio_token"}:
+                return True
+        return False
+    @staticmethod
+    def _conv3_downsample_len(raw_mel_len: int) -> int:
+        def conv_out_len(length: int) -> int:
+            return (length - 1) // 2 + 1
+        length1 = conv_out_len(int(raw_mel_len))
+        length2 = conv_out_len(length1)
+        length3 = conv_out_len(length2)
+        return int(length3)
+    def _get_whisper_feature_extractor(self):
+        if self._whisper_feature_extractor is not None:
+            return self._whisper_feature_extractor
+        from transformers.models.whisper.feature_extraction_whisper import (
+            WhisperFeatureExtractor,
+        )
+        self._whisper_feature_extractor = WhisperFeatureExtractor(
+            feature_size=int(self.config.mel_dim),
+            sampling_rate=int(self.config.mel_sr),
+            hop_length=int(self.config.mel_hop_length),
+            n_fft=int(self.config.mel_n_fft),
+        )
+        return self._whisper_feature_extractor
+    def _extract_mel(self, audio: Union[np.ndarray, torch.Tensor]) -> torch.Tensor:
+        if isinstance(audio, np.ndarray):
+            wav = torch.from_numpy(audio)
+        else:
+            wav = audio
+        wav = wav.to(dtype=torch.float32)
+        if wav.dim() == 1:
+            wav = wav.unsqueeze(0)
+        if bool(getattr(self.config, "use_whisper_feature_extractor", False)):
+            feature_extractor = self._get_whisper_feature_extractor()
+            wav_np = wav.detach().to("cpu", torch.float32).contiguous().numpy()
+            if wav_np.ndim == 2:
+                wav_np = wav_np[0]
+            feats = feature_extractor._np_extract_fbank_features(
+                wav_np[None, ...], device="cpu"
+            )
+            mel = torch.from_numpy(feats[0])
+        return mel.to(dtype=self.config.mel_dtype)
+    def _get_time_marker_token_ids(self, second: int) -> List[int]:
+        return [self._digit_token_ids[digit] for digit in str(second)]
+    def _build_audio_tokens_with_time_markers(self, audio_seq_len: int) -> List[int]:
+        total_duration_seconds = audio_seq_len / self.audio_tokens_per_second
+        num_full_seconds = int(total_duration_seconds)
+        token_ids: List[int] = []
+        audio_tokens_consumed = 0
+        for second in range(
+            self.time_marker_every_seconds,
+            num_full_seconds + 1,
+            self.time_marker_every_seconds,
+        ):
+            marker_pos = (
+                second // self.time_marker_every_seconds
+            ) * self.time_marker_every_audio_tokens
+            audio_segment_len = marker_pos - audio_tokens_consumed
+            if audio_segment_len > 0:
+                token_ids.extend([self.audio_token_id] * audio_segment_len)
+                audio_tokens_consumed += audio_segment_len
+            token_ids.extend(self._get_time_marker_token_ids(second))
+        remaining = audio_seq_len - audio_tokens_consumed
+        if remaining > 0:
+            token_ids.extend([self.audio_token_id] * remaining)
+        return token_ids
+    def _build_audio_placeholder_ids(self, num_audio_tokens: int) -> List[int]:
+        if self.enable_time_marker:
+            return self._build_audio_tokens_with_time_markers(num_audio_tokens)
+        return [self.audio_token_id] * num_audio_tokens
+    def _build_input_from_template(
+        self, num_audio_tokens: int, include_answer: bool = False
+    ) -> List[int]:
+        if self.chat_template is None:
+            raise ValueError("Chat template not loaded.")
+        input_ids: List[int] = []
+        for segment in self.chat_template:
+            seg_type = segment.type
+            if seg_type == "constant_text_token":
+                input_ids.extend(segment.text_ids.tolist())
+            elif seg_type in {"audio_contiguous", "audio_token"}:
+                input_ids.extend(self._build_audio_placeholder_ids(num_audio_tokens))
+            elif seg_type == "text_token":
+                text_token_key = segment.text_token_key
+                if "answer" in text_token_key.lower() and not include_answer:
+                    break
+                if text_token_key not in self.custom_texts:
+                    break
+                text_ids = self._base_tokenizer.encode(
+                    self.custom_texts[text_token_key], add_special_tokens=False
+                )
+                input_ids.extend(text_ids)
+        return input_ids
+    def _build_default_prompt(self, text: str, has_audio: bool) -> str:
+        if has_audio:
+            return (
+                "<|im_start|>system\n"
+                "You are a helpful assistant.<|im_end|>\n"
+                "<|im_start|>user\n"
+                "<|audio_bos|><|AUDIO|><|audio_eos|>\n"
+                f"{text}<|im_end|>\n"
+                "<|im_start|>assistant\n"
+            )
+        return (
+            "<|im_start|>system\n"
+            "You are a helpful assistant.<|im_end|>\n"
+            "<|im_start|>user\n"
+            f"{text}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+    def _build_input_from_prompt(self, prompt: str, token_lens: List[int]) -> List[int]:
+        spans = list(self._AUDIO_SPAN_RE.finditer(prompt))
+        if len(spans) != len(token_lens):
+            raise ValueError(
+                f"Audio placeholder count mismatch: found {len(spans)} spans in text, "
+                f"but got {len(token_lens)} audio inputs."
+            )
+        input_ids: List[int] = []
+        cursor = 0
+        for index, match in enumerate(spans):
+            prefix = prompt[cursor : match.start()]
+            if prefix:
+                input_ids.extend(
+                    self._base_tokenizer.encode(prefix, add_special_tokens=False)
+                )
+            input_ids.append(self.audio_start_id)
+            input_ids.extend(self._build_audio_placeholder_ids(int(token_lens[index])))
+            input_ids.append(self.audio_end_id)
+            cursor = match.end()
+        suffix = prompt[cursor:]
+        if suffix:
+            input_ids.extend(
+                self._base_tokenizer.encode(suffix, add_special_tokens=False)
+            )
+        return input_ids
+    def __call__(
+        self,
+        *,
+        text: Union[str, Sequence[str], None] = None,
+        audios: Optional[Sequence[Union[np.ndarray, torch.Tensor]]] = None,
+        audio: Optional[Sequence[Union[np.ndarray, torch.Tensor]]] = None,
+        return_tensors: str = "pt",
+        **kwargs,
+    ):
+        if isinstance(text, (list, tuple)):
+            if len(text) != 1:
+                raise ValueError(f"Expected text batch size 1, got {len(text)}")
+            prompt_text = text[0]
+        else:
+            prompt_text = text
+        audio_list = audios if audios is not None else audio
+        audio_list = [] if audio_list is None else list(audio_list)
+        mels: List[torch.Tensor] = []
+        raw_lengths: List[int] = []
+        token_lens: List[int] = []
+        for one_audio in audio_list:
+            mel = self._extract_mel(one_audio)
+            raw_len = int(mel.shape[-1])
+            mels.append(mel)
+            raw_lengths.append(raw_len)
+            token_lens.append(self._conv3_downsample_len(raw_len))
+        if mels:
+            max_length = max(raw_lengths)
+            audio_batch = torch.zeros(
+                (len(mels), self.config.mel_dim, max_length),
+                dtype=self.config.mel_dtype,
+            )
+            for index, mel in enumerate(mels):
+                audio_batch[index, :, : mel.shape[-1]] = mel
+            seqlens_tensor = torch.tensor(raw_lengths, dtype=torch.long)
+        else:
+            audio_batch = None
+            seqlens_tensor = None
+        if prompt_text is not None:
+            if self._AUDIO_SPAN_RE.search(prompt_text) is None and audio_list:
+                prompt_text = self._build_default_prompt(prompt_text, has_audio=True)
+            elif self._AUDIO_SPAN_RE.search(prompt_text) is None and not audio_list:
+                prompt_text = self._build_default_prompt(prompt_text, has_audio=False)
+            input_ids_list = self._build_input_from_prompt(prompt_text, token_lens)
+        elif self.chat_template is not None:
+            input_ids_list = self._build_input_from_template(
+                token_lens[0] if token_lens else 0
+            )
+        else:
+            raise ValueError(
+                "Either provide text or load a chat_template before calling the processor."
+            )
+        input_ids_tensor = torch.tensor([input_ids_list], dtype=torch.long)
+        attention_mask_tensor = torch.ones_like(input_ids_tensor)
+        data = {
+            "input_ids": input_ids_tensor,
+            "attention_mask": attention_mask_tensor,
+        }
+        if audio_batch is not None:
+            data["audio_data"] = audio_batch
+            data["audio_data_seqlens"] = seqlens_tensor
+        return BatchEncoding(data=data, tensor_type=return_tensors)
+    def batch_decode(self, *args, **kwargs):
+        return self._base_tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        return self._base_tokenizer.decode(*args, **kwargs)
+# Names exported when this module is star-imported.
+__all__ = ["MelConfig", "MossAudioProcessor"]