happyme531 committed on Mar 12

Commit

8b1176e

verified ·

1 Parent(s): aa3001b

Upload 36 files

Browse files

Files changed (36) hide show

convert/MiniCPM4-0.5B/added_tokens.json +10 -0
convert/MiniCPM4-0.5B/config.json +37 -0
convert/MiniCPM4-0.5B/configuration_minicpm.py +203 -0
convert/MiniCPM4-0.5B/generation_config.json +12 -0
convert/MiniCPM4-0.5B/modeling_minicpm.py +1514 -0
convert/MiniCPM4-0.5B/special_tokens_map.json +33 -0
convert/MiniCPM4-0.5B/tokenizer.json +0 -0
convert/MiniCPM4-0.5B/tokenizer.model +3 -0
convert/MiniCPM4-0.5B/tokenizer_config.json +117 -0
convert/README.md +53 -0
convert/scripts/build_rk3588_pipeline.py +283 -0
convert/scripts/convert_vox_minicpm_to_hf.py +115 -0
convert/scripts/export_onnx.py +297 -0
convert/scripts/export_rkllm.py +65 -0
convert/src/voxcpm/__init__.py +5 -0
convert/src/voxcpm/cli.py +299 -0
convert/src/voxcpm/core.py +195 -0
convert/src/voxcpm/model/__init__.py +3 -0
convert/src/voxcpm/model/utils.py +122 -0
convert/src/voxcpm/model/voxcpm.py +690 -0
convert/src/voxcpm/modules/__init__.py +0 -0
convert/src/voxcpm/modules/audiovae/__init__.py +1 -0
convert/src/voxcpm/modules/audiovae/audio_vae.py +359 -0
convert/src/voxcpm/modules/layers/__init__.py +1 -0
convert/src/voxcpm/modules/layers/scalar_quantization_layer.py +26 -0
convert/src/voxcpm/modules/locdit/__init__.py +2 -0
convert/src/voxcpm/modules/locdit/local_dit.py +114 -0
convert/src/voxcpm/modules/locdit/unified_cfm.py +137 -0
convert/src/voxcpm/modules/locenc/__init__.py +1 -0
convert/src/voxcpm/modules/locenc/local_encoder.py +30 -0
convert/src/voxcpm/modules/minicpm4/__init__.py +3 -0
convert/src/voxcpm/modules/minicpm4/cache.py +47 -0
convert/src/voxcpm/modules/minicpm4/config.py +29 -0
convert/src/voxcpm/modules/minicpm4/model.py +473 -0
convert/src/voxcpm/utils/text_normalize.py +185 -0
convert/src/voxcpm/zipenhancer.py +76 -0

convert/MiniCPM4-0.5B/added_tokens.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "<|execute_end|>": 73444,
+  "<|execute_start|>": 73443,
+  "<|fim_middle|>": 73446,
+  "<|fim_prefix|>": 73445,
+  "<|fim_suffix|>": 73447,
+  "<|im_end|>": 73440,
+  "<|im_start|>": 73441,
+  "<|tool_call|>": 73442
+}

convert/MiniCPM4-0.5B/config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+    "_name_or_path": "openbmb/MiniCPM4-0.5B",
+    "architectures": [
+        "MiniCPMForCausalLM"
+    ],
+    "auto_map": {
+        "AutoConfig": "configuration_minicpm.MiniCPMConfig",
+        "AutoModel": "modeling_minicpm.MiniCPMModel",
+        "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
+        "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
+        "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
+    },
+    "bos_token_id": 1,
+    "eos_token_id": [2, 73440],
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "initializer_range": 0.1,
+    "intermediate_size": 4096,
+    "max_position_embeddings": 32768,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": {
+        "rope_type": "longrope",
+        "long_factor": [1.0004360675811768, 1.0668443441390991, 1.1631425619125366, 1.3025742769241333, 1.5040205717086792, 1.7941505908966064, 2.2101221084594727, 2.802666664123535, 3.6389970779418945, 4.804192543029785, 6.39855432510376, 8.527148246765137, 11.277542114257812, 14.684998512268066, 18.69317054748535, 23.13019371032715, 27.72362518310547, 32.1606559753418, 36.168827056884766, 39.57627868652344, 42.32667541503906, 44.45526885986328, 46.04962921142578, 47.21482849121094, 48.05115509033203, 48.64370346069336, 49.05967712402344, 49.34980392456055, 49.551246643066406, 49.69068145751953, 49.78697967529297, 49.85338592529297],
+        "short_factor": [1.0004360675811768, 1.0668443441390991, 1.1631425619125366, 1.3025742769241333, 1.5040205717086792, 1.7941505908966064, 2.2101221084594727, 2.802666664123535, 3.6389970779418945, 4.804192543029785, 6.39855432510376, 8.527148246765137, 11.277542114257812, 14.684998512268066, 18.69317054748535, 23.13019371032715, 27.72362518310547, 32.1606559753418, 36.168827056884766, 39.57627868652344, 42.32667541503906, 44.45526885986328, 46.04962921142578, 47.21482849121094, 48.05115509033203, 48.64370346069336, 49.05967712402344, 49.34980392456055, 49.551246643066406, 49.69068145751953, 49.78697967529297, 49.85338592529297],
+        "original_max_position_embeddings": 32768
+    },
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.46.3",
+    "use_cache": true,
+    "vocab_size": 73448,
+    "scale_emb": 12,
+    "dim_model_base": 256,
+    "scale_depth": 1.4
+}

convert/MiniCPM4-0.5B/configuration_minicpm.py ADDED Viewed

	@@ -0,0 +1,203 @@

+# coding=utf-8
+# Copyright 2025 The OpenBMB Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" MiniCPM model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+class MiniCPMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MiniCPMModel`]. It is used to instantiate an MiniCPM
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the MiniCPM-7B.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the MiniCPM model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MiniCPMModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens,
+            MiniCPM 2 up to 4096, CodeMiniCPM up to 16384.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. It is stored exactly as given:
+            the call to `_rope_scaling_validation` in `__init__` is commented out, so no validation is performed.
+            That validator only understands the legacy format `{"type": strategy name, "factor": scaling factor}`
+            with strategy `linear` or `dynamic` and a float factor greater than 1. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalMiniCPM/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        scale_emb (`int` or `float`, *optional*, defaults to 1):
+            Stored unchanged as `config.scale_emb`. Presumably an embedding scale used by the modeling code;
+            confirm against the model implementation.
+        dim_model_base (`int`, *optional*, defaults to 1):
+            Stored unchanged as `config.dim_model_base`. Its consumer is not part of this file.
+        scale_depth (`int` or `float`, *optional*, defaults to 1):
+            Stored unchanged as `config.scale_depth`. Its consumer is not part of this file.
+        mup_denominator (*optional*):
+            Stored unchanged as `config.mup_denominator`. Only used for the Eagle Head.
+        sparse_config (*optional*):
+            Sparse configuration, stored unchanged as `config.sparse_config`.
+    ```python
+    >>> from transformers import MiniCPMModel, MiniCPMConfig
+    >>> # Initializing a MiniCPM minicpm-7b style configuration
+    >>> configuration = MiniCPMConfig()
+    >>> # Initializing a model from the minicpm-7b style configuration
+    >>> model = MiniCPMModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = 'minicpm'
+    keys_to_ignore_at_inference = ['past_key_values']
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act='silu',
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        scale_emb=1,
+        dim_model_base=1,
+        scale_depth=1,
+        mup_denominator=None,
+        sparse_config=None,
+        **kwargs):
+        """Store the architecture hyper-parameters and forward token ids / weight tying to `PretrainedConfig`."""
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validation is intentionally disabled: the validator below only accepts the two-key
+        # {"type", "factor"} format.
+        # self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.scale_emb = scale_emb
+        self.dim_model_base = dim_model_base
+        self.scale_depth = scale_depth
+        # only used for Eagle Head
+        self.mup_denominator = mup_denominator
+        # sparse config
+        self.sparse_config = sparse_config
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        try:
+            # Availability probe only: the imported module itself is not used here.
+            import flash_attn  # noqa: F401
+            self._attn_implementation = 'flash_attention_2'
+        except Exception:
+            # Best effort: if flash-attn is missing or unusable, keep whatever attention
+            # implementation the base class selected. `Exception` (rather than a bare
+            # `except`) lets KeyboardInterrupt / SystemExit propagate.
+            pass
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+
+        Accepts `None`, or a dict with exactly two entries: `type` (one of `linear`, `dynamic`) and `factor`
+        (a float strictly greater than 1). Not called from `__init__` at present.
+
+        Raises:
+            ValueError: if `rope_scaling` is not in the format described above.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                '`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, '
+                f'got {self.rope_scaling}'
+            )
+        rope_scaling_type = self.rope_scaling.get('type', None)
+        rope_scaling_factor = self.rope_scaling.get('factor', None)
+        if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic']:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")

convert/MiniCPM4-0.5B/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": [
+    2,
+    73440
+  ],
+  "pad_token_id": 2,
+  "temperature": 0.8,
+  "top_p": 0.8,
+  "transformers_version": "4.46.1"
+}

convert/MiniCPM4-0.5B/modeling_minicpm.py ADDED Viewed

	@@ -0,0 +1,1514 @@

+# coding=utf-8
+# Copyright 2025 The OpenBMB Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch MiniCPM model."""
+import math
+import re
+import warnings
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache, CacheLayerMixin, DynamicLayer
+from transformers.modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+    _prepare_4d_attention_mask,
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from transformers.utils.import_utils import is_torch_fx_available
+from .configuration_minicpm import MiniCPMConfig
+# flash-attn is an optional dependency; when it cannot be imported the names below stay
+# undefined and only the code paths that call them will fail.
+try:
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+except Exception:
+    # Still best effort, but unlike a bare `except:` this does not swallow
+    # KeyboardInterrupt / SystemExit raised while importing.
+    pass
+# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
+# It means that the function will not be traced through and simply appear as a node in the graph.
+if is_torch_fx_available():
+    if not is_torch_greater_or_equal_than_1_13:
+        import torch.fx
+    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = 'MiniCPMConfig'
+def _get_unpad_data(attention_mask):
+    """Derive the indexing data needed to drop padded positions from a batch.
+
+    Args:
+        attention_mask: mask whose last dimension is summed per row and whose
+            non-zero entries mark positions to keep (assumed `(batch, seq_len)` - TODO confirm).
+
+    Returns:
+        A tuple `(indices, cu_seqlens, max_seqlen_in_batch)`:
+        `indices` are the positions of the non-zero entries in the flattened mask,
+        `cu_seqlens` is the int32 cumulative sum of the per-row lengths with a leading 0,
+        `max_seqlen_in_batch` is the largest per-row length as a Python int.
+    """
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    # Prepend a zero so that cu_seqlens[i]:cu_seqlens[i + 1] delimits row i.
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+# @torch.jit.script  # type: ignore
+def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
+    """Apply RMS normalisation over the last dimension and scale by `weight`.
+
+    The mean of squares is computed in float32; the normalised activations are cast
+    back to the dtype of `hidden` before being multiplied by `weight`.
+    """
+    input_dtype = hidden.dtype
+    mean_square = hidden.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
+    inv_rms = torch.rsqrt(mean_square + eps)
+    normalized = (hidden * inv_rms).to(input_dtype)
+    return normalized * weight
+class MiniCPMRMSNorm(nn.Module):
+    """RMS normalisation layer with a learnable scale (equivalent to T5LayerNorm).
+
+    Holds a `weight` parameter of size `hidden_size` initialised to ones and the
+    epsilon passed to `rms_layernorm`.
+    """
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+    def forward(self, hidden_states):
+        """Normalise `hidden_states` with the module's weight and epsilon."""
+        return rms_layernorm(hidden=hidden_states, weight=self.weight, eps=self.variance_epsilon)
+ALL_LAYERNORM_LAYERS.append(MiniCPMRMSNorm)
+class MiniCPMRotaryEmbedding(nn.Module):
+    """Rotary position embedding with cached cos/sin tables.
+
+    `inv_freq` is `1 / base ** (2i / dim)` for the even indices `2i` below `dim`. The cos/sin
+    tables are built for `max_position_embeddings` positions at construction time and are
+    rebuilt whenever `forward` is asked for a longer sequence.
+    """
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer('inv_freq', inv_freq, persistent=False)
+        # Build here to make `torch.jit.trace` work.
+        # The initial tables are always float32, independent of the default dtype.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.float32
+        )
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the `cos_cached` / `sin_cached` buffers for `seq_len` positions in `dtype`."""
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
+        self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
+    def forward(self, x, seq_len=None):
+        """Return the first `seq_len` rows of the cos and sin tables, cast to `x.dtype`.
+
+        Args:
+            x: tensor laid out as [bs, num_attention_heads, seq_len, head_size]; only its
+                dtype, device and (when `seq_len` is omitted) its second-to-last dimension are used.
+            seq_len: number of positions required. Defaults to `x.shape[-2]`.
+
+        Returns:
+            Tuple `(cos, sin)`, each of shape `[seq_len, dim]`.
+        """
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len is None:
+            # The signature advertises None as the default, so fall back to the sequence
+            # axis of `x` instead of failing on `None > int`.
+            seq_len = x.shape[-2]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+class MiniCPMLongRoPE(MiniCPMRotaryEmbedding):
+    """MiniCPMRotaryEmbedding variant that rescales each rotary frequency with per-dimension factors.
+
+    `long_factor` is used when the cached length exceeds `original_max_position_embeddings`,
+    `short_factor` otherwise. The resulting cos/sin tables are additionally multiplied by
+    `scaling_factor = sqrt(1 + log(max_position_embeddings / original) / log(original))`.
+    """
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, short_factor=None, long_factor=None, original_max_position_embeddings=None):
+        # These attributes must exist before the base constructor runs, because it
+        # immediately builds the cache through `_set_cos_sin_cache`.
+        self.short_factor = short_factor
+        self.long_factor = long_factor
+        self.original_max_position_embeddings = original_max_position_embeddings
+        scale = max_position_embeddings / self.original_max_position_embeddings
+        log_ratio = math.log(scale) / math.log(self.original_max_position_embeddings)
+        self.scaling_factor = math.sqrt(1 + log_ratio)
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the scaled cos/sin buffers for `seq_len` positions in `dtype`."""
+        self.max_seq_len_cached = seq_len
+        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        beyond_original = seq_len > self.original_max_position_embeddings
+        factor_values = self.long_factor if beyond_original else self.short_factor
+        ext_factors = torch.tensor(factor_values, dtype=torch.float32, device=device)
+        scaled_positions = torch.outer(positions, 1.0 / ext_factors).to(device=device)
+        inv_freq = self.inv_freq.to(device=device).to(dtype)
+        freqs = torch.mul(scaled_positions, inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer('cos_cached', emb.cos().to(dtype) * self.scaling_factor, persistent=False)
+        self.register_buffer('sin_cached', emb.sin().to(dtype) * self.scaling_factor, persistent=False)
+class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
+    """MiniCPMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev
+
+    Position indices are divided by `scaling_factor` before the frequencies are computed.
+    """
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        # Needed by `_set_cos_sin_cache`, which the base constructor calls.
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the cos/sin buffers for `seq_len` linearly rescaled positions in `dtype`."""
+        self.max_seq_len_cached = seq_len
+        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        scaled_positions = positions / self.scaling_factor
+        freqs = torch.outer(scaled_positions, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
+        self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
+class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
+    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
+
+    When the requested length exceeds `max_position_embeddings`, the rotary base is enlarged
+    and `inv_freq` is recomputed before the cos/sin tables are rebuilt.
+    """
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        # Needed by `_set_cos_sin_cache`, which the base constructor calls.
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the cos/sin buffers for `seq_len` positions in `dtype`."""
+        self.max_seq_len_cached = seq_len
+        if seq_len > self.max_position_embeddings:
+            growth = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            exponent = self.dim / (self.dim - 2)
+            base = self.base * growth ** exponent
+            channel_index = torch.arange(0, self.dim, 2).float().to(device)
+            inv_freq = 1.0 / (base ** (channel_index / self.dim))
+            self.register_buffer('inv_freq', inv_freq, persistent=False)
+        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(positions, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
+        self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
+def rotate_half(x):
+    """Split the last dimension at its midpoint and return `cat(-second_part, first_part)`."""
+    midpoint = x.shape[-1] // 2
+    first_part, second_part = x[..., :midpoint], x[..., midpoint:]
+    return torch.cat((-second_part, first_part), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary cos/sin tables.
+
+    The rotation is evaluated on float32 copies of `q` and `k`; both results are cast
+    back to the dtype of `k`.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            Indices used to select rows of `cos` and `sin`, e.g. offset position ids when a KV-cache is in use.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos[position_ids]` and `sin[position_ids]` so they broadcast against `q` and
+            `k`. With `cos[position_ids]` of shape [batch_size, seq_len, head_dim], use 1 for q/k laid out as
+            [batch_size, heads, seq_len, head_dim] and 2 for [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    result_dtype = k.dtype
+    cos_rows = cos[position_ids].unsqueeze(unsqueeze_dim)  # [bs, 1, seq_len, dim]
+    sin_rows = sin[position_ids].unsqueeze(unsqueeze_dim)  # [bs, 1, seq_len, dim]
+
+    def _rotate(tensor):
+        # Do the arithmetic in float32, then restore the key dtype.
+        tensor_fp32 = tensor.to(dtype=torch.float32, device=tensor.device)
+        rotated = (tensor_fp32 * cos_rows) + (rotate_half(tensor_fp32) * sin_rows)
+        return rotated.to(dtype=result_dtype)
+
+    q_embed = _rotate(q)
+    k_embed = _rotate(k)
+    return q_embed, k_embed
+class MiniCPMMLP(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`.
+
+    All three projections are bias-free linear layers. When `config.pretraining_tp > 1`
+    the same computation is carried out on weight shards and recombined.
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        """Apply the gated MLP to `x`, sharded when `pretraining_tp > 1`."""
+        tp = self.config.pretraining_tp
+        if not tp > 1:
+            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        shard_width = self.intermediate_size // tp
+        gate_shards = self.gate_proj.weight.split(shard_width, dim=0)
+        up_shards = self.up_proj.weight.split(shard_width, dim=0)
+        down_shards = self.down_proj.weight.split(shard_width, dim=1)
+        # Only the first `tp` shards take part, as in a per-index loop over range(tp).
+        gate_out = torch.cat([F.linear(x, shard) for shard in gate_shards[:tp]], dim=-1)
+        up_out = torch.cat([F.linear(x, shard) for shard in up_shards[:tp]], dim=-1)
+        hidden_shards = (self.act_fn(gate_out) * up_out).split(shard_width, dim=2)
+        partial_outputs = [
+            F.linear(hidden_shards[i], down_shards[i]) for i in range(tp)
+        ]
+        return sum(partial_outputs)
+def _unpad_one_tensor(hidden_states, attention_mask):
+    """Flatten the first two dimensions of `hidden_states` and keep only the unmasked rows.
+
+    Returns:
+        Tuple `(unpadded_states, indices, cu_seqlens, max_seqlen_in_batch)` where the last
+        three values come straight from `_get_unpad_data(attention_mask)`.
+    """
+    indices, cu_seqlens, max_seqlen_in_batch = _get_unpad_data(attention_mask)
+    batch_size, seq_len = hidden_states.shape[:2]
+    trailing_dims = hidden_states.shape[2:]
+    # Collapse (batch, seq) into one axis so that `indices` can address rows directly.
+    flattened = hidden_states.reshape(batch_size * seq_len, *trailing_dims)
+    return index_first_axis(flattened, indices), indices, cu_seqlens, max_seqlen_in_batch
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along dim 1, matching
+    torch.repeat_interleave(x, dim=1, repeats=n_rep): (batch, num_key_value_heads, seqlen, head_dim)
+    becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim). The input is returned as-is when n_rep == 1.
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep != 1:
+        # Insert a repeat axis after the head axis, broadcast it, then fold it into the heads.
+        expanded = hidden_states.unsqueeze(2).expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+        hidden_states = expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+    return hidden_states
+class MiniCPMAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f'Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will '
+                'to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` '
+                'when creating this class.'
+            )
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
+                f' and `num_heads`: {self.num_heads}).'
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self._init_rope()
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = MiniCPMRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling['rope_type']
+            scaling_factor = self.config.rope_scaling.get('factor', None)
+            if scaling_type == 'linear':
+                self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == 'dynamic':
+                self.rotary_emb = MiniCPMDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == 'longrope':
+                self.rotary_emb = MiniCPMLongRoPE(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    short_factor=self.config.rope_scaling['short_factor'],
+                    long_factor=self.config.rope_scaling['long_factor'],
+                    base=self.rope_theta,
+                    original_max_position_embeddings=self.config.rope_scaling['original_max_position_embeddings']
+                )
+            else:
+                raise ValueError(f'Unknown RoPE scaling type {scaling_type}')
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Eager (matmul + softmax) self-attention with rotary position embeddings.
+
+        Args:
+            hidden_states: 3-D input unpacked as ``(bsz, q_len, hidden)``.
+            attention_mask: optional additive mask; must have size
+                ``(bsz, 1, q_len, kv_seq_len)`` and is added to the raw scores.
+            position_ids: token positions. Although annotated ``Optional``, this
+                implementation calls ``position_ids.max()`` unconditionally, so
+                ``None`` is not usable here.
+            past_key_value: optional cache; when given, its ``update`` method
+                supplies the key/value states actually attended over.
+            output_attentions: when false, the returned weights are ``None``.
+            use_cache: accepted for interface compatibility; not read in this body.
+            **kwargs: only inspected for the deprecated ``padding_mask`` key, which
+                triggers a warning and is otherwise ignored.
+
+        Returns:
+            ``(attn_output, attn_weights_or_None, past_key_value)``.
+
+        Raises:
+            ValueError: if the score, mask or output tensors do not have the
+                expected sizes.
+        """
+        if 'padding_mask' in kwargs:
+            warnings.warn(
+                'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`'
+            )
+        bsz, q_len, _ = hidden_states.size()
+        if self.config.pretraining_tp > 1:
+            # Tensor-parallel emulation: split each projection weight into
+            # `pretraining_tp` row slices, project per slice, then concatenate.
+            # Note that F.linear is called without a bias on this path.
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            query_slices = self.q_proj.weight.split(
+                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+            )
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        # Split the projection width into heads: (bsz, heads, q_len, head_dim).
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # The key/value length is taken from the largest position id rather than
+        # from the cache; `.item()` turns it into a Python number.
+        kv_seq_len = position_ids.max().item() + 1
+        cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            cache_kwargs = {'sin': sin, 'cos': cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Expand the key/value heads by `num_key_value_groups` so they line up
+        # with the query heads (grouped-query attention).
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
+                f' {attn_weights.size()}'
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
+                )
+            attn_weights = attn_weights + attention_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
+                f' {attn_output.size()}'
+            )
+        # Merge the heads back: (bsz, q_len, hidden_size).
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        if self.config.pretraining_tp > 1:
+            # Sliced output projection matching the sliced input projections above.
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class MiniCPMFlashAttention2(MiniCPMAttention):
+    """
+    MiniCPM flash attention module. This module inherits from `MiniCPMAttention` as the weights of the module stay
+    untouched. The only required change is in the forward pass, which must call the public API of flash attention
+    correctly and deal with padding tokens in case the input contains any of them.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Self-attention computed through the flash-attention kernels.
+
+        ``output_attentions`` is forced to ``False``, so the second element of the
+        returned tuple is always ``None``. A deprecated ``padding_mask`` keyword,
+        if present, replaces ``attention_mask``. ``position_ids`` must not be
+        ``None`` because ``position_ids.max()`` is called unconditionally.
+
+        Returns:
+            ``(attn_output, None, past_key_value)``.
+        """
+        # MiniCPMFlashAttention2 attention does not support output_attentions
+        if 'padding_mask' in kwargs:
+            warnings.warn(
+                'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`'
+            )
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop('padding_mask')
+        output_attentions = False
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # Rotary embedding and the KV cache work on the layout
+        # batch_size x num_heads x seq_length x head_dim, so transpose into it here;
+        # the tensors are transposed back to flash attention's layout further down.
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # Key/value length is derived from the largest position id.
+        kv_seq_len = position_ids.max().item() + 1
+        cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            cache_kwargs = {'sin': sin, 'cos': cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        dropout_rate = self.attention_dropout if self.training else 0.0
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (MiniCPMRMSNorm handles it correctly)
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            # Handle the case where the model is quantized
+            if hasattr(self.config, '_pre_quantization_dtype'):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+            logger.warning_once(
+                f'The input hidden states seems to be silently casted in float32, this might be related to'
+                f' the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in'
+                f' {target_dtype}.'
+            )
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
+        )
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+        # output_attentions was forced to False above, so this always runs.
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if an attention mask is given, first unpad the input, then
+        compute the attention output and pad it back to the batch layout.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            query_length (`int`):
+                Number of query positions; used when unpadding and re-padding the output
+            dropout (`float`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in MiniCPMFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+        # A mask was supplied, so the batch may contain padding: take the variable-length path
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+        return attn_output
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        """Strip padded positions from query/key/value ahead of the varlen kernel.
+
+        Keys and values are flattened over (batch, sequence) and gathered with the
+        indices derived from ``attention_mask``. The query is handled in one of
+        three ways depending on ``query_length``: same length as the keys (reuse
+        the key indices), a single token per sequence, or any other length (the
+        mask is cut to its last ``query_length`` columns).
+
+        Returns:
+            ``(query, key, value, indices_q, (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_q, max_seqlen_k))``.
+        """
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            # Query and keys cover the same positions: share the key-side metadata.
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            # One query token per sequence: cumulative lengths are simply 0..batch_size.
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+class MiniCPMSdpaAttention(MiniCPMAttention):
+    """
+    MiniCPM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `MiniCPMAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+    # Adapted from MiniCPMAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                'MiniCPMModel is using MiniCPMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, '
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = position_ids.max().item() + 1
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            cache_kwargs = {'sin': sin, 'cos': cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
+                )
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == 'cuda' and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None, past_key_value
+# Maps an attention-implementation name to the attention class built for it;
+# MiniCPMDecoderLayer indexes this table with `config._attn_implementation`.
+MINICPM_ATTENTION_CLASSES = {
+    'eager': MiniCPMAttention,
+    'flash_attention_2': MiniCPMFlashAttention2,
+    'sdpa': MiniCPMSdpaAttention,
+}
+class MiniCPMDecoderLayer(nn.Module):
+    def __init__(self, config: MiniCPMConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = MINICPM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+        self.mlp = MiniCPMMLP(config)
+        self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.scale_depth = config.scale_depth
+        self.num_hidden_layers = config.num_hidden_layers
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+        if 'padding_mask' in kwargs:
+            warnings.warn(
+                'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`'
+            )
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+# Class-level docstring text handed to `add_start_docstrings` on the model classes below.
+MINICPM_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`MiniCPMConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    'The bare MiniCPM Model outputting raw hidden-states without any specific head on top.',
+    MINICPM_START_DOCSTRING,
+)
+class MiniCPMPreTrainedModel(PreTrainedModel):
+    config_class = MiniCPMConfig
+    base_model_prefix = 'model'
+    supports_gradient_checkpointing = True
+    _no_split_modules = ['MiniCPMDecoderLayer']
+    _skip_keys_device_placement = 'past_key_values'
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+MINICPM_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+@add_start_docstrings(
+    'The bare MiniCPM Model outputting raw hidden-states without any specific head on top.',
+    MINICPM_START_DOCSTRING,
+)
+class MiniCPMModel(MiniCPMPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
+    Args:
+        config: MiniCPMConfig
+    """
+    def __init__(self, config: MiniCPMConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._use_sdpa = config._attn_implementation == 'sdpa'
+        self._use_flash_attention_2 = config._attn_implementation == 'flash_attention_2'
+        self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        """Return the token embedding module stored in ``self.embed_tokens``."""
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module (``self.embed_tokens``) with ``value``."""
+        self.embed_tokens = value
+    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape[:2]
+        elif inputs_embeds is not None:
+            batch_size, seq_length = inputs_embeds.shape[:2]
+        else:
+            raise ValueError('You have to specify either input_ids or inputs_embeds')
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                )
+                use_cache = False
+        past_key_values_length = 0
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                raise ValueError(
+                    'You must use the new past_key_values format, such as the Cache class, instead of the old tuple format.'
+                )
+            # Calculate the usable length of past key values
+            past_key_values_length = past_key_values.get_seq_length() if isinstance(past_key_values, Cache) else 0
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            # attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+            if attention_mask is None:
+                raise ValueError(
+                    f'need attention_mask for flash attention, but got {attention_mask}.'
+                )
+        elif self._use_sdpa and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
+        # embed positions
+        hidden_states = inputs_embeds
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
+    _tied_weights_keys = ['lm_head.weight']
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = MiniCPMModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
+        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        hidden_states = hidden_states[:, slice_indices, :].contiguous()
+        if self.config.pretraining_tp > 1:
+            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+            logits = torch.cat(logits, dim=-1)
+        else:
+            logits = self.lm_head(hidden_states / (self.config.hidden_size / self.config.dim_model_base))
+        logits = logits.float()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                # Use the new Cache class methods
+                cache_length = past_key_values.get_seq_length()
+                past_length = cache_length
+                max_cache_length = None
+            else:
+                raise ValueError(
+                    'You must use the new past_key_values format, such as the Cache class, instead of the old tuple format.'
+                )
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+        position_ids = kwargs.get('position_ids', None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1]:]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {'inputs_embeds': inputs_embeds}
+        else:
+            model_inputs = {'input_ids': input_ids}
+        model_inputs.update(
+            {
+                'position_ids': position_ids,
+                'past_key_values': past_key_values,
+                'use_cache': kwargs.get('use_cache'),
+                'attention_mask': attention_mask,
+            }
+        )
+        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+    @torch.inference_mode()
+    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = 'user',
+             max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None,
+             **kwargs):
+        if history is None:
+            history = []
+        if logits_processor:
+            gen_kwargs = {
+                'max_length': max_length,
+                'num_beams': num_beams,
+                'do_sample': do_sample,
+                'top_p': top_p,
+                'temperature': temperature,
+                'logits_processor': logits_processor,
+                **kwargs
+            }
+        else:
+            gen_kwargs = {
+                'max_length': max_length,
+                'num_beams': num_beams,
+                'do_sample': do_sample,
+                'top_p': top_p,
+                'temperature': temperature,
+                'logits_processor': logits_processor,
+                **kwargs
+            }
+        history.append({'role': role, 'content': query})
+        history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False)
+        inputs = tokenizer(history_str, return_tensors='pt').to(self.device)
+        outputs = self.generate(**inputs, **gen_kwargs)
+        outputs = outputs.tolist()[0][len(inputs['input_ids'][0]):-1]
+        response = tokenizer.decode(outputs)
+        pattern = re.compile(r'.*?(?=<AI>|<用户>)', re.DOTALL)
+        matches = pattern.findall(response)
+        if len(matches) > 0:
+            response = matches[0]
+        history.append({'role': 'assistant', 'content': response})
+        return response, history
+@add_start_docstrings(
+    """
+    The MiniCPM Model transformer with a sequence classification head on top (linear layer).
+    [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    MINICPM_START_DOCSTRING,
+)
+class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = MiniCPMModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError('Cannot handle batch sizes > 1 if no padding token is defined.')
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
+                    logits.device
+                )
+            else:
+                sequence_lengths = -1
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = 'regression'
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = 'single_label_classification'
+                else:
+                    self.config.problem_type = 'multi_label_classification'
+            if self.config.problem_type == 'regression':
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )

convert/MiniCPM4-0.5B/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_start|>",
+    "<|tool_call|>",
+    "<|execute_start|>",
+    "<|execute_end|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

convert/MiniCPM4-0.5B/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

convert/MiniCPM4-0.5B/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb74d51116831c3bf65db812c553f94ab0c88dcf97a5bbb37e3504f6d359c530
+size 1181204

convert/MiniCPM4-0.5B/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,117 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73440": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73441": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73442": {
+      "content": "<|tool_call|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73443": {
+      "content": "<|execute_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73444": {
+      "content": "<|execute_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73445": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73446": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73447": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_end|>",
+    "<|im_start|>",
+    "<|tool_call|>",
+    "<|execute_start|>",
+    "<|execute_end|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>"
+  ],
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

convert/README.md ADDED Viewed

	@@ -0,0 +1,53 @@

+# 模型转换
+1. 测试可用的依赖版本如下：
+```
+torch==2.10.0
+transformers==4.57.6
+onnx==1.18.0
+onnxruntime==1.22.0
+einops==0.8.2
+rknn-toolkit2==2.3.2
+rkllm-toolkit==1.2.3
+```
+2. 下载模型
+从`https://huggingface.co/openbmb/VoxCPM-0.5B`下载模型，保存到`./VoxCPM-0.5B`文件夹。
+3. 转换模型
+```bash
+python scripts/build_rk3588_pipeline.py
+```
+转换后的模型会放置在`build/rk3588/final_models/`。
+---
+# Model Conversion
+1. Tested dependency versions:
+```
+torch==2.10.0
+transformers==4.57.6
+onnx==1.18.0
+onnxruntime==1.22.0
+einops==0.8.2
+rknn-toolkit2==2.3.2
+rkllm-toolkit==1.2.3
+```
+2. Download the model
+Download the model from `https://huggingface.co/openbmb/VoxCPM-0.5B` and save it to the `./VoxCPM-0.5B` directory.
+3. Convert the model
+```bash
+python scripts/build_rk3588_pipeline.py
+```
+The converted models will be placed in `build/rk3588/final_models/`.

convert/scripts/build_rk3588_pipeline.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import argparse
+import contextlib
+import json
+import os
+from pathlib import Path
+import shutil
+import subprocess
+from rknn.api import RKNN
+# Directory that contains `scripts/` (the parent of this script's own directory).
+REPO_ROOT = Path(__file__).resolve().parent.parent
+# Source tree that `export_onnx` prepends to PYTHONPATH for the ONNX export subprocess.
+SRC_DIR = REPO_ROOT / "src"
+# File names that `sync_hf_support_files` copies from the MiniCPM directory into an output directory;
+# names that do not exist in the source directory are skipped.
+TOKENIZER_SUPPORT_FILES = [
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "tokenizer.model",
+    "special_tokens_map.json",
+    "added_tokens.json",
+    "generation_config.json",
+    "README.md",
+    "modeling_minicpm.py",
+    "configuration_minicpm.py",
+]
+# One entry per RKNN model to build. `convert_one_rknn` unpacks each tuple as
+# (onnx_name, rknn_name, inputs, input_size_list, dynamic_input):
+#   onnx_name        - ONNX file name looked up in the ONNX directory
+#   rknn_name        - output file name written to the RKNN directory
+#   inputs           - input names passed to `load_onnx`
+#   input_size_list  - one size list per input, passed to `load_onnx`
+#   dynamic_input    - value passed to `RKNN.config(dynamic_input=...)`, or None
+# `locenc.onnx` appears twice and is built into two files with different input sizes.
+RKNN_SPECS = [
+    ("audio_vae_encode.onnx", "audio_vae_encode.rknn", ["audio_wave"], [[1, 1, 40960]], None),
+    ("audio_vae_decode.onnx", "audio_vae_decode.rknn", ["latent"], [[1, 64, 64]], None),
+    ("locenc.onnx", "locenc_64.rknn", ["x"], [[1, 64, 2, 64]], None),
+    ("locenc.onnx", "locenc_1.rknn", ["x"], [[1, 1, 2, 64]], None),
+    ("fsq_layer.onnx", "fsq_layer.rknn", ["hidden"], [[1, 64, 1024]], [[[1, 64, 1024]], [[1, 1, 1024]]]),
+    ("stop_head.onnx", "stop_head.rknn", ["hidden"], [[1, 1024]], None),
+    ("lm_to_dit_proj.onnx", "lm_to_dit_proj.rknn", ["input"], [[1, 1024]], None),
+    ("res_to_dit_proj.onnx", "res_to_dit_proj.rknn", ["input"], [[1, 1024]], None),
+    ("dit_step.onnx", "dit_step.rknn", ["x", "mu", "t", "cond", "dt"], [[1, 64, 2], [1, 1024], [1], [1, 64, 2], [1]], None),
+]
+def run(cmd: list[str], *, cwd: Path | None = None, env: dict[str, str] | None = None):
+    print("+", " ".join(cmd))
+    subprocess.run(cmd, cwd=cwd, env=env, check=True)
+@contextlib.contextmanager
+def pushd(path: Path):
+    prev = Path.cwd()
+    os.chdir(path)
+    try:
+        yield
+    finally:
+        os.chdir(prev)
+def ensure_dir(path: Path):
+    path.mkdir(parents=True, exist_ok=True)
+def copy_if_exists(src: Path, dst: Path):
+    if src.exists():
+        shutil.copy2(src, dst)
+def sync_hf_support_files(minicpm_dir: Path, target_dir: Path):
+    ensure_dir(target_dir)
+    metadata_json = target_dir / "configuration.json"
+    if metadata_json.exists():
+        metadata_json.unlink()
+    for name in TOKENIZER_SUPPORT_FILES:
+        copy_if_exists(minicpm_dir / name, target_dir / name)
+def patch_hf_config(reference_config_path: Path, target_config_path: Path, architecture: str):
+    reference = json.loads(reference_config_path.read_text())
+    target = json.loads(target_config_path.read_text())
+    if "auto_map" in reference:
+        target["auto_map"] = reference["auto_map"]
+    target["architectures"] = [architecture]
+    target_config_path.write_text(json.dumps(target, indent=2, ensure_ascii=False) + "\n")
+def export_onnx(model_dir: Path, onnx_dir: Path):
+    ensure_dir(onnx_dir)
+    env = os.environ.copy()
+    env["PYTHONPATH"] = str(SRC_DIR) + (os.pathsep + env["PYTHONPATH"] if env.get("PYTHONPATH") else "")
+    run(
+        [
+            "python",
+            str(REPO_ROOT / "scripts" / "export_onnx.py"),
+            "--model-dir",
+            str(model_dir),
+            "--out-dir",
+            str(onnx_dir),
+            "--dump-embeddings",
+        ],
+        cwd=REPO_ROOT,
+        env=env,
+    )
+def convert_one_rknn(
+    onnx_dir: Path,
+    rknn_dir: Path,
+    spec: tuple[str, str, list[str], list[list[int]] | None, list[list[list[int]]] | None],
+    target_platform: str,
+):
+    onnx_name, rknn_name, inputs, input_size_list, dynamic_input = spec
+    onnx_path = onnx_dir / onnx_name
+    out_path = rknn_dir / rknn_name
+    ensure_dir(rknn_dir)
+    if not onnx_path.exists():
+        raise FileNotFoundError(f"Missing ONNX file: {onnx_path}")
+    rknn = RKNN(verbose=False)
+    ret = rknn.config(target_platform=target_platform, dynamic_input=dynamic_input)
+    if ret != 0:
+        raise RuntimeError(f"RKNN config failed for {onnx_name}, ret={ret}")
+    load_kwargs = {"model": str(onnx_path)}
+    if input_size_list is not None:
+        load_kwargs["inputs"] = inputs
+        load_kwargs["input_size_list"] = input_size_list
+    ret = rknn.load_onnx(**load_kwargs)
+    if ret != 0:
+        raise RuntimeError(f"RKNN load_onnx failed for {onnx_name}, ret={ret}")
+    ret = rknn.build(do_quantization=False)
+    if ret != 0:
+        raise RuntimeError(f"RKNN build failed for {onnx_name}, ret={ret}")
+    ret = rknn.export_rknn(str(out_path))
+    if ret != 0:
+        raise RuntimeError(f"RKNN export failed for {out_path}, ret={ret}")
+    rknn.release()
+def export_rknn(onnx_dir: Path, rknn_dir: Path, target_platform: str):
+    ensure_dir(rknn_dir)
+    copy_if_exists(onnx_dir / "embed_tokens.npy", rknn_dir / "embed_tokens.npy")
+    with pushd(rknn_dir):
+        for spec in RKNN_SPECS:
+            convert_one_rknn(onnx_dir, rknn_dir, spec, target_platform)
def collect_final_models(build_dir: Path):
    """Gather the deployable artifacts into ``<build_dir>/final_models``.

    RKNN models and the embedding table keep their file names; the two RKLLM
    language models are renamed to ``base_lm.rkllm`` and ``residual_lm.rkllm``.
    Copies go through ``copy_if_exists`` (defined elsewhere in this file).
    """
    final_dir = build_dir / "final_models"
    ensure_dir(final_dir)

    rknn_dir = build_dir / "rknn"
    rknn_artifacts = (
        "audio_vae_encode.rknn",
        "audio_vae_decode.rknn",
        "locenc_64.rknn",
        "locenc_1.rknn",
        "fsq_layer.rknn",
        "stop_head.rknn",
        "lm_to_dit_proj.rknn",
        "res_to_dit_proj.rknn",
        "dit_step.rknn",
        "embed_tokens.npy",
    )
    for artifact in rknn_artifacts:
        copy_if_exists(rknn_dir / artifact, final_dir / artifact)

    rkllm_dir = build_dir / "rkllm"
    for variant in ("base", "residual"):
        copy_if_exists(
            rkllm_dir / variant / "language_model.rkllm",
            final_dir / f"{variant}_lm.rkllm",
        )
def convert_vox_to_hf(vox_config: Path, vox_state: Path, minicpm_dir: Path, base_out: Path, residual_out: Path):
    """Run the Vox -> Hugging Face conversion script and finalize both outputs.

    Args:
        vox_config: Path to the VoxCPM ``config.json``.
        vox_state: Path to the VoxCPM checkpoint file.
        minicpm_dir: Reference MiniCPM4 directory (support files and config).
        base_out: Output directory for the base LM checkpoint.
        residual_out: Output directory for the residual LM checkpoint.

    After the script finishes, support files are synced from ``minicpm_dir``
    into both output directories and each ``config.json`` is patched via
    ``patch_hf_config`` with ``"MiniCPMForCausalLM"`` (base) or
    ``"MiniCPMModel"`` (residual).
    """
    import sys

    ensure_dir(base_out)
    ensure_dir(residual_out)
    run(
        [
            # Use the interpreter running this pipeline instead of whatever
            # "python" resolves to on PATH (it may be missing or belong to a
            # different environment without torch installed).
            sys.executable,
            str(REPO_ROOT / "scripts" / "convert_vox_minicpm_to_hf.py"),
            "--vox-config",
            str(vox_config),
            "--vox-state",
            str(vox_state),
            "--minicpm-dir",
            str(minicpm_dir),
            "--out-dir",
            str(base_out),
            "--out-residual-dir",
            str(residual_out),
        ],
        cwd=REPO_ROOT,
    )
    sync_hf_support_files(minicpm_dir, base_out)
    sync_hf_support_files(minicpm_dir, residual_out)
    patch_hf_config(minicpm_dir / "config.json", base_out / "config.json", "MiniCPMForCausalLM")
    patch_hf_config(minicpm_dir / "config.json", residual_out / "config.json", "MiniCPMModel")
def export_rkllm(hf_dir: Path, out_path: Path, target_platform: str, num_npu_core: int):
    """Export one Hugging Face model directory to ``.rkllm`` in a child process.

    Args:
        hf_dir: Hugging Face format model directory to convert.
        out_path: Destination ``.rkllm`` file. The Hugging Face cache is placed
            three directory levels above it, under ``cache/huggingface``.
        target_platform: Platform string forwarded to the export script.
        num_npu_core: NPU core count forwarded to the export script.

    The cache location is passed both through environment variables and the
    ``--hf-home`` flag of ``scripts/export_rkllm.py``.
    """
    import sys

    hf_home = out_path.parent.parent.parent / "cache" / "huggingface"
    ensure_dir(hf_home)
    env = os.environ.copy()
    env["HF_HOME"] = str(hf_home)
    env["HUGGINGFACE_HUB_CACHE"] = str(hf_home / "hub")
    env["TRANSFORMERS_CACHE"] = str(hf_home / "transformers")
    run(
        [
            # Use the interpreter running this pipeline instead of whatever
            # "python" resolves to on PATH (it may be missing or belong to a
            # different environment).
            sys.executable,
            str(REPO_ROOT / "scripts" / "export_rkllm.py"),
            "--model-dir",
            str(hf_dir),
            "--output",
            str(out_path),
            "--target-platform",
            target_platform,
            "--num-npu-core",
            str(num_npu_core),
            "--hf-home",
            str(hf_home),
        ],
        cwd=REPO_ROOT,
        env=env,
    )
def write_manifest(build_dir: Path, model_dir: Path, minicpm_dir: Path):
    """Write ``build_manifest.json`` describing where each artifact lives.

    Args:
        build_dir: Build root; the manifest is written directly inside it.
        model_dir: Source VoxCPM model directory recorded in the manifest.
        minicpm_dir: Reference MiniCPM directory recorded in the manifest.
    """
    manifest = {
        "model_dir": str(model_dir),
        "minicpm_dir": str(minicpm_dir),
        "onnx_dir": str(build_dir / "onnx"),
        "rknn_dir": str(build_dir / "rknn"),
        "hf_base_dir": str(build_dir / "hf" / "base"),
        "hf_residual_dir": str(build_dir / "hf" / "residual"),
        "rkllm_base_model": str(build_dir / "rkllm" / "base" / "language_model.rkllm"),
        "rkllm_residual_model": str(build_dir / "rkllm" / "residual" / "language_model.rkllm"),
        "output_dir": str(build_dir / "output"),
    }
    ensure_dir(build_dir)
    # ensure_ascii=False emits raw non-ASCII path characters, so the file must be
    # written as UTF-8 explicitly; the locale default encoding may be unable to
    # encode them or would produce JSON that UTF-8 readers cannot parse.
    (build_dir / "build_manifest.json").write_text(
        json.dumps(manifest, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
def main():
    """Parse the CLI options and run the enabled build stages in order.

    Stages, each skippable through its ``--skip-*`` flag: ONNX export, RKNN
    conversion, Vox->HF conversion, RKLLM export. Collecting the final models
    and writing the manifest always happen.
    """
    parser = argparse.ArgumentParser(description="Rebuild the VoxCPM RK3588 deployment artifacts from scratch.")
    parser.add_argument("--model-dir", default="VoxCPM-0.5B", help="Path to the original VoxCPM-0.5B model directory.")
    parser.add_argument("--minicpm-dir", default="MiniCPM4-0.5B", help="Path to the reference MiniCPM4-0.5B directory.")
    parser.add_argument("--build-dir", default="build/rk3588", help="Output root for rebuilt artifacts.")
    parser.add_argument("--target-platform", default="rk3588", help="RK target platform.")
    parser.add_argument("--skip-onnx", action="store_true", help="Skip ONNX export.")
    parser.add_argument("--skip-rknn", action="store_true", help="Skip RKNN conversion.")
    parser.add_argument("--skip-hf", action="store_true", help="Skip Vox->HF conversion.")
    parser.add_argument("--skip-rkllm", action="store_true", help="Skip RKLLM export.")
    args = parser.parse_args()

    def under_repo(relative: str) -> Path:
        # Relative CLI paths are anchored at the repository root.
        return (REPO_ROOT / relative).resolve()

    model_dir = under_repo(args.model_dir)
    minicpm_dir = under_repo(args.minicpm_dir)
    build_dir = under_repo(args.build_dir)

    variants = ("base", "residual")
    onnx_dir = build_dir / "onnx"
    rknn_dir = build_dir / "rknn"
    hf_dirs = {variant: build_dir / "hf" / variant for variant in variants}
    rkllm_paths = {variant: build_dir / "rkllm" / variant / "language_model.rkllm" for variant in variants}

    ensure_dir(build_dir / "output")

    if not args.skip_onnx:
        export_onnx(model_dir, onnx_dir)

    if not args.skip_rknn:
        export_rknn(onnx_dir, rknn_dir, args.target_platform)

    if not args.skip_hf:
        convert_vox_to_hf(
            vox_config=model_dir / "config.json",
            vox_state=model_dir / "pytorch_model.bin",
            minicpm_dir=minicpm_dir,
            base_out=hf_dirs["base"],
            residual_out=hf_dirs["residual"],
        )

    if not args.skip_rkllm:
        # The base LM is built for one NPU core, the residual LM for three.
        for variant, cores in (("base", 1), ("residual", 3)):
            export_rkllm(hf_dirs[variant], rkllm_paths[variant], args.target_platform, num_npu_core=cores)

    collect_final_models(build_dir)
    write_manifest(build_dir, model_dir, minicpm_dir)
    print(f"Saved: {build_dir / 'build_manifest.json'}")


if __name__ == "__main__":
    main()

convert/scripts/convert_vox_minicpm_to_hf.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import argparse
+import json
+import os
+import sys
+import torch
+import math
def load_vox_configs(vox_config_path: str) -> tuple[dict, dict]:
    """Return ``(base_lm_cfg, residual_cfg)`` built from a VoxCPM config file.

    Args:
        vox_config_path: Path to the VoxCPM ``config.json``; its ``lm_config``
            entry is the starting point for both returned dicts.

    Returns:
        Two independent dicts. The residual config is a deep copy of the base
        config whose ``num_hidden_layers`` is replaced by
        ``residual_lm_num_layers`` when that key is present.

    Raises:
        KeyError: ``lm_config`` (or ``num_hidden_layers`` inside it) is missing.
    """
    import copy

    # JSON is UTF-8; do not depend on the locale's default encoding.
    with open(vox_config_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    base = data["lm_config"]
    rope = base.get("rope_scaling")
    if rope:
        rope = dict(rope)
        # Vox config uses "type", transformers expects "rope_type"
        if "type" in rope and "rope_type" not in rope:
            rope["rope_type"] = rope.pop("type")
        base["rope_scaling"] = rope
    # Deep copy so the residual config does not share nested objects (such as
    # the rope_scaling dict) with the base config.
    residual = copy.deepcopy(base)
    residual["num_hidden_layers"] = data.get("residual_lm_num_layers", residual["num_hidden_layers"])
    # keep vocab_size for easier loading; Vox sets 0 because inputs_embeds are provided
    residual.setdefault("vocab_size", base.get("vocab_size"))
    # Align transformers residual scaling with Vox (no scaling when use_mup=False)
    if not base.get("use_mup", True):
        base["scale_depth"] = math.sqrt(base["num_hidden_layers"])
        residual["scale_depth"] = math.sqrt(residual["num_hidden_layers"])
    return base, residual
def build_hf_config(lm_cfg: dict, minicpm_dir: str):
    """Instantiate ``MiniCPMConfig`` from ``lm_cfg``.

    Args:
        lm_cfg: Keyword arguments for the ``MiniCPMConfig`` constructor.
        minicpm_dir: Directory containing ``configuration_minicpm.py``; it is
            put on ``sys.path`` so the module can be imported.

    Returns:
        The constructed ``MiniCPMConfig`` instance.
    """
    search_dir = str(minicpm_dir)
    # This function is called once per exported model; only register the
    # directory the first time so sys.path does not collect duplicates.
    if search_dir not in sys.path:
        sys.path.insert(0, search_dir)
    from configuration_minicpm import MiniCPMConfig
    return MiniCPMConfig(**lm_cfg)
def convert_state_dict(vox_state_path: str, lm_prefix: str) -> dict:
    """Extract one language model's weights and rename them for transformers.

    Args:
        vox_state_path: Checkpoint file; either a flat state dict or a dict
            holding one under ``"state_dict"``.
        lm_prefix: Sub-model prefix (without the trailing dot) whose entries
            are kept, e.g. ``"base_lm"``.

    Returns:
        A dict where ``<lm_prefix>.<rest>`` became ``model.<rest>``. When the
        embedding table is present it is additionally exposed as
        ``lm_head.weight`` (same tensor object).
    """
    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
    checkpoint = torch.load(vox_state_path, map_location="cpu")
    if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
        weights = checkpoint["state_dict"]
    else:
        weights = checkpoint

    dotted_prefix = f"{lm_prefix}."
    converted = {
        "model." + key[len(dotted_prefix):]: tensor
        for key, tensor in weights.items()
        if key.startswith(dotted_prefix)
    }

    # Tie lm_head to embeddings for MiniCPMForCausalLM
    embed_key = "model.embed_tokens.weight"
    if embed_key in converted:
        converted["lm_head.weight"] = converted[embed_key]
    return converted
def main():
    """Convert the base and residual LMs of a VoxCPM checkpoint.

    For each of the two models a transformers config is saved with
    ``save_pretrained`` and the renamed weights are written to
    ``pytorch_model.bin`` inside the matching output directory.
    """
    parser = argparse.ArgumentParser(description="Convert VoxCPM MiniCPM weights to transformers format")
    options = (
        (
            "--vox-config",
            "VoxCPM-0.5B/config.json",
            "Path to VoxCPM config.json (used to read lm_config)",
        ),
        (
            "--vox-state",
            "VoxCPM-0.5B/pytorch_model.bin",
            "Path to VoxCPM checkpoint containing base_lm weights",
        ),
        (
            "--minicpm-dir",
            "MiniCPM4-0.5B",
            "Path to local MiniCPM4-0.5B directory (provides configuration_minicpm.py)",
        ),
        (
            "--out-dir",
            "converted-minicpm-hf",
            "Output directory for base LM transformers-style checkpoint",
        ),
        (
            "--out-residual-dir",
            "converted-minicpm-residual-hf",
            "Output directory for residual LM checkpoint",
        ),
    )
    for flag, default, help_text in options:
        parser.add_argument(flag, default=default, help=help_text)
    args = parser.parse_args()

    for directory in (args.out_dir, args.out_residual_dir):
        os.makedirs(directory, exist_ok=True)

    base_cfg, residual_cfg = load_vox_configs(args.vox_config)

    # Base LM: config first, then weights.
    build_hf_config(base_cfg, args.minicpm_dir).save_pretrained(args.out_dir)
    print("Loaded Vox lm_config and wrote transformers config to", args.out_dir)
    base_weights = convert_state_dict(args.vox_state, lm_prefix="base_lm")
    base_weights_path = os.path.join(args.out_dir, "pytorch_model.bin")
    torch.save(base_weights, base_weights_path)
    print("Saved base LM weights to", base_weights_path)

    # Residual LM: same procedure with its own config and weight prefix.
    build_hf_config(residual_cfg, args.minicpm_dir).save_pretrained(args.out_residual_dir)
    residual_weights = convert_state_dict(args.vox_state, lm_prefix="residual_lm")
    residual_weights_path = os.path.join(args.out_residual_dir, "pytorch_model.bin")
    torch.save(residual_weights, residual_weights_path)
    print("Saved residual LM weights to", residual_weights_path)

    print("Load with MiniCPMForCausalLM.from_pretrained(...) or MiniCPMModel.from_pretrained(...).")


if __name__ == "__main__":
    main()

convert/scripts/export_onnx.py ADDED Viewed

	@@ -0,0 +1,297 @@

+import argparse
+import os
+import sys
+import torch
+from torch import nn
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+SRC_DIR = os.path.join(REPO_ROOT, "src")
+if SRC_DIR not in sys.path:
+    sys.path.insert(0, SRC_DIR)
+from voxcpm.model.voxcpm import VoxCPMModel
def remove_weight_norm(module: nn.Module):
    """Recursively strip weight_norm from the conv layers below ``module``.

    Only descendants are visited (``module`` itself is never unwrapped), and
    only ``nn.Conv1d`` / ``nn.ConvTranspose1d`` children are touched. Layers
    without weight_norm are left unchanged.
    """
    conv_types = (nn.Conv1d, nn.ConvTranspose1d)
    for child in module.children():
        remove_weight_norm(child)
        if not isinstance(child, conv_types):
            continue
        try:
            torch.nn.utils.remove_weight_norm(child)
        except ValueError:
            # Raised when the layer carries no weight_norm; nothing to strip.
            pass
class VAEEncodeWrapper(nn.Module):
    """Expose ``audio_vae.encode`` as a single-input module for export."""

    def __init__(self, audio_vae: nn.Module):
        super().__init__()
        self.audio_vae = audio_vae

    def forward(self, audio_wave: torch.Tensor):
        # The sample rate is read from the wrapped VAE, so the exported graph
        # takes only the waveform as input.
        vae = self.audio_vae
        return vae.encode(audio_wave, vae.sample_rate)
class VAEDecodeWrapper(nn.Module):
    """Expose ``audio_vae.decode`` as the module's forward pass for export."""

    def __init__(self, audio_vae: nn.Module):
        super().__init__()
        self.audio_vae = audio_vae

    def forward(self, latent: torch.Tensor):
        decoded = self.audio_vae.decode(latent)
        return decoded
class LocEncWrapper(nn.Module):
    """Thin pass-through around the local encoder."""

    def __init__(self, locenc: nn.Module):
        super().__init__()
        self.locenc = locenc

    def forward(self, x: torch.Tensor):
        # x: [B, T, P, D]
        encoded = self.locenc(x)
        return encoded
class LocEncLmWrapper(nn.Module):
    """Local encoder followed by the enc_to_lm projection, as one graph."""

    def __init__(self, locenc: nn.Module, proj: nn.Module):
        super().__init__()
        self.locenc = locenc
        self.proj = proj

    def forward(self, x: torch.Tensor):
        # x: [B, T, P, D]
        return self.proj(self.locenc(x))
class FSQWrapper(nn.Module):
    """Thin pass-through around the FSQ layer."""

    def __init__(self, fsq: nn.Module):
        super().__init__()
        self.fsq = fsq

    def forward(self, hidden: torch.Tensor):
        quantized = self.fsq(hidden)
        return quantized
class StopHeadWrapper(nn.Module):
    """Chain ``stop_proj`` -> ``stop_actn`` -> ``stop_head`` in one module."""

    def __init__(self, stop_proj: nn.Linear, stop_actn: nn.Module, stop_head: nn.Linear):
        super().__init__()
        self.stop_proj = stop_proj
        self.stop_actn = stop_actn
        self.stop_head = stop_head

    def forward(self, hidden: torch.Tensor):
        projected = self.stop_proj(hidden)
        activated = self.stop_actn(projected)
        return self.stop_head(activated)
class CFMWrapper(nn.Module):
    """
    Wrapper that calls the CFM module with sampling settings fixed up front.
    ``patch_size``, ``n_timesteps`` and ``cfg_value`` are stored at
    construction, so they are constants of the exported graph.
    """

    def __init__(self, cfm: nn.Module, patch_size: int, n_timesteps: int, cfg_value: float):
        super().__init__()
        self.cfm = cfm
        self.patch_size = patch_size
        self.n_timesteps = n_timesteps
        self.cfg_value = cfg_value

    def forward(self, mu: torch.Tensor, cond: torch.Tensor):
        # mu: [B, H_dit], cond: [B, D_feat, P]
        fixed_settings = {
            "n_timesteps": self.n_timesteps,
            "patch_size": self.patch_size,
            "cfg_value": self.cfg_value,
        }
        return self.cfm(mu=mu, cond=cond, **fixed_settings)
class DiTStepWrapper(nn.Module):
    """
    Wrapper for one forward call of the wrapped DiT estimator.
    The five inputs are passed through positionally as (x, mu, t, cond, dt).
    """

    def __init__(self, dit: nn.Module):
        super().__init__()
        self.dit = dit

    def forward(self, x: torch.Tensor, mu: torch.Tensor, t: torch.Tensor, cond: torch.Tensor, dt: torch.Tensor):
        step_inputs = (x, mu, t, cond, dt)
        return self.dit(*step_inputs)
def export(model: nn.Module, inputs, path: str, dynamic_axes: dict, opset: int):
    """Export ``model`` to an ONNX file at ``path``.

    Args:
        model: Module to export.
        inputs: Example input(s) handed to ``torch.onnx.export``.
        path: Destination file; its parent directory is created when needed.
        dynamic_axes: Mapping of input name to dynamic axes. Its keys, in
            order, are also used as the graph's input names, so it must list
            every input.
        opset: ONNX opset version.

    The single graph output is always named ``"output"``.
    """
    # A bare file name has an empty dirname, which os.makedirs rejects.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.onnx.export(
        model,
        inputs,
        path,
        opset_version=opset,
        dynamo=True,
        do_constant_folding=True,
        input_names=list(dynamic_axes.keys()),
        output_names=["output"],
        dynamic_axes=dynamic_axes,
    )
    print(f"Saved: {path}")
def main():
    """Export the non-LLM VoxCPM submodules to ONNX files under ``--out-dir``.

    Exported graphs, in order: audio VAE encoder and decoder, local encoder
    fused with the enc_to_lm projection, FSQ layer, stop head, the two
    *_to_dit projections and a single DiT estimator step. With
    ``--dump-embeddings`` the base LM embedding table is saved as
    ``embed_tokens.npy``. ``CFMWrapper`` is not exported here.
    """
    parser = argparse.ArgumentParser(description="Export VoxCPM submodules to ONNX (LLM excluded).")
    parser.add_argument("--model-dir", required=True, help="Path to VoxCPM model directory (config/weights).")
    parser.add_argument("--out-dir", default="onnx_exports", help="Output directory for ONNX files.")
    parser.add_argument("--opset", type=int, default=18, help="ONNX opset version.")
    parser.add_argument("--audio-samples", type=int, default=1280, help="Dummy audio length for encoder export.")
    parser.add_argument("--latent-steps", type=int, default=6, help="Dummy latent steps for decoder export.")
    parser.add_argument("--seq-len", type=int, default=4, help="Dummy sequence length for LocEnc/FSQ export.")
    parser.add_argument("--dit-step-t", type=float, default=0.5, help="Dummy diffusion time for DiT step export.")
    parser.add_argument("--force-fp32", action="store_true", help="Force submodules to float32 for ONNX export.")
    parser.add_argument("--dump-embeddings", action="store_true", help="Dump base_lm.embed_tokens weights to npy.")
    args = parser.parse_args()

    cpu = torch.device("cpu")

    def out_file(name: str) -> str:
        # Every artifact lands directly inside --out-dir.
        return os.path.join(args.out_dir, name)

    def batch_axis(*input_names: str) -> dict:
        # dynamic_axes spec for inputs whose only dynamic dimension is the batch.
        return {input_name: {0: "batch"} for input_name in input_names}

    # Load full model once, then peel submodules; keep optimize disabled.
    model = VoxCPMModel.from_local(args.model_dir, optimize=False).to(cpu).eval()
    if args.force_fp32 or model.config.dtype != "float32":
        model.config.dtype = "float32"
        model = model.to(torch.float32)
        model.audio_vae = model.audio_vae.to(torch.float32)
    remove_weight_norm(model)

    # Audio VAE encode
    export(
        VAEEncodeWrapper(model.audio_vae).to(cpu).eval(),
        torch.randn(1, 1, args.audio_samples, device=cpu),
        out_file("audio_vae_encode.onnx"),
        dynamic_axes={"audio_wave": {0: "batch", 2: "samples"}},
        opset=args.opset,
    )

    # Audio VAE decode
    export(
        VAEDecodeWrapper(model.audio_vae).to(cpu).eval(),
        torch.randn(1, model.audio_vae.latent_dim, args.latent_steps, device=cpu),
        out_file("audio_vae_decode.onnx"),
        dynamic_axes={"latent": {0: "batch", 2: "latent_steps"}},
        opset=args.opset,
    )

    # LocEnc with enc_to_lm projection fused
    export(
        LocEncLmWrapper(model.feat_encoder, model.enc_to_lm_proj).to(cpu).eval(),
        torch.randn(1, args.seq_len, model.patch_size, model.feat_dim, device=cpu),
        out_file("locenc.onnx"),
        dynamic_axes={"x": {0: "batch", 1: "seq_len"}},
        opset=args.opset,
    )

    # FSQ layer
    fsq_module = FSQWrapper(model.fsq_layer).to(cpu).eval()
    hidden_size = model.config.lm_config.hidden_size
    export(
        fsq_module,
        torch.randn(1, args.seq_len, hidden_size, device=cpu),
        out_file("fsq_layer.onnx"),
        dynamic_axes={"hidden": {0: "batch", 1: "seq_len"}},
        opset=args.opset,
    )

    # Stop head
    export(
        StopHeadWrapper(model.stop_proj, model.stop_actn, model.stop_head).to(cpu).eval(),
        torch.randn(1, hidden_size, device=cpu),
        out_file("stop_head.onnx"),
        dynamic_axes=batch_axis("hidden"),
        opset=args.opset,
    )

    # Projection layers; both are traced with the same dummy hidden state.
    projection_input = torch.randn(1, hidden_size, device=cpu)
    for projection, file_name in (
        (model.lm_to_dit_proj, "lm_to_dit_proj.onnx"),
        (model.res_to_dit_proj, "res_to_dit_proj.onnx"),
    ):
        export(
            projection,
            projection_input,
            out_file(file_name),
            dynamic_axes=batch_axis("input"),
            opset=args.opset,
        )

    # VoxCPMLocDiT single step (score function)
    dit_module = DiTStepWrapper(model.feat_decoder.estimator).to(cpu).eval()
    sample_x = torch.randn(1, model.feat_dim, model.patch_size, device=cpu)
    sample_mu = torch.randn(1, model.config.dit_config.hidden_dim, device=cpu)
    sample_t = torch.full((1,), args.dit_step_t, device=cpu)
    sample_dt = torch.full((1,), 0.0, device=cpu)
    sample_cond = torch.randn(1, model.feat_dim, model.patch_size, device=cpu)
    export(
        dit_module,
        (sample_x, sample_mu, sample_t, sample_cond, sample_dt),
        out_file("dit_step.onnx"),
        dynamic_axes=batch_axis("x", "mu", "t", "cond", "dt"),
        opset=args.opset,
    )

    if args.dump_embeddings and hasattr(model.base_lm, "embed_tokens"):
        import numpy as np

        embedding_table = model.base_lm.embed_tokens.weight.detach().cpu().numpy()
        os.makedirs(args.out_dir, exist_ok=True)
        np.save(out_file("embed_tokens.npy"), embedding_table)
        print(f"Saved: {os.path.join(args.out_dir, 'embed_tokens.npy')}")

    print("Done.")


if __name__ == "__main__":
    main()

convert/scripts/export_rkllm.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import argparse
+import os
+from pathlib import Path
+from rkllm.api import RKLLM
def export_rkllm(
    model_dir: Path,
    output_path: Path,
    target_platform: str,
    num_npu_core: int,
    optimization_level: int,
):
    """Load a Hugging Face model, build it with RKLLM and write the result.

    Args:
        model_dir: Hugging Face format model directory.
        output_path: Destination ``.rkllm`` file; its parent is created after a
            successful build.
        target_platform: Platform string passed to ``RKLLM.build``.
        num_npu_core: NPU core count passed to ``RKLLM.build``.
        optimization_level: Optimization level passed to ``RKLLM.build``.

    Raises:
        RuntimeError: The load, build or export step returned a non-zero status.
    """

    def require_ok(status: int, what: str) -> None:
        # All RKLLM calls used here signal success with a zero return value.
        if status != 0:
            raise RuntimeError(f"{what}, ret={status}")

    llm = RKLLM()

    status = llm.load_huggingface(model=str(model_dir), model_lora=None, device="cpu")
    require_ok(status, f"load_huggingface failed for {model_dir}")

    build_options = {
        "do_quantization": False,
        "optimization_level": optimization_level,
        "quantized_dtype": "w8a8",
        "quantized_algorithm": "normal",
        "target_platform": target_platform,
        "num_npu_core": num_npu_core,
        "extra_qparams": None,
    }
    status = llm.build(**build_options)
    require_ok(status, f"RKLLM build failed for {model_dir}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    status = llm.export_rkllm(str(output_path))
    require_ok(status, f"export_rkllm failed for {output_path}")
def main():
    """CLI entry point: parse the options and export one model to RKLLM.

    When ``--hf-home`` is given, ``HF_HOME``, ``HUGGINGFACE_HUB_CACHE`` and
    ``TRANSFORMERS_CACHE`` are pointed at that (resolved) directory before the
    export starts.
    """
    parser = argparse.ArgumentParser(description="Export a HuggingFace-format MiniCPM model to RKLLM.")
    parser.add_argument("--model-dir", required=True, help="Input HuggingFace model directory.")
    parser.add_argument("--output", required=True, help="Output .rkllm path.")
    parser.add_argument("--target-platform", default="rk3588", help="RK target platform.")
    parser.add_argument("--num-npu-core", type=int, default=1, help="NPU cores for RKLLM build.")
    parser.add_argument("--optimization-level", type=int, default=1, help="RKLLM optimization level.")
    parser.add_argument("--hf-home", default=None, help="Optional writable Hugging Face cache root.")
    args = parser.parse_args()

    if args.hf_home:
        cache_root = Path(args.hf_home).resolve()
        os.environ.update(
            {
                "HF_HOME": str(cache_root),
                "HUGGINGFACE_HUB_CACHE": str(cache_root / "hub"),
                "TRANSFORMERS_CACHE": str(cache_root / "transformers"),
            }
        )

    export_rkllm(
        model_dir=Path(args.model_dir),
        output_path=Path(args.output),
        target_platform=args.target_platform,
        num_npu_core=args.num_npu_core,
        optimization_level=args.optimization_level,
    )
    print(f"Saved: {args.output}")


if __name__ == "__main__":
    main()

convert/src/voxcpm/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .core import VoxCPM
+__all__ = [
+    "VoxCPM",
+]

convert/src/voxcpm/cli.py ADDED Viewed

	@@ -0,0 +1,299 @@

+#!/usr/bin/env python3
+"""
+VoxCPM Command Line Interface
+Unified CLI for voice cloning, direct TTS synthesis, and batch processing.
+Usage examples:
+    # Direct synthesis (single sample)
+    voxcpm --text "Hello world" --output output.wav
+    # Voice cloning (with reference audio and text)
+    voxcpm --text "Hello world" --prompt-audio voice.wav --prompt-text "reference text" --output output.wav --denoise
+    # Batch processing (each line in the file is one sample)
+    voxcpm --input texts.txt --output-dir ./outputs/
+"""
+import argparse
+import os
+import sys
+from pathlib import Path
+from typing import Optional, List
+import soundfile as sf
+from voxcpm.core import VoxCPM
def validate_file_exists(file_path: str, file_type: str = "file") -> Path:
    """Return ``file_path`` as a ``Path``, or raise if nothing exists there.

    Args:
        file_path: Path to check.
        file_type: Label used at the start of the error message.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
    """
    candidate = Path(file_path)
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"{file_type} '{file_path}' does not exist")
def validate_output_path(output_path: str) -> Path:
    """Return ``output_path`` as a ``Path`` after creating its parent directories."""
    target = Path(output_path)
    parent_dir = target.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    return target
def load_model(args) -> VoxCPM:
    """Load the VoxCPM model described by the parsed CLI arguments.

    A local ``--model-path`` takes precedence; otherwise the model is obtained
    through ``VoxCPM.from_pretrained``. Any loading failure is printed and the
    process exits with status 1.
    """
    print("Loading VoxCPM model...")

    # Backward compatibility: fall back to the ZIPENHANCER_MODEL_PATH
    # environment variable when no zipenhancer path is set on args.
    zipenhancer_path = getattr(args, "zipenhancer_path", None)
    if not zipenhancer_path:
        zipenhancer_path = os.environ.get("ZIPENHANCER_MODEL_PATH", None)

    # Load from local path if provided
    if getattr(args, "model_path", None):
        try:
            local_options = {
                "voxcpm_model_path": args.model_path,
                "zipenhancer_model_path": zipenhancer_path,
                "enable_denoiser": not getattr(args, "no_denoiser", False),
            }
            model = VoxCPM(**local_options)
            print("Model loaded (local).")
            return model
        except Exception as e:
            print(f"Failed to load model (local): {e}")
            sys.exit(1)

    # Otherwise, try from_pretrained (Hub); exit on failure
    try:
        hub_options = {
            "hf_model_id": getattr(args, "hf_model_id", "openbmb/VoxCPM-0.5B"),
            "load_denoiser": not getattr(args, "no_denoiser", False),
            "zipenhancer_model_id": zipenhancer_path,
            "cache_dir": getattr(args, "cache_dir", None),
            "local_files_only": getattr(args, "local_files_only", False),
        }
        model = VoxCPM.from_pretrained(**hub_options)
        print("Model loaded (from_pretrained).")
        return model
    except Exception as e:
        print(f"Failed to load model (from_pretrained): {e}")
        sys.exit(1)
def cmd_clone(args):
    """Voice cloning: synthesize ``args.text`` using a reference audio and text.

    Exits with status 1 when the text, the reference audio or the reference
    text is missing. The result is written to ``args.output`` at 16000 Hz.
    """
    sample_rate = 16000

    # Validate inputs: report the first missing one and stop.
    required_inputs = (
        (args.text, "Error: Please provide text to synthesize (--text)"),
        (args.prompt_audio, "Error: Voice cloning requires a reference audio (--prompt-audio)"),
        (args.prompt_text, "Error: Voice cloning requires a reference text (--prompt-text)"),
    )
    for value, complaint in required_inputs:
        if not value:
            print(complaint)
            sys.exit(1)

    # Validate files
    prompt_audio_path = validate_file_exists(args.prompt_audio, "reference audio file")
    output_path = validate_output_path(args.output)

    model = load_model(args)

    print(f"Synthesizing text: {args.text}")
    print(f"Reference audio: {prompt_audio_path}")
    print(f"Reference text: {args.prompt_text}")
    generation_options = {
        "text": args.text,
        "prompt_wav_path": str(prompt_audio_path),
        "prompt_text": args.prompt_text,
        "cfg_value": args.cfg_value,
        "inference_timesteps": args.inference_timesteps,
        "normalize": args.normalize,
        "denoise": args.denoise,
    }
    audio_array = model.generate(**generation_options)

    sf.write(str(output_path), audio_array, sample_rate)
    print(f"Saved audio to: {output_path}")

    # Duration is the sample count divided by the output sample rate.
    duration = len(audio_array) / sample_rate
    print(f"Duration: {duration:.2f}s")
def cmd_synthesize(args):
    """Direct TTS: synthesize ``args.text`` without any reference prompt.

    Exits with status 1 when no text is given. The result is written to
    ``args.output`` at 16000 Hz.
    """
    sample_rate = 16000

    if not args.text:
        print("Error: Please provide text to synthesize (--text)")
        sys.exit(1)

    output_path = validate_output_path(args.output)
    model = load_model(args)

    print(f"Synthesizing text: {args.text}")
    generation_options = {
        "text": args.text,
        "prompt_wav_path": None,
        "prompt_text": None,
        "cfg_value": args.cfg_value,
        "inference_timesteps": args.inference_timesteps,
        "normalize": args.normalize,
        # No denoising is needed when there is no reference audio.
        "denoise": False,
    }
    audio_array = model.generate(**generation_options)

    sf.write(str(output_path), audio_array, sample_rate)
    print(f"Saved audio to: {output_path}")

    # Duration is the sample count divided by the output sample rate.
    duration = len(audio_array) / sample_rate
    print(f"Duration: {duration:.2f}s")
def cmd_batch(args):
    """Batch synthesis: one output file per non-empty line of ``args.input``.

    Outputs are named ``output_NNN.wav`` (1-based line counter) inside
    ``args.output_dir``. A failure on one line is reported and the loop moves
    on; a summary is printed at the end.
    """
    sample_rate = 16000

    input_file = validate_file_exists(args.input, "input file")
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            texts = [entry for entry in stripped_lines if entry]
    except Exception as e:
        print(f"Failed to read input file: {e}")
        sys.exit(1)

    if not texts:
        print("Error: Input file is empty or contains no valid lines")
        sys.exit(1)
    total = len(texts)
    print(f"Found {total} lines to process")

    model = load_model(args)

    prompt_audio_path = None
    if args.prompt_audio:
        prompt_audio_path = str(validate_file_exists(args.prompt_audio, "reference audio file"))

    success_count = 0
    for index, text in enumerate(texts, 1):
        print(f"\nProcessing {index}/{total}: {text[:50]}...")
        try:
            audio_array = model.generate(
                text=text,
                prompt_wav_path=prompt_audio_path,
                prompt_text=args.prompt_text,
                cfg_value=args.cfg_value,
                inference_timesteps=args.inference_timesteps,
                normalize=args.normalize,
                # Denoising only applies when a reference audio was supplied.
                denoise=args.denoise and prompt_audio_path is not None
            )
            output_file = output_dir / f"output_{index:03d}.wav"
            sf.write(str(output_file), audio_array, sample_rate)
            duration = len(audio_array) / sample_rate
            print(f"  Saved: {output_file} ({duration:.2f}s)")
            success_count += 1
        except Exception as e:
            # Best effort: report the failure and continue with the next line.
            print(f"  Failed: {e}")
            continue

    print(f"\nBatch finished: {success_count}/{total} succeeded")
+def _build_unified_parser():
+    """Build unified argument parser (no subcommands, route by args)."""
+    parser = argparse.ArgumentParser(
+        description="VoxCPM CLI (single parser) - voice cloning, direct TTS, and batch processing",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Direct synthesis (single sample)
+  voxcpm --text "Hello world" --output out.wav
+  # Voice cloning (reference audio + text)
+  voxcpm --text "Hello world" --prompt-audio voice.wav --prompt-text "reference text" --output out.wav --denoise
+  # Batch processing
+  voxcpm --input texts.txt --output-dir ./outs
+  # Select model (from Hub)
+  voxcpm --text "Hello" --output out.wav --hf-model-id openbmb/VoxCPM-0.5B
+        """
+    )
+    # Task selection (automatic routing by presence of args)
+    parser.add_argument("--input", "-i", help="Input text file (one line per sample)")
+    parser.add_argument("--output-dir", "-od", help="Output directory (for batch mode)")
+    parser.add_argument("--text", "-t", help="Text to synthesize (single-sample mode)")
+    parser.add_argument("--output", "-o", help="Output audio file path (single-sample mode)")
+    # Prompt audio (for voice cloning)
+    parser.add_argument("--prompt-audio", "-pa", help="Reference audio file path")
+    parser.add_argument("--prompt-text", "-pt", help="Reference text corresponding to the audio")
+    parser.add_argument("--prompt-file", "-pf", help="Reference text file corresponding to the audio")
+    parser.add_argument("--denoise", action="store_true", help="Enable prompt speech enhancement (denoising)")
+    # Generation parameters
+    parser.add_argument("--cfg-value", type=float, default=2.0, help="CFG guidance scale (default: 2.0)")
+    parser.add_argument("--inference-timesteps", type=int, default=10, help="Inference steps (default: 10)")
+    parser.add_argument("--normalize", action="store_true", help="Enable text normalization")
+    # Model loading parameters
+    parser.add_argument("--model-path", type=str, help="Local VoxCPM model path (overrides Hub download)")
+    parser.add_argument("--hf-model-id", type=str, default="openbmb/VoxCPM-0.5B", help="Hugging Face repo id (e.g., openbmb/VoxCPM-0.5B)")
+    parser.add_argument("--cache-dir", type=str, help="Cache directory for Hub downloads")
+    parser.add_argument("--local-files-only", action="store_true", help="Use only local files (no network)")
+    parser.add_argument("--no-denoiser", action="store_true", help="Disable denoiser model loading")
+    parser.add_argument("--zipenhancer-path", type=str, default="iic/speech_zipenhancer_ans_multiloss_16k_base", help="ZipEnhancer model id or local path (default reads from env)")
+    return parser
+def main():
+    """Unified CLI entrypoint: route by provided arguments."""
+    parser = _build_unified_parser()
+    args = parser.parse_args()
+    # Routing: prefer batch → single (clone/direct)
+    if args.input:
+        if not args.output_dir:
+            print("Error: Batch mode requires --output-dir")
+            parser.print_help()
+            sys.exit(1)
+        return cmd_batch(args)
+    # Single-sample mode
+    if not args.text or not args.output:
+        print("Error: Single-sample mode requires --text and --output")
+        parser.print_help()
+        sys.exit(1)
+    # If prompt audio+text provided → voice cloning
+    if args.prompt_audio or args.prompt_text:
+        if not args.prompt_text and args.prompt_file:
+            assert os.path.isfile(args.prompt_file), "Prompt file does not exist or is not accessible."
+            with open(args.prompt_file, 'r', encoding='utf-8') as f:
+                args.prompt_text = f.read()
+        if not args.prompt_audio or not args.prompt_text:
+            print("Error: Voice cloning requires both --prompt-audio and --prompt-text")
+            sys.exit(1)
+        return cmd_clone(args)
+    # Otherwise → direct synthesis
+    return cmd_synthesize(args)
+# Run the CLI only when this module is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

convert/src/voxcpm/core.py ADDED Viewed

	@@ -0,0 +1,195 @@

+import os
+import re
+import tempfile
+import numpy as np
+from typing import Generator
+from huggingface_hub import snapshot_download
+from .model.voxcpm import VoxCPMModel
+class VoxCPM:
+    def __init__(self,
+            voxcpm_model_path : str,
+            zipenhancer_model_path : str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
+            enable_denoiser : bool = True,
+            optimize: bool = True,
+        ):
+        """Initialize VoxCPM TTS pipeline.
+        Args:
+            voxcpm_model_path: Local filesystem path to the VoxCPM model assets
+                (weights, configs, etc.). Typically the directory returned by
+                a prior download step.
+            zipenhancer_model_path: ModelScope acoustic noise suppression model
+                id or local path. If None, denoiser will not be initialized.
+            enable_denoiser: Whether to initialize the denoiser pipeline.
+            optimize: Whether to optimize the model with torch.compile. True by default, but can be disabled for debugging.
+        """
+        print(f"voxcpm_model_path: {voxcpm_model_path}, zipenhancer_model_path: {zipenhancer_model_path}, enable_denoiser: {enable_denoiser}")
+        self.tts_model = VoxCPMModel.from_local(voxcpm_model_path, optimize=optimize)
+        self.text_normalizer = None
+        if enable_denoiser and zipenhancer_model_path is not None:
+            from .zipenhancer import ZipEnhancer
+            self.denoiser = ZipEnhancer(zipenhancer_model_path)
+        else:
+            self.denoiser = None
+        print("Warm up VoxCPMModel...")
+        self.tts_model.generate(
+            target_text="Hello, this is the first test sentence.",
+            max_len=10,
+        )
+    @classmethod
+    def from_pretrained(cls,
+            hf_model_id: str = "openbmb/VoxCPM-0.5B",
+            load_denoiser: bool = True,
+            zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
+            cache_dir: str = None,
+            local_files_only: bool = False,
+            **kwargs,
+        ):
+        """Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.
+        Args:
+            hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path.
+            load_denoiser: Whether to initialize the denoiser pipeline.
+            zipenhancer_model_id: Denoiser model id or path for ModelScope
+                acoustic noise suppression.
+            cache_dir: Custom cache directory for the snapshot.
+            local_files_only: If True, only use local files and do not attempt
+                to download.
+        Kwargs:
+            Additional keyword arguments passed to the ``VoxCPM`` constructor.
+        Returns:
+            VoxCPM: Initialized instance whose ``voxcpm_model_path`` points to
+            the downloaded snapshot directory.
+        Raises:
+            ValueError: If neither a valid ``hf_model_id`` nor a resolvable
+                ``hf_model_id`` is provided.
+        """
+        repo_id = hf_model_id
+        if not repo_id:
+            raise ValueError("You must provide hf_model_id")
+        # Load from local path if provided
+        if os.path.isdir(repo_id):
+            local_path = repo_id
+        else:
+            # Otherwise, try from_pretrained (Hub); exit on failure
+            local_path = snapshot_download(
+                repo_id=repo_id,
+                cache_dir=cache_dir,
+                local_files_only=local_files_only,
+            )
+        return cls(
+            voxcpm_model_path=local_path,
+            zipenhancer_model_path=zipenhancer_model_id if load_denoiser else None,
+            enable_denoiser=load_denoiser,
+            **kwargs,
+        )
+    def generate(self, *args, **kwargs) -> np.ndarray:
+        return next(self._generate(*args, streaming=False, **kwargs))
+    def generate_streaming(self, *args, **kwargs) -> Generator[np.ndarray, None, None]:
+        return self._generate(*args, streaming=True, **kwargs)
+    def _generate(self,
+            text : str,
+            prompt_wav_path : str = None,
+            prompt_text : str = None,
+            cfg_value : float = 2.0,
+            inference_timesteps : int = 10,
+            max_length : int = 4096,
+            normalize : bool = True,
+            denoise : bool = True,
+            retry_badcase : bool = True,
+            retry_badcase_max_times : int = 3,
+            retry_badcase_ratio_threshold : float = 6.0,
+            streaming: bool = False,
+        ) -> Generator[np.ndarray, None, None]:
+        """Synthesize speech for the given text and return a single waveform.
+        This method optionally builds and reuses a prompt cache. If an external
+        prompt (``prompt_wav_path`` + ``prompt_text``) is provided, it will be
+        used for all sub-sentences. Otherwise, the prompt cache is built from
+        the first generated result and reused for the remaining text chunks.
+        Args:
+            text: Input text. Can include newlines; each non-empty line is
+                treated as a sub-sentence.
+            prompt_wav_path: Path to a reference audio file for prompting.
+            prompt_text: Text content corresponding to the prompt audio.
+            cfg_value: Guidance scale for the generation model.
+            inference_timesteps: Number of inference steps.
+            max_length: Maximum token length during generation.
+            normalize: Whether to run text normalization before generation.
+            denoise: Whether to denoise the prompt audio if a denoiser is
+                available.
+            retry_badcase: Whether to retry badcase.
+            retry_badcase_max_times: Maximum number of times to retry badcase.
+            retry_badcase_ratio_threshold: Threshold for audio-to-text ratio.
+            streaming: Whether to return a generator of audio chunks.
+        Returns:
+            Generator of numpy.ndarray: 1D waveform array (float32) on CPU.
+            Yields audio chunks for each generations step if ``streaming=True``,
+            otherwise yields a single array containing the final audio.
+        """
+        if not text.strip() or not isinstance(text, str):
+            raise ValueError("target text must be a non-empty string")
+        if prompt_wav_path is not None:
+            if not os.path.exists(prompt_wav_path):
+                raise FileNotFoundError(f"prompt_wav_path does not exist: {prompt_wav_path}")
+        if (prompt_wav_path is None) != (prompt_text is None):
+            raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None")
+        text = text.replace("\n", " ")
+        text = re.sub(r'\s+', ' ', text)
+        temp_prompt_wav_path = None
+        try:
+            if prompt_wav_path is not None and prompt_text is not None:
+                if denoise and self.denoiser is not None:
+                    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
+                        temp_prompt_wav_path = tmp_file.name
+                    self.denoiser.enhance(prompt_wav_path, output_path=temp_prompt_wav_path)
+                    prompt_wav_path = temp_prompt_wav_path
+                fixed_prompt_cache = self.tts_model.build_prompt_cache(
+                    prompt_wav_path=prompt_wav_path,
+                    prompt_text=prompt_text
+                )
+            else:
+                fixed_prompt_cache = None  # will be built from the first inference
+            if normalize:
+                if self.text_normalizer is None:
+                    from .utils.text_normalize import TextNormalizer
+                    self.text_normalizer = TextNormalizer()
+                text = self.text_normalizer.normalize(text)
+            generate_result = self.tts_model._generate_with_prompt_cache(
+                            target_text=text,
+                            prompt_cache=fixed_prompt_cache,
+                            min_len=2,
+                            max_len=max_length,
+                            inference_timesteps=inference_timesteps,
+                            cfg_value=cfg_value,
+                            retry_badcase=retry_badcase,
+                            retry_badcase_max_times=retry_badcase_max_times,
+                            retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
+                            streaming=streaming,
+                        )
+            for wav, _, _ in generate_result:
+                yield wav.squeeze(0).cpu().numpy()
+        finally:
+            if temp_prompt_wav_path and os.path.exists(temp_prompt_wav_path):
+                try:
+                    os.unlink(temp_prompt_wav_path)
+                except OSError:
+                    pass

convert/src/voxcpm/model/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .voxcpm import VoxCPMModel
2	+
3	+ __all__ = ["VoxCPMModel"]

convert/src/voxcpm/model/utils.py ADDED Viewed

	@@ -0,0 +1,122 @@

+from typing import List
+import torch
+from transformers import PreTrainedTokenizer
+def mask_multichar_chinese_tokens(tokenizer: PreTrainedTokenizer):
+    """Create a tokenizer wrapper that converts multi-character Chinese tokens to single characters.
+    This function creates a wrapper around the provided tokenizer that automatically
+    splits multi-character Chinese tokens into individual characters. This is useful
+    for ensuring consistent tokenization of Chinese text.
+    Args:
+        tokenizer: The base tokenizer to wrap
+    Returns:
+        A CharTokenizerWrapper instance that handles multi-character Chinese tokens
+    Example:
+        >>> from transformers import LlamaTokenizerFast
+        >>> tokenizer = LlamaTokenizerFast.from_pretrained("path/to/tokenizer")
+        >>> wrapped_tokenizer = mask_multichar_chinese_tokens(tokenizer)
+        >>> tokens = wrapped_tokenizer("你好世界")
+    """
+    # Pre-compute multi-character tokens (length >= 2, pure Chinese characters)
+    multichar_tokens = {
+        token for token in tokenizer.vocab.keys()
+        if len(token) >= 2 and all("\u4e00" <= c <= "\u9fff" for c in token)
+    }
+    class CharTokenizerWrapper:
+        """Wrapper class for tokenizers that handles multi-character Chinese tokens.
+        This wrapper automatically splits multi-character Chinese tokens into
+        individual characters while preserving the original tokenizer's interface.
+        """
+        def __init__(self, base_tokenizer: PreTrainedTokenizer) -> None:
+            """Initialize the wrapper with a base tokenizer.
+            Args:
+                base_tokenizer: The tokenizer to wrap
+            """
+            self.tokenizer = base_tokenizer
+            self.multichar_tokens = multichar_tokens
+        def tokenize(self, text: str, **kwargs) -> List[str]:
+            """Tokenize text and split multi-character Chinese tokens into single characters.
+            Args:
+                text: Input text to tokenize
+                **kwargs: Additional arguments passed to the base tokenizer
+            Returns:
+                List of processed tokens with multi-character Chinese tokens split
+            Example:
+                >>> wrapper = CharTokenizerWrapper(tokenizer)
+                >>> tokens = wrapper.tokenize("你好世界")
+                >>> # Returns ["你", "好", "世", "界"] instead of ["你好", "世界"]
+            """
+            if not isinstance(text, str):
+                raise TypeError(f"Expected string input, got {type(text)}")
+            tokens = self.tokenizer.tokenize(text, **kwargs)
+            processed = []
+            for token in tokens:
+                # Remove possible subword prefix
+                clean_token = token.replace("▁", "")
+                if clean_token in self.multichar_tokens:
+                    # Split multi-character token into single characters
+                    chars = list(clean_token)
+                    processed.extend(chars)
+                else:
+                    processed.append(token)
+            return processed
+        def __call__(self, text: str, **kwargs) -> List[int]:
+            """Call the tokenizer and return token IDs.
+            This method provides the same interface as the original tokenizer
+            but with multi-character Chinese token handling.
+            Args:
+                text: Input text to tokenize
+                **kwargs: Additional arguments passed to the base tokenizer
+            Returns:
+                List of token IDs
+            Raises:
+                TypeError: If input is not a string
+                ValueError: If tokenization fails
+            """
+            try:
+                tokens = self.tokenize(text, **kwargs)
+                result = self.tokenizer.convert_tokens_to_ids(tokens)
+                return result
+            except Exception as e:
+                raise ValueError(f"Tokenization failed: {str(e)}") from e
+    return CharTokenizerWrapper(tokenizer)
+def get_dtype(dtype: str):
+    if dtype == "bfloat16":
+        return torch.bfloat16
+    elif dtype == "bf16":
+        return torch.bfloat16
+    elif dtype == "float16":
+        return torch.float16
+    elif dtype == "fp16":
+        return torch.float16
+    elif dtype == "float32":
+        return torch.float32
+    elif dtype == "fp32":
+        return torch.float32
+    else:
+        raise ValueError(f"Unsupported dtype: {dtype}")

convert/src/voxcpm/model/voxcpm.py ADDED Viewed

	@@ -0,0 +1,690 @@

+"""
+VoxCPM: A Tokenizer-free speech generation model
+This module contains the main VoxCPM model implementation, including configuration classes
+and the core VoxCPMModel for text-to-speech generation.
+Copyright 2025 OpenBMB
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+from typing import Tuple, Union, Generator, List
+import torch
+import torch.nn as nn
+import torchaudio
+import warnings
+from einops import rearrange
+from pydantic import BaseModel
+from tqdm import tqdm
+from transformers import LlamaTokenizerFast
+from ..modules.audiovae import AudioVAE
+from ..modules.layers import ScalarQuantizationLayer
+from ..modules.locdit import CfmConfig, UnifiedCFM, VoxCPMLocDiT
+from ..modules.locenc import VoxCPMLocEnc
+from ..modules.minicpm4 import MiniCPM4Config, MiniCPMModel
+from .utils import get_dtype, mask_multichar_chinese_tokens
+class VoxCPMEncoderConfig(BaseModel):
+    hidden_dim: int = 1024
+    ffn_dim: int = 4096
+    num_heads: int = 16
+    num_layers: int = 4
+    kv_channels: int = None
+class VoxCPMDitConfig(BaseModel):
+    hidden_dim: int = 1024
+    ffn_dim: int = 4096
+    num_heads: int = 16
+    num_layers: int = 4
+    kv_channels: int = None
+    cfm_config: CfmConfig
+class VoxCPMConfig(BaseModel):
+    """Top-level configuration consumed by ``VoxCPMModel.__init__``."""
+    # Config of the base LM; copies of it are adapted for the residual LM,
+    # the local encoder and the local DiT.
+    lm_config: MiniCPM4Config
+    # Second dimension of the per-position audio feature tensors.
+    patch_size: int = 2
+    # Passed as ``input_dim`` of the encoder and ``in_channels`` of the decoder.
+    feat_dim: int = 64
+    # ``num_hidden_layers`` of the residual LM.
+    residual_lm_num_layers: int = 6
+    # Third and fourth arguments of ``ScalarQuantizationLayer``.
+    scalar_quantization_latent_dim: int = 256
+    scalar_quantization_scale: int = 9
+    encoder_config: VoxCPMEncoderConfig
+    dit_config: VoxCPMDitConfig
+    # Length passed to ``setup_cache`` of both language models.
+    max_length: int = 4096
+    # Requested device; the model replaces it with "mps" or "cpu" when CUDA is unavailable.
+    device: str = "cuda"
+    # Dtype name resolved through ``get_dtype``.
+    dtype: str = "bfloat16"
+class VoxCPMModel(nn.Module):
+    def __init__(
+        self,
+        config: VoxCPMConfig,
+        tokenizer: LlamaTokenizerFast,
+        audio_vae: AudioVAE,
+    ):
+        """Build all sub-modules of the VoxCPM model.
+        Args:
+            config: Model configuration (sizes, device, dtype).
+            tokenizer: Base text tokenizer; it is wrapped by
+                ``mask_multichar_chinese_tokens`` before use.
+            audio_vae: Audio VAE instance; its ``chunk_size`` and
+                ``sample_rate`` are mirrored on this module.
+        """
+        super().__init__()
+        self.config = config
+        self.feat_dim = config.feat_dim
+        self.patch_size = config.patch_size
+        self.device = config.device
+        # Without CUDA the configured device is replaced: MPS if available, else CPU.
+        if not torch.cuda.is_available():
+            if torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
+        print(f"Running on device: {self.device}, dtype: {self.config.dtype}")
+        # Text-Semantic LM
+        self.base_lm = MiniCPMModel(config.lm_config)
+        self.base_lm.setup_cache(1, config.max_length, self.device, get_dtype(self.config.dtype))
+        self.text_tokenizer = mask_multichar_chinese_tokens(tokenizer)
+        # Token ids appended around audio segments; presumably fixed by the
+        # tokenizer vocabulary — TODO confirm.
+        self.audio_start_token = 101
+        self.audio_end_token = 102
+        # Residual Acoustic LM
+        # Same config as the base LM but with its own layer count and vocab_size 0.
+        residual_lm_config = config.lm_config.model_copy(deep=True)
+        residual_lm_config.num_hidden_layers = config.residual_lm_num_layers
+        residual_lm_config.vocab_size = 0
+        self.residual_lm = MiniCPMModel(residual_lm_config)
+        self.residual_lm.setup_cache(1, config.max_length, self.device, get_dtype(self.config.dtype))
+        # Local Encoder
+        encoder_config = config.lm_config.model_copy(deep=True)
+        encoder_config.hidden_size = config.encoder_config.hidden_dim
+        encoder_config.intermediate_size = config.encoder_config.ffn_dim
+        encoder_config.num_attention_heads = config.encoder_config.num_heads
+        encoder_config.num_hidden_layers = config.encoder_config.num_layers
+        encoder_config.kv_channels = config.encoder_config.kv_channels
+        encoder_config.vocab_size = 0
+        self.feat_encoder = VoxCPMLocEnc(encoder_config, input_dim=config.feat_dim)
+        # Local DiT
+        decoder_config = config.lm_config.model_copy(deep=True)
+        decoder_config.hidden_size = config.dit_config.hidden_dim
+        decoder_config.intermediate_size = config.dit_config.ffn_dim
+        decoder_config.num_attention_heads = config.dit_config.num_heads
+        decoder_config.num_hidden_layers = config.dit_config.num_layers
+        decoder_config.kv_channels = config.dit_config.kv_channels
+        decoder_config.vocab_size = 0
+        self.feat_decoder = UnifiedCFM(
+            in_channels=config.feat_dim,
+            cfm_params=config.dit_config.cfm_config,
+            estimator=VoxCPMLocDiT(decoder_config, in_channels=config.feat_dim),
+        )
+        # Projection layers
+        self.fsq_layer = ScalarQuantizationLayer(
+            config.lm_config.hidden_size,
+            config.lm_config.hidden_size,
+            config.scalar_quantization_latent_dim,
+            config.scalar_quantization_scale
+        )
+        self.enc_to_lm_proj = nn.Linear(config.encoder_config.hidden_dim, config.lm_config.hidden_size)
+        self.lm_to_dit_proj = nn.Linear(config.lm_config.hidden_size, config.dit_config.hidden_dim)
+        self.res_to_dit_proj = nn.Linear(config.lm_config.hidden_size, config.dit_config.hidden_dim)
+        # Stop Predictor
+        # Linear -> SiLU -> bias-free linear head with 2 outputs.
+        self.stop_proj = nn.Linear(config.lm_config.hidden_size, config.lm_config.hidden_size)
+        self.stop_actn = nn.SiLU()
+        self.stop_head = nn.Linear(config.lm_config.hidden_size, 2, bias=False)
+        # Audio VAE
+        self.audio_vae = audio_vae
+        self.chunk_size = audio_vae.chunk_size
+        self.sample_rate = audio_vae.sample_rate
+    def optimize(self, disable: bool = False):
+        try:
+            if disable:
+                raise ValueError("Optimization disabled by user")
+            if self.device != "cuda":
+                raise ValueError("VoxCPMModel can only be optimized on CUDA device")
+            try:
+                import triton
+            except:
+                raise ValueError("triton is not installed")
+            self.base_lm.forward_step = torch.compile(self.base_lm.forward_step, mode="reduce-overhead", fullgraph=True)
+            self.residual_lm.forward_step = torch.compile(self.residual_lm.forward_step, mode="reduce-overhead", fullgraph=True)
+            self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True)
+            self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True)
+        except Exception as e:
+            print(f"Error: {e}")
+            print("Warning: VoxCPMModel can not be optimized by torch.compile, using original forward_step functions")
+            self.base_lm.forward_step = self.base_lm.forward_step
+            self.residual_lm.forward_step = self.residual_lm.forward_step
+            self.feat_encoder_step = self.feat_encoder
+            self.feat_decoder.estimator = self.feat_decoder.estimator
+        return self
+    def generate(self, *args, **kwargs) -> torch.Tensor:
+        return next(self._generate(*args, streaming=False, **kwargs))
+    def generate_streaming(self, *args, **kwargs) -> Generator[torch.Tensor, None, None]:
+        return self._generate(*args, streaming=True, **kwargs)
+    @torch.inference_mode()
+    def _generate(
+        self,
+        target_text: str,
+        prompt_text: str = "",
+        prompt_wav_path: str = "",
+        min_len: int = 2,
+        max_len: int = 2000,
+        inference_timesteps: int = 10,
+        cfg_value: float = 2.0,
+        retry_badcase: bool = False,
+        retry_badcase_max_times: int = 3,
+        retry_badcase_ratio_threshold: float = 6.0, # setting acceptable ratio of audio length to text length (for badcase detection)
+        streaming: bool = False,
+    ) -> Generator[torch.Tensor, None, None]:
+        """Generate audio for ``target_text``, optionally conditioned on a prompt.
+        Args:
+            target_text: Text to synthesize.
+            prompt_text: Transcript of the prompt audio; prepended to
+                ``target_text`` when a prompt wav is given.
+            prompt_wav_path: Path of the prompt audio; an empty string
+                selects the prompt-free branch.
+            min_len: Forwarded to ``_inference``.
+            max_len: Forwarded to ``_inference`` unless ``retry_badcase`` is
+                set, in which case a limit derived from the text length is used.
+            inference_timesteps: Forwarded to ``_inference``.
+            cfg_value: Forwarded to ``_inference``.
+            retry_badcase: Re-run inference when the output is too long
+                relative to the text. Forced to False when streaming.
+            retry_badcase_max_times: Upper bound on inference attempts.
+            retry_badcase_ratio_threshold: Audio-to-text length ratio at or
+                above which a result counts as a bad case.
+            streaming: Yield one decoded chunk per inference step instead of
+                a single final waveform.
+        Yields:
+            Decoded audio tensors moved to CPU: one per step when streaming,
+            otherwise exactly one for the whole utterance.
+        """
+        if retry_badcase and streaming:
+            warnings.warn("Retry on bad cases is not supported in streaming mode, setting retry_badcase=False.")
+            retry_badcase = False
+        if len(prompt_wav_path) == 0:
+            # No prompt: the sequence is the target text followed by the audio start token.
+            text = target_text
+            text_token = torch.LongTensor(self.text_tokenizer(text))
+            text_token = torch.cat(
+                [
+                    text_token,
+                    torch.tensor(
+                        [self.audio_start_token],
+                        dtype=torch.int32,
+                        device=text_token.device,
+                    ),
+                ],
+                dim=-1,
+            )
+            text_length = text_token.shape[0]
+            # All-zero audio features for the text positions.
+            audio_feat = torch.zeros(
+                (text_length, self.patch_size, self.audio_vae.latent_dim),
+                dtype=torch.float32,
+                device=text_token.device,
+            )
+            text_mask = torch.ones(text_length).type(torch.int32).to(text_token.device)
+            audio_mask = torch.zeros(text_length).type(torch.int32).to(text_token.device)
+        else:
+            # Prompted: text is prompt transcript + target, followed by the prompt audio features.
+            text = prompt_text + target_text
+            text_token = torch.LongTensor(self.text_tokenizer(text))
+            text_token = torch.cat(
+                [
+                    text_token,
+                    torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device),
+                ],
+                dim=-1,
+            )
+            text_length = text_token.shape[0]
+            audio, sr = torchaudio.load(prompt_wav_path)
+            # Average all channels into one.
+            if audio.size(0) > 1:
+                audio = audio.mean(dim=0, keepdim=True)
+            if sr != self.sample_rate:
+                audio = torchaudio.functional.resample(audio, sr, self.sample_rate)
+            # Right-pad so the sample count is a multiple of patch_size * chunk_size.
+            patch_len = self.patch_size * self.chunk_size
+            if audio.size(1) % patch_len != 0:
+                audio = torch.nn.functional.pad(audio, (0, patch_len - audio.size(1) % patch_len))
+            # (B, D, T)
+            audio_feat = self.audio_vae.encode(audio.to(self.device), self.sample_rate).cpu()
+            # Viewed as (latent_dim, -1, patch_size), then permuted to (-1, patch_size, latent_dim).
+            audio_feat = audio_feat.view(
+                self.audio_vae.latent_dim,
+                -1,
+                self.patch_size,
+            ).permute(1, 2, 0)
+            audio_feat = audio_feat[:-1, ...] # trick: remove the last padding token
+            audio_length = audio_feat.size(0)
+            # Zero token ids for the audio positions, zero features for the text positions.
+            text_pad_token = torch.zeros(audio_length, dtype=torch.int32, device=text_token.device)
+            text_token = torch.cat([text_token, text_pad_token])
+            audio_pad_feat = torch.zeros(
+                (text_length, self.patch_size, self.audio_vae.latent_dim),
+                dtype=torch.float32,
+                device=text_token.device,
+            )
+            audio_feat = torch.cat([audio_pad_feat, audio_feat], dim=0)
+            text_mask = (
+                torch.cat([torch.ones(text_length), torch.zeros(audio_length)]).type(torch.int32).to(text_token.device)
+            )
+            audio_mask = (
+                torch.cat([torch.zeros(text_length), torch.ones(audio_length)]).type(torch.int32).to(text_token.device)
+            )
+        # Add a leading dimension of size 1 and move everything to the model device.
+        text_token = text_token.unsqueeze(0).to(self.device)
+        text_mask = text_mask.unsqueeze(0).to(self.device)
+        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
+        audio_mask = audio_mask.unsqueeze(0).to(self.device)
+        target_text_length = len(self.text_tokenizer(target_text))
+        retry_badcase_times = 0
+        # NOTE(review): if retry_badcase_max_times <= 0 this loop never runs and
+        # ``latent_pred`` is unbound in the non-streaming decode below.
+        while retry_badcase_times < retry_badcase_max_times:
+            inference_result = self._inference(
+                text_token,
+                text_mask,
+                audio_feat,
+                audio_mask,
+                min_len=min_len,
+                # With retries enabled, the limit is derived from the target text length.
+                max_len=int(target_text_length * retry_badcase_ratio_threshold + 10) if retry_badcase else max_len,
+                inference_timesteps=inference_timesteps,
+                cfg_value=cfg_value,
+                streaming=streaming,
+            )
+            if streaming:
+                patch_len = self.patch_size * self.chunk_size
+                for latent_pred, _ in inference_result:
+                    decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
+                    # Keep only the last patch_len samples of each decode.
+                    decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
+                    yield decode_audio
+                break
+            else:
+                latent_pred, pred_audio_feat = next(inference_result)
+                if retry_badcase:
+                    # Bad case: predicted feature length reached the ratio threshold.
+                    if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
+                        print(f"  Badcase detected, audio_text_ratio={pred_audio_feat.shape[0] / target_text_length}, retrying...")
+                        retry_badcase_times += 1
+                        continue
+                    else:
+                        break
+                else:
+                    break
+        # After exhausted retries the result of the last attempt is decoded.
+        if not streaming:
+            decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
+            decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio
+            yield decode_audio
+    @torch.inference_mode()
+    def build_prompt_cache(
+        self,
+        prompt_text: str,
+        prompt_wav_path: str,
+    ):
+        """
+        Build prompt cache for subsequent fast generation.
+        Args:
+            prompt_text: prompt text (required)
+            prompt_wav_path: prompt audio path (required)
+        Returns:
+            prompt_cache: dict with keys "text_token" (tokenized prompt text)
+                and "audio_feat" (encoded prompt audio, on CPU)
+        Raises:
+            ValueError: if either argument is empty or None
+        """
+        if not prompt_text or not prompt_wav_path:
+            raise ValueError("prompt_text and prompt_wav_path are required")
+        # build text tokens
+        text_token = torch.LongTensor(self.text_tokenizer(prompt_text))
+        # load audio
+        audio, sr = torchaudio.load(prompt_wav_path)
+        # average all channels into one
+        if audio.size(0) > 1:
+            audio = audio.mean(dim=0, keepdim=True)
+        if sr != self.sample_rate:
+            audio = torchaudio.functional.resample(audio, sr, self.sample_rate)
+        # right-pad so the sample count is a multiple of patch_size * chunk_size
+        patch_len = self.patch_size * self.chunk_size
+        if audio.size(1) % patch_len != 0:
+            audio = torch.nn.functional.pad(audio, (0, patch_len - audio.size(1) % patch_len))
+        # extract audio features
+        audio_feat = self.audio_vae.encode(audio.to(self.device), self.sample_rate).cpu()
+        audio_feat = audio_feat.view(
+            self.audio_vae.latent_dim,
+            -1,
+            self.patch_size,
+        ).permute(1, 2, 0) # (latent_dim, T, patch_size) -> (T, patch_size, latent_dim)
+        audio_feat = audio_feat[:-1, ...] # trick: remove the last padding token
+        # build prompt cache
+        prompt_cache = {
+            "text_token": text_token,
+            "audio_feat": audio_feat,
+        }
+        return prompt_cache
+    def merge_prompt_cache(
+        self,
+        original_cache: dict,
+        new_text_token: torch.Tensor,
+        new_audio_feat: torch.Tensor,
+    ):
+        """
+        Merge original prompt cache with newly generated content to stabilize voice.
+        Args:
+            original_cache: original prompt cache
+            new_text_token: newly generated text tokens
+            new_audio_feat: newly generated audio features
+        Returns:
+            merged_cache: merged cache
+        """
+        if original_cache is None:
+            return {
+                "text_token": new_text_token,
+                "audio_feat": new_audio_feat,
+            }
+        original_text_token = original_cache["text_token"]
+        original_audio_feat = original_cache["audio_feat"]
+        merged_text_token = torch.cat([original_text_token, new_text_token], dim=0)
+        merged_audio_feat = torch.cat([original_audio_feat, new_audio_feat], dim=0)
+        # build new cache
+        merged_cache = {
+            "text_token": merged_text_token,
+            "audio_feat": merged_audio_feat,
+        }
+        return merged_cache
+    def generate_with_prompt_cache(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        return next(self._generate_with_prompt_cache(*args, streaming=False, **kwargs))
+    def generate_with_prompt_cache_streaming(
+        self, *args, **kwargs
+    ) -> Generator[Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]], None, None]:
+        return self._generate_with_prompt_cache(*args, streaming=True, **kwargs)
+    @torch.inference_mode()
+    def _generate_with_prompt_cache(
+        self,
+        target_text: str,
+        prompt_cache: dict,
+        min_len: int = 2,
+        max_len: int = 2000,
+        inference_timesteps: int = 10,
+        cfg_value: float = 2.0,
+        retry_badcase: bool = False,
+        retry_badcase_max_times: int = 3,
+        retry_badcase_ratio_threshold: float = 6.0,
+        streaming: bool = False,
+    ) -> Generator[Tuple[torch.Tensor, torch.Tensor, Union[torch.Tensor, List[torch.Tensor]]], None, None]:
+        """
+        Generate audio using pre-built prompt cache.
+        Args:
+            target_text: Text to convert to speech
+            prompt_cache: Cache built by build_prompt_cache (can be None)
+            min_len: Minimum audio length to avoid very short audio
+            max_len: Maximum audio length
+            inference_timesteps: Number of diffusion sampling steps
+            cfg_value: Classifier-free guidance value
+            retry_badcase: Whether to retry on bad cases
+            retry_badcase_max_times: Maximum retry attempts
+            retry_badcase_ratio_threshold: Threshold for audio-to-text ratio
+            streaming: Whether to return a generator of audio chunks
+        Returns:
+            Generator of Tuple containing:
+                - Decoded audio tensor for the current step if ``streaming=True``, else final decoded audio tensor
+                - Tensor of new text tokens
+                - New audio features up to the current step as a List if ``streaming=True``, else as a concatenated Tensor
+        """
+        if retry_badcase and streaming:
+            warnings.warn("Retry on bad cases is not supported in streaming mode, setting retry_badcase=False.")
+            retry_badcase = False
+        # get prompt from cache
+        if prompt_cache is None:
+            prompt_text_token = torch.empty(0, dtype=torch.int32)
+            prompt_audio_feat = torch.empty((0, self.patch_size, self.audio_vae.latent_dim), dtype=torch.float32)
+        else:
+            prompt_text_token = prompt_cache["text_token"]
+            prompt_audio_feat = prompt_cache["audio_feat"]
+        # build target text tokens
+        target_text_token = torch.LongTensor(self.text_tokenizer(target_text))
+        text_token = torch.cat([prompt_text_token, target_text_token], dim=0)
+        text_token = torch.cat(
+            [
+                text_token,
+                torch.tensor(
+                    [self.audio_start_token],
+                    dtype=torch.int32,
+                    device=text_token.device,
+                ),
+            ],
+            dim=-1,
+        )
+        audio_length = prompt_audio_feat.size(0)
+        text_length = text_token.shape[0]
+        text_pad_token = torch.zeros(audio_length, dtype=torch.int32, device=text_token.device)
+        audio_pad_feat = torch.zeros(
+            (text_token.shape[0], self.patch_size, self.audio_vae.latent_dim),
+            dtype=torch.float32,
+            device=text_token.device,
+        )
+        text_token = torch.cat([text_token, text_pad_token])
+        audio_feat = torch.cat([audio_pad_feat, prompt_audio_feat], dim=0)
+        text_mask = torch.cat([torch.ones(text_length), torch.zeros(audio_length)]).type(torch.int32).to(text_token.device)
+        audio_mask = torch.cat([torch.zeros(text_length), torch.ones(audio_length)]).type(torch.int32).to(text_token.device)
+        text_token = text_token.unsqueeze(0).to(self.device)
+        text_mask = text_mask.unsqueeze(0).to(self.device)
+        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
+        audio_mask = audio_mask.unsqueeze(0).to(self.device)
+        # run inference
+        target_text_length = len(self.text_tokenizer(target_text))
+        retry_badcase_times = 0
+        while retry_badcase_times < retry_badcase_max_times:
+            inference_result = self._inference(
+                text_token,
+                text_mask,
+                audio_feat,
+                audio_mask,
+                min_len=min_len,
+                max_len=int(target_text_length * retry_badcase_ratio_threshold + 10) if retry_badcase else max_len,
+                inference_timesteps=inference_timesteps,
+                cfg_value=cfg_value,
+                streaming=streaming,
+            )
+            if streaming:
+                patch_len = self.patch_size * self.chunk_size
+                for latent_pred, pred_audio_feat in inference_result:
+                    decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
+                    decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
+                    yield (
+                        decode_audio,
+                        target_text_token,
+                        pred_audio_feat
+                    )
+                break
+            else:
+                latent_pred, pred_audio_feat = next(inference_result)
+                if retry_badcase:
+                    if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
+                        print(f"  Badcase detected, audio_text_ratio={pred_audio_feat.shape[0] / target_text_length}, retrying...")
+                        retry_badcase_times += 1
+                        continue
+                    else:
+                        break
+                else:
+                    break
+        if not streaming:
+            decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
+            decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio
+            yield (
+                decode_audio,
+                target_text_token,
+                pred_audio_feat
+            )
+    def inference(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
+        return next(self._inference(*args, streaming=False, **kwargs))
+    def inference_streaming(self, *args, **kwargs) -> Generator[Tuple[torch.Tensor, List[torch.Tensor]], None, None]:
+        return self._inference(*args, streaming=True, **kwargs)
+    @torch.inference_mode()
+    def _inference(
+        self,
+        text: torch.Tensor,
+        text_mask: torch.Tensor,
+        feat: torch.Tensor,
+        feat_mask: torch.Tensor,
+        min_len: int = 2,
+        max_len: int = 2000,
+        inference_timesteps: int = 10,
+        cfg_value: float = 2.0,
+        streaming: bool = False,
+    ) -> Generator[Tuple[torch.Tensor, Union[torch.Tensor, List[torch.Tensor]]], None, None]:
+        """Core inference method for audio generation.
+        This is the main inference loop that generates audio features
+        using the language model and diffusion transformer.
+        The prompt is first run through ``base_lm`` and ``residual_lm`` in one pass and
+        their KV caches are filled; afterwards one audio patch is predicted per loop
+        iteration and fed back through ``forward_step`` of both models.
+        Args:
+            text: Input text tokens
+            text_mask: Mask for text tokens
+            feat: Input audio features (4-D; unpacked as B, T, P, D)
+            feat_mask: Mask for audio features
+            min_len: Minimum generation length; stopping is only allowed once the step index exceeds it
+            max_len: Maximum generation length (upper bound on loop iterations)
+            inference_timesteps: Number of diffusion steps
+            cfg_value: Classifier-free guidance value
+            streaming: Whether to yield each step latent feature or just the final result
+        Returns:
+            Generator of Tuple containing:
+                - Predicted latent feature at the current step if ``streaming=True``, else final latent features
+                - Predicted audio feature sequence so far as a List if ``streaming=True``, else as a concatenated Tensor
+            In streaming mode the list is the same object on every yield and keeps growing.
+            In non-streaming mode exactly one tuple is yielded and its second element has
+            dim 0 squeezed and is moved to CPU.
+        """
+        # Only B is used below; T, P and D are unpacked but never read.
+        B, T, P, D = feat.shape
+        feat_embed = self.feat_encoder(feat)  # [b, t, h_feat]
+        feat_embed = self.enc_to_lm_proj(feat_embed)
+        if self.config.lm_config.use_mup:
+            scale_emb = self.config.lm_config.scale_emb
+        else:
+            scale_emb = 1.0
+        text_embed = self.base_lm.embed_tokens(text) * scale_emb
+        # Mask-weighted sum: each position takes the text embedding or the audio embedding.
+        combined_embed = text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed
+        # The last input patch conditions the first predicted patch.
+        prefix_feat_cond = feat[:, -1, ...]  # b, p, d
+        pred_feat_seq = []  # b, t, p, d
+        curr_embed = None
+        # Prefill: run the whole prompt through the base LM and store its KV cache.
+        enc_outputs, kv_cache_tuple = self.base_lm(
+            inputs_embeds=combined_embed,
+            is_causal=True,
+        )
+        self.base_lm.kv_cache.fill_caches(kv_cache_tuple)
+        # fsq_layer output is used at audio positions; text positions keep the raw LM output.
+        enc_outputs = self.fsq_layer(enc_outputs) * feat_mask.unsqueeze(-1) + enc_outputs * text_mask.unsqueeze(-1)
+        lm_hidden = enc_outputs[:, -1, :]
+        # Prefill the residual LM on the base LM output plus the audio embeddings.
+        residual_enc_outputs, residual_kv_cache_tuple = self.residual_lm(
+            inputs_embeds=enc_outputs + feat_mask.unsqueeze(-1) * feat_embed,
+            is_causal=True,
+        )
+        self.residual_lm.kv_cache.fill_caches(residual_kv_cache_tuple)
+        residual_hidden = residual_enc_outputs[:, -1, :]
+        # NOTE(review): if max_len <= 0 this loop never runs and pred_feat_seq stays empty;
+        # the torch.cat in the non-streaming branch below presumably fails then -- confirm.
+        for i in tqdm(range(max_len)):
+            dit_hidden_1 = self.lm_to_dit_proj(lm_hidden)  # [b, h_dit]
+            dit_hidden_2 = self.res_to_dit_proj(residual_hidden)  # [b, h_dit]
+            dit_hidden = dit_hidden_1 + dit_hidden_2  # [b, h_dit]
+            # Predict the next patch, conditioned on the previous patch.
+            pred_feat = self.feat_decoder(
+                mu=dit_hidden,
+                patch_size=self.patch_size,
+                cond=prefix_feat_cond.transpose(1, 2).contiguous(),
+                n_timesteps=inference_timesteps,
+                cfg_value=cfg_value,
+            ).transpose(
+                1, 2
+            )  # [b, p, d]
+            # Encode the new patch so it can be fed back into the language models.
+            curr_embed = self.feat_encoder_step(pred_feat.unsqueeze(1))  # b, 1, c
+            curr_embed = self.enc_to_lm_proj(curr_embed)
+            pred_feat_seq.append(pred_feat.unsqueeze(1))  # b, 1, p, d
+            prefix_feat_cond = pred_feat
+            if streaming:
+                # return the last three predicted latent features to provide enough context for smooth decoding
+                pred_feat_chunk = torch.cat(pred_feat_seq[-3:], dim=1)
+                feat_pred = rearrange(pred_feat_chunk, "b t p d -> b d (t p)", b=B, p=self.patch_size)
+                yield feat_pred, pred_feat_seq
+            # The stop decision reads only batch element 0 ([0] after argmax).
+            stop_flag = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden))).argmax(dim=-1)[0].cpu().item()
+            if i > min_len and stop_flag == 1:
+                break
+            # Advance both language models by one position; kv_cache.step() supplies the position argument.
+            lm_hidden = self.base_lm.forward_step(
+                curr_embed[:, 0, :], torch.tensor([self.base_lm.kv_cache.step()], device=curr_embed.device)
+            ).clone()
+            lm_hidden = self.fsq_layer(lm_hidden)
+            residual_hidden = self.residual_lm.forward_step(
+                lm_hidden + curr_embed[:, 0, :], torch.tensor([self.residual_lm.kv_cache.step()], device=curr_embed.device)
+            ).clone()
+        if not streaming:
+            pred_feat_seq = torch.cat(pred_feat_seq, dim=1)  # b, t, p, d
+            feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size)
+            yield feat_pred, pred_feat_seq.squeeze(0).cpu()
+    @classmethod
+    def from_local(cls, path: str, optimize: bool = True):
+        config = VoxCPMConfig.model_validate_json(open(os.path.join(path, "config.json")).read())
+        tokenizer = LlamaTokenizerFast.from_pretrained(path)
+        audio_vae = AudioVAE()
+        vae_state_dict = torch.load(
+            os.path.join(path, "audiovae.pth"),
+            map_location="cpu",
+            weights_only=True,
+        )["state_dict"]
+        model = cls(config, tokenizer, audio_vae)
+        lm_dtype = get_dtype(model.config.dtype)
+        model = model.to(lm_dtype)
+        model.audio_vae = model.audio_vae.to(torch.float32)
+        model_state_dict = torch.load(
+            os.path.join(path, "pytorch_model.bin"),
+            map_location="cpu",
+            weights_only=True,
+        )["state_dict"]
+        for kw, val in vae_state_dict.items():
+            model_state_dict[f"audio_vae.{kw}"] = val
+        model.load_state_dict(model_state_dict, strict=True)
+        return model.to(model.device).eval().optimize(disable=not optimize)

convert/src/voxcpm/modules/__init__.py ADDED Viewed

File without changes

convert/src/voxcpm/modules/audiovae/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .audio_vae import AudioVAE

convert/src/voxcpm/modules/audiovae/audio_vae.py ADDED Viewed

	@@ -0,0 +1,359 @@

+import math
+from typing import List, Union
+import numpy as np
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm
+def WNConv1d(*args, **kwargs):
+    return weight_norm(nn.Conv1d(*args, **kwargs))
+def WNConvTranspose1d(*args, **kwargs):
+    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
+class CausalConv1d(nn.Conv1d):
+    def __init__(self, *args, padding: int = 0, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.__padding = padding
+    def forward(self, x):
+        x_pad = F.pad(x, (self.__padding * 2, 0))
+        return super().forward(x_pad)
+class CausalTransposeConv1d(nn.ConvTranspose1d):
+    def __init__(self, *args, padding: int = 0, output_padding: int = 0, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.__padding = padding
+        self.__output_padding = output_padding
+    def forward(self, x):
+        return super().forward(x)[..., : -(self.__padding * 2 - self.__output_padding)]
+def WNCausalConv1d(*args, **kwargs):
+    return weight_norm(CausalConv1d(*args, **kwargs))
+def WNCausalTransposeConv1d(*args, **kwargs):
+    return weight_norm(CausalTransposeConv1d(*args, **kwargs))
+# Scripting this brings model speed up 1.4x
+@torch.jit.script
+def snake(x, alpha):
+    shape = x.shape
+    x = x.reshape(shape[0], shape[1], -1)
+    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
+    x = x.reshape(shape)
+    return x
+class Snake1d(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.alpha = nn.Parameter(torch.ones(1, channels, 1))
+    def forward(self, x):
+        return snake(x, self.alpha)
+def init_weights(m):
+    if isinstance(m, nn.Conv1d):
+        nn.init.trunc_normal_(m.weight, std=0.02)
+        if m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+class CausalResidualUnit(nn.Module):
+    def __init__(self, dim: int = 16, dilation: int = 1, kernel: int = 7, groups: int = 1):
+        super().__init__()
+        pad = ((7 - 1) * dilation) // 2
+        self.block = nn.Sequential(
+            Snake1d(dim),
+            WNCausalConv1d(
+                dim,
+                dim,
+                kernel_size=kernel,
+                dilation=dilation,
+                padding=pad,
+                groups=groups,
+            ),
+            Snake1d(dim),
+            WNCausalConv1d(dim, dim, kernel_size=1),
+        )
+    def forward(self, x):
+        y = self.block(x)
+        pad = (x.shape[-1] - y.shape[-1]) // 2
+        assert pad == 0
+        if pad > 0:
+            x = x[..., pad:-pad]
+        return x + y
+class CausalEncoderBlock(nn.Module):
+    def __init__(self, output_dim: int = 16, input_dim=None, stride: int = 1, groups=1):
+        super().__init__()
+        input_dim = input_dim or output_dim // 2
+        self.block = nn.Sequential(
+            CausalResidualUnit(input_dim, dilation=1, groups=groups),
+            CausalResidualUnit(input_dim, dilation=3, groups=groups),
+            CausalResidualUnit(input_dim, dilation=9, groups=groups),
+            Snake1d(input_dim),
+            WNCausalConv1d(
+                input_dim,
+                output_dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+            ),
+        )
+    def forward(self, x):
+        return self.block(x)
+class CausalEncoder(nn.Module):
+    def __init__(
+        self,
+        d_model: int = 64,
+        latent_dim: int = 32,
+        strides: list = [2, 4, 8, 8],
+        depthwise: bool = False,
+    ):
+        super().__init__()
+        # Create first convolution
+        self.block = [WNCausalConv1d(1, d_model, kernel_size=7, padding=3)]
+        # Create EncoderBlocks that double channels as they downsample by `stride`
+        for stride in strides:
+            d_model *= 2
+            groups = d_model // 2 if depthwise else 1
+            self.block += [CausalEncoderBlock(output_dim=d_model, stride=stride, groups=groups)]
+        groups = d_model if depthwise else 1
+        # Create two convolution, for mu and logvar
+        self.fc_mu = WNCausalConv1d(d_model, latent_dim, kernel_size=3, padding=1)
+        self.fc_logvar = WNCausalConv1d(d_model, latent_dim, kernel_size=3, padding=1)
+        # Wrap black into nn.Sequential
+        self.block = nn.Sequential(*self.block)
+        self.enc_dim = d_model
+    def forward(self, x):
+        hidden_state = self.block(x)
+        return {
+            "hidden_state": hidden_state,
+            "mu": self.fc_mu(hidden_state),
+            "logvar": self.fc_logvar(hidden_state),
+        }
+class NoiseBlock(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.linear = WNCausalConv1d(dim, dim, kernel_size=1, bias=False)
+    def forward(self, x):
+        B, C, T = x.shape
+        noise = torch.randn((B, 1, T), device=x.device, dtype=x.dtype)
+        h = self.linear(x)
+        n = noise * h
+        x = x + n
+        return x
+class CausalDecoderBlock(nn.Module):
+    def __init__(
+        self,
+        input_dim: int = 16,
+        output_dim: int = 8,
+        stride: int = 1,
+        groups=1,
+        use_noise_block: bool = False,
+    ):
+        super().__init__()
+        layers = [
+            Snake1d(input_dim),
+            WNCausalTransposeConv1d(
+                input_dim,
+                output_dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+                output_padding=stride % 2,
+            ),
+        ]
+        if use_noise_block:
+            layers.append(NoiseBlock(output_dim))
+        layers.extend(
+            [
+                CausalResidualUnit(output_dim, dilation=1, groups=groups),
+                CausalResidualUnit(output_dim, dilation=3, groups=groups),
+                CausalResidualUnit(output_dim, dilation=9, groups=groups),
+            ]
+        )
+        self.block = nn.Sequential(*layers)
+    def forward(self, x):
+        return self.block(x)
+class TransposeLastTwoDim(torch.nn.Module):
+    def forward(self, x):
+        return torch.transpose(x, -1, -2)
+class CausalDecoder(nn.Module):
+    def __init__(
+        self,
+        input_channel,
+        channels,
+        rates,
+        depthwise: bool = False,
+        d_out: int = 1,
+        use_noise_block: bool = False,
+    ):
+        super().__init__()
+        # Add first conv layer
+        if depthwise:
+            layers = [
+                WNCausalConv1d(
+                    input_channel,
+                    input_channel,
+                    kernel_size=7,
+                    padding=3,
+                    groups=input_channel,
+                ),
+                WNCausalConv1d(input_channel, channels, kernel_size=1),
+            ]
+        else:
+            layers = [WNCausalConv1d(input_channel, channels, kernel_size=7, padding=3)]
+        # Add upsampling + MRF blocks
+        for i, stride in enumerate(rates):
+            input_dim = channels // 2**i
+            output_dim = channels // 2 ** (i + 1)
+            groups = output_dim if depthwise else 1
+            layers += [
+                CausalDecoderBlock(
+                    input_dim,
+                    output_dim,
+                    stride,
+                    groups=groups,
+                    use_noise_block=use_noise_block,
+                )
+            ]
+        # Add final conv layer
+        layers += [
+            Snake1d(output_dim),
+            WNCausalConv1d(output_dim, d_out, kernel_size=7, padding=3),
+            nn.Tanh(),
+        ]
+        self.model = nn.Sequential(*layers)
+    def forward(self, x):
+        return self.model(x)
+class AudioVAE(nn.Module):
+    """
+    Args:
+    """
+    def __init__(
+        self,
+        encoder_dim: int = 128,
+        encoder_rates: List[int] = [2, 5, 8, 8],
+        latent_dim: int = 64,
+        decoder_dim: int = 1536,
+        decoder_rates: List[int] = [8, 8, 5, 2],
+        depthwise: bool = True,
+        sample_rate: int = 16000,
+        use_noise_block: bool = False,
+    ):
+        super().__init__()
+        self.encoder_dim = encoder_dim
+        self.encoder_rates = encoder_rates
+        self.decoder_dim = decoder_dim
+        self.decoder_rates = decoder_rates
+        self.depthwise = depthwise
+        self.use_noise_block = use_noise_block
+        if latent_dim is None:
+            latent_dim = encoder_dim * (2 ** len(encoder_rates))
+        self.latent_dim = latent_dim
+        self.hop_length = np.prod(encoder_rates)
+        self.encoder = CausalEncoder(
+            encoder_dim,
+            latent_dim,
+            encoder_rates,
+            depthwise=depthwise,
+        )
+        self.decoder = CausalDecoder(
+            latent_dim,
+            decoder_dim,
+            decoder_rates,
+            depthwise=depthwise,
+            use_noise_block=use_noise_block,
+        )
+        self.sample_rate = sample_rate
+        self.chunk_size = math.prod(encoder_rates)
+    def preprocess(self, audio_data, sample_rate):
+        if sample_rate is None:
+            sample_rate = self.sample_rate
+        assert sample_rate == self.sample_rate
+        pad_to = self.hop_length
+        length = audio_data.shape[-1]
+        right_pad = math.ceil(length / pad_to) * pad_to - length
+        audio_data = nn.functional.pad(audio_data, (0, right_pad))
+        return audio_data
+    def decode(self, z: torch.Tensor):
+        """Decode given latent codes and return audio data
+        Parameters
+        ----------
+        z : Tensor[B x D x T]
+            Quantized continuous representation of input
+        length : int, optional
+            Number of samples in output audio, by default None
+        Returns
+        -------
+        dict
+            A dictionary with the following keys:
+            "audio" : Tensor[B x 1 x length]
+                Decoded audio data.
+        """
+        return self.decoder(z)
+    def encode(self, audio_data: torch.Tensor, sample_rate: int):
+        """
+        Args:
+            audio_data: Tensor[B x 1 x T]
+            sample_rate: int
+        Returns:
+            z: Tensor[B x D x T]
+        """
+        if audio_data.ndim == 2:
+            audio_data = audio_data.unsqueeze(1)
+        audio_data = self.preprocess(audio_data, sample_rate)
+        return self.encoder(audio_data)["mu"]

convert/src/voxcpm/modules/layers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .scalar_quantization_layer import ScalarQuantizationLayer

convert/src/voxcpm/modules/layers/scalar_quantization_layer.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+import torch.nn as nn
+class ScalarQuantizationLayer(nn.Module):
+    def __init__(self, in_dim, out_dim, latent_dim: int = 64, scale: int = 9):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.latent_dim = latent_dim
+        self.scale = scale
+        self.in_proj = nn.Linear(in_dim, latent_dim)
+        self.out_proj = nn.Linear(latent_dim, out_dim)
+    def forward(self, hidden):
+        hidden = self.in_proj(hidden)
+        hidden = torch.tanh(hidden)
+        if self.training:
+            quantized = torch.round(hidden * self.scale) / self.scale
+            hidden = hidden + (quantized - hidden).detach()
+        else:
+            hidden = torch.round(hidden * self.scale) / self.scale
+        return self.out_proj(hidden)

convert/src/voxcpm/modules/locdit/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .unified_cfm import UnifiedCFM, CfmConfig
2	+ from .local_dit import VoxCPMLocDiT

convert/src/voxcpm/modules/locdit/local_dit.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import torch
+from ..minicpm4 import MiniCPMModel, MiniCPM4Config
+import torch.nn as nn
+import math
+class SinusoidalPosEmb(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
+    def forward(self, x, scale=1000):
+        if x.ndim < 1:
+            x = x.unsqueeze(0)
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=x.dtype, device=device) * -emb)
+        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+class TimestepEmbedding(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        time_embed_dim: int,
+        out_dim: int = None,
+    ):
+        super().__init__()
+        self.linear_1 = nn.Linear(in_channels, time_embed_dim, bias=True)
+        self.act = nn.SiLU()
+        if out_dim is not None:
+            time_embed_dim_out = out_dim
+        else:
+            time_embed_dim_out = time_embed_dim
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, bias=True)
+    def forward(self, sample):
+        sample = self.linear_1(sample)
+        sample = self.act(sample)
+        sample = self.linear_2(sample)
+        return sample
+class VoxCPMLocDiT(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+    """
+    def __init__(
+        self,
+        config: MiniCPM4Config,
+        in_channels: int = 64,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.config = config
+        self.in_proj = nn.Linear(in_channels, config.hidden_size, bias=True)
+        self.cond_proj = nn.Linear(in_channels, config.hidden_size, bias=True)
+        self.out_proj = nn.Linear(config.hidden_size, self.out_channels, bias=True)
+        self.time_embeddings = SinusoidalPosEmb(config.hidden_size)
+        self.time_mlp = TimestepEmbedding(
+            in_channels=config.hidden_size,
+            time_embed_dim=config.hidden_size,
+        )
+        self.delta_time_mlp = TimestepEmbedding(
+            in_channels=config.hidden_size,
+            time_embed_dim=config.hidden_size,
+        )
+        assert config.vocab_size == 0, "vocab_size must be 0 for local DiT"
+        self.decoder = MiniCPMModel(config)
+    def forward(
+        self,
+        x: torch.Tensor,
+        mu: torch.Tensor,
+        t: torch.Tensor,
+        cond: torch.Tensor,
+        dt: torch.Tensor,
+    ):
+        """
+        Forward pass of DiT.
+        x: (N, C, T) tensor of inputs
+        mu: (N, C) tensor of hidden embedding
+        t: (N,) tensor of diffusion timesteps
+        cond: (N, C, T') tensor of prefix conditions
+        dt: (N,) used for mean velocity (may be supported in the future...)
+        """
+        x = self.in_proj(x.transpose(1, 2).contiguous())
+        cond = self.cond_proj(cond.transpose(1, 2).contiguous())
+        prefix = cond.size(1)
+        t = self.time_embeddings(t).to(x.dtype)
+        t = self.time_mlp(t)
+        dt = self.time_embeddings(dt).to(x.dtype)
+        dt = self.delta_time_mlp(dt)
+        t = t + dt
+        x = torch.cat([(mu + t).unsqueeze(1), cond, x], dim=1)
+        hidden, _ = self.decoder(x, is_causal=False)
+        hidden = hidden[:, prefix + 1 :, :]
+        hidden = self.out_proj(hidden)
+        return hidden.transpose(1, 2).contiguous()

convert/src/voxcpm/modules/locdit/unified_cfm.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import torch
+from typing import List
+from .local_dit import VoxCPMLocDiT
+import math
+from pydantic import BaseModel
+class CfmConfig(BaseModel):
+    # Settings that UnifiedCFM.__init__ copies onto the module as attributes.
+    # Stored as UnifiedCFM.sigma_min; not read by the sampling code in this file.
+    sigma_min: float = 1e-06
+    # Stored as UnifiedCFM.solver; UnifiedCFM.forward always calls solve_euler regardless of this value.
+    solver: str = "euler"
+    # Stored as UnifiedCFM.t_scheduler; not read by the sampling code in this file.
+    t_scheduler: str = "log-norm"
+class UnifiedCFM(torch.nn.Module):
+    """Sampler that integrates an estimator's predicted velocity with fixed Euler steps.
+    ``forward`` draws Gaussian noise, builds a time grid from 1 to 0 and hands both to
+    ``solve_euler``, which applies classifier-free guidance on a doubled batch.
+    """
+    def __init__(
+        self,
+        in_channels,
+        cfm_params: CfmConfig,
+        estimator: VoxCPMLocDiT,
+        mean_mode: bool = False,
+    ):
+        super().__init__()
+        self.solver = cfm_params.solver
+        self.sigma_min = cfm_params.sigma_min
+        self.t_scheduler = cfm_params.t_scheduler
+        self.in_channels = in_channels
+        # When False, the step size passed to the estimator is replaced by zeros (see solve_euler).
+        self.mean_mode = mean_mode
+        # Just change the architecture of the estimator here
+        self.estimator = estimator
+    @torch.inference_mode()
+    def forward(
+        self,
+        mu: torch.Tensor,
+        n_timesteps: int,
+        patch_size: int,
+        cond: torch.Tensor,
+        temperature: float = 1.0,
+        cfg_value: float = 1.0,
+        sway_sampling_coef: float = 1.0,
+        use_cfg_zero_star: bool = True,
+    ):
+        """Sample one patch by integrating from noise with ``solve_euler``.
+        Args:
+            mu (torch.Tensor): conditioning vector passed to the estimator
+                shape: (batch_size, n_feats)
+            n_timesteps (int): number of Euler steps; the time grid has n_timesteps + 1 points
+            patch_size (int): length of the last axis of the sampled tensor
+            cond: prefix condition, forwarded to the estimator on every guided step
+            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
+            cfg_value (float, optional): classifier-free guidance weight. Defaults to 1.0.
+            sway_sampling_coef (float, optional): weight of the cosine warp applied to the time grid.
+            use_cfg_zero_star (bool, optional): forwarded to ``solve_euler``.
+        Returns:
+            sample: the final state of the Euler integration; the initial noise has
+                shape (batch_size, in_channels, patch_size)
+        """
+        b, c = mu.shape
+        t = patch_size
+        z = torch.randn((b, self.in_channels, t), device=mu.device, dtype=mu.dtype) * temperature
+        # Time runs from 1 down to 0.
+        t_span = torch.linspace(1, 0, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
+        # Sway sampling strategy
+        t_span = t_span + sway_sampling_coef * (torch.cos(torch.pi / 2 * t_span) - 1 + t_span)
+        return self.solve_euler(z, t_span=t_span, mu=mu, cond=cond, cfg_value=cfg_value, use_cfg_zero_star=use_cfg_zero_star)
+    def optimized_scale(self, positive_flat, negative_flat):
+        """Return the per-row scalar ``<pos, neg> / (|neg|^2 + 1e-8)`` with shape (rows, 1)."""
+        dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
+        squared_norm = torch.sum(negative_flat ** 2, dim=1, keepdim=True) + 1e-8
+        st_star = dot_product / squared_norm
+        return st_star
+    def solve_euler(
+        self,
+        x: torch.Tensor,
+        t_span: torch.Tensor,
+        mu: torch.Tensor,
+        cond: torch.Tensor,
+        cfg_value: float = 1.0,
+        use_cfg_zero_star: bool = True,
+    ):
+        """
+        Fixed euler solver for ODEs.
+        Args:
+            x (torch.Tensor): random noise
+            t_span (torch.Tensor): n_timesteps interpolated
+                shape: (n_timesteps + 1,)
+            mu (torch.Tensor): conditioning vector
+                shape: (batch_size, n_feats)
+            cond: condition -- prefix prompt
+            cfg_value (float, optional): cfg value for guidance. Defaults to 1.0.
+            use_cfg_zero_star (bool, optional): if True, the first steps use a zero
+                velocity and the unconditional branch is rescaled by ``optimized_scale``.
+        Returns:
+            The state after the last step.
+        NOTE(review): ``t_span`` needs at least two entries (``t_span[1]`` is read up front).
+        """
+        t, _, dt = t_span[0], t_span[-1], t_span[0] - t_span[1]
+        # Every intermediate state is kept here, but only the last one is returned.
+        sol = []
+        # Number of leading steps that use a zero velocity when use_cfg_zero_star is set (at least 1).
+        zero_init_steps = max(1, int(len(t_span) * 0.04))
+        for step in range(1, len(t_span)):
+            if use_cfg_zero_star and step <= zero_init_steps:
+                dphi_dt = 0.
+            else:
+                # Classifier-Free Guidance inference introduced in VoiceBox
+                # The batch is doubled: rows [:b] carry mu, rows [b:] keep mu at zero.
+                b = x.size(0)
+                x_in = torch.zeros([2 * b, self.in_channels, x.size(2)], device=x.device, dtype=x.dtype)
+                mu_in = torch.zeros([2 * b, mu.size(1)], device=x.device, dtype=x.dtype)
+                t_in = torch.zeros([2 * b], device=x.device, dtype=x.dtype)
+                dt_in = torch.zeros([2 * b], device=x.device, dtype=x.dtype)
+                cond_in = torch.zeros([2 * b, self.in_channels, x.size(2)], device=x.device, dtype=x.dtype)
+                x_in[:b], x_in[b:] = x, x
+                mu_in[:b] = mu
+                t_in[:b], t_in[b:] = t.unsqueeze(0), t.unsqueeze(0)
+                dt_in[:b], dt_in[b:] = dt.unsqueeze(0), dt.unsqueeze(0)
+                # not used now
+                if not self.mean_mode:
+                    dt_in = torch.zeros_like(dt_in)
+                cond_in[:b], cond_in[b:] = cond, cond
+                dphi_dt = self.estimator(x_in, mu_in, t_in, cond_in, dt_in)
+                # First half: prediction with mu; second half: prediction with mu zeroed.
+                dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
+                if use_cfg_zero_star:
+                    positive_flat = dphi_dt.view(b, -1)
+                    negative_flat = cfg_dphi_dt.view(b, -1)
+                    st_star = self.optimized_scale(positive_flat, negative_flat)
+                    st_star = st_star.view(b, *([1] * (len(dphi_dt.shape) - 1)))
+                else:
+                    st_star = 1.0
+                # Guidance: scaled zero-mu branch plus cfg_value times the difference to it.
+                dphi_dt = cfg_dphi_dt * st_star + cfg_value * (dphi_dt - cfg_dphi_dt * st_star)
+            x = x - dt * dphi_dt
+            t = t - dt
+            sol.append(x)
+            # Step size for the next iteration, taken from the (possibly non-uniform) grid.
+            if step < len(t_span) - 1:
+                dt = t - t_span[step + 1]
+        return sol[-1]

convert/src/voxcpm/modules/locenc/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .local_encoder import VoxCPMLocEnc

convert/src/voxcpm/modules/locenc/local_encoder.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import torch
+import torch.nn as nn
+from ..minicpm4 import MiniCPMModel, MiniCPM4Config
+from einops import rearrange
+class VoxCPMLocEnc(nn.Module):
+    def __init__(self, config: MiniCPM4Config, input_dim: int = 64):
+        super().__init__()
+        self.config = config
+        self.special_token = nn.Parameter(torch.randn(1, 1, 1, config.hidden_size))
+        self.in_proj = nn.Linear(input_dim, config.hidden_size, bias=True)
+        assert config.vocab_size == 0, "vocab_size must be 0 for local encoder"
+        self.encoder = MiniCPMModel(config)
+    def forward(self, x):
+        """
+        x: [B, T, P, D]
+        """
+        B, T, P, D = x.shape
+        x = self.in_proj(x)
+        special_tokens = self.special_token.expand(B, T, 1, -1)
+        x = torch.cat([special_tokens, x], dim=2)
+        x = rearrange(x, "b t p c -> (b t) p c")
+        outputs, _ = self.encoder(x, is_causal=False)
+        cls_output = outputs[:, 0, :]
+        return rearrange(cls_output, "(b t) c -> b t c", b=B)

convert/src/voxcpm/modules/minicpm4/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .config import MiniCPM4Config
+from .model import MiniCPMModel
+from .cache import StaticKVCache

convert/src/voxcpm/modules/minicpm4/cache.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from typing import List, Tuple
+import torch
+class StaticKVCache:
+    def __init__(
+        self,
+        num_layers: int,
+        num_kv_heads: int,
+        dim_kv_head: int,
+        batch_size: int,
+        device: torch.device,
+        dtype: torch.dtype,
+        max_length: int = 8192,
+    ):
+        self.max_length = max_length
+        self.num_layers = num_layers
+        self.kv_cache = torch.zeros(
+            2,
+            num_layers,
+            batch_size,
+            num_kv_heads,
+            max_length,
+            dim_kv_head,
+            device=device,
+            dtype=dtype,
+        )
+        self.current_length = 0
+    def get_layer_cache(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        return self.kv_cache[0, layer_idx], self.kv_cache[1, layer_idx]
+    def step(self) -> int:
+        if self.current_length >= self.max_length:
+            raise ValueError("KV cache is full")
+        ret = self.current_length
+        self.current_length += 1
+        return ret
+    def fill_caches(self, kv_caches: List[Tuple[torch.Tensor, torch.Tensor]]):
+        self.current_length = kv_caches[0][0].size(2)
+        self.kv_cache.zero_()
+        for i in range(self.num_layers):
+            self.kv_cache[0, i, :, :, : self.current_length, :] = kv_caches[i][0]
+            self.kv_cache[1, i, :, :, : self.current_length, :] = kv_caches[i][1]

convert/src/voxcpm/modules/minicpm4/config.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from pydantic import BaseModel
+from typing import List
+class RopeScalingConfig(BaseModel):
+    """RoPE scaling settings read by the MiniCPM rotary embedding."""
+    # Scaling scheme identifier; NOTE(review): not read by the rotary embedding code in this package — confirm usage.
+    type: str
+    # Per-frequency divisors applied when the cached length exceeds original_max_position_embeddings.
+    long_factor: List[float]
+    # Per-frequency divisors applied otherwise.
+    short_factor: List[float]
+    # Length threshold for choosing long_factor; also enters the cos/sin scaling factor.
+    original_max_position_embeddings: int
+class MiniCPM4Config(BaseModel):
+    bos_token_id: int
+    eos_token_id: int
+    hidden_size: int
+    intermediate_size: int
+    max_position_embeddings: int
+    num_attention_heads: int
+    num_hidden_layers: int
+    num_key_value_heads: int
+    rms_norm_eps: float
+    rope_scaling: RopeScalingConfig
+    vocab_size: int
+    use_mup: bool = True
+    scale_emb: float
+    dim_model_base: int
+    scale_depth: float
+    rope_theta: float
+    kv_channels: int = None

convert/src/voxcpm/modules/minicpm4/model.py ADDED Viewed

	@@ -0,0 +1,473 @@

+from .config import MiniCPM4Config
+import torch
+import torch.nn as nn
+from typing import List, Tuple
+import math
+from .cache import StaticKVCache
+def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
+    old_dtype = hidden.dtype
+    variance = hidden.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
+    hidden = (hidden * torch.rsqrt(variance + eps)).to(old_dtype)
+    return hidden * weight
+class MiniCPMRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MiniCPMRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
+    """
+    Args:
+        q: Tensor(batch_size, num_heads, seq_len, head_dim)
+        k: Tensor(batch_size, num_key_value_heads, seq_len, head_dim)
+        cos: Tensor(seq_len, head_dim)
+        sin: Tensor(seq_len, head_dim)
+    Returns:
+        Tensor(batch_size, num_heads, seq_len, head_dim), Tensor(batch_size, num_key_value_heads, seq_len, head_dim)
+    """
+    orig_dtype = q.dtype
+    q = q.to(torch.float32)
+    k = k.to(torch.float32)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed.to(orig_dtype), k_embed.to(orig_dtype)
+def scaled_dot_product_attention_gqa_compat(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    *,
+    attn_mask: torch.Tensor | None = None,
+    is_causal: bool = False,
+    enable_gqa: bool = False,
+) -> torch.Tensor:
+    """ONNX-export friendly fallback for scaled_dot_product_attention(enable_gqa=True)."""
+    orig_dtype = query.dtype
+    query = query.to(torch.float32)
+    key = key.to(torch.float32)
+    value = value.to(torch.float32)
+    if enable_gqa and query.shape[-3] != key.shape[-3]:
+        repeat_factor = query.shape[-3] // key.shape[-3]
+        key = key.repeat_interleave(repeat_factor, dim=-3)
+        value = value.repeat_interleave(repeat_factor, dim=-3)
+    scale = 1.0 / math.sqrt(query.size(-1))
+    attn_scores = torch.matmul(query, key.transpose(-2, -1)) * scale
+    if is_causal:
+        q_len = query.size(-2)
+        k_len = key.size(-2)
+        q_pos = torch.arange(q_len, device=query.device).unsqueeze(-1)
+        k_pos = torch.arange(k_len, device=query.device).unsqueeze(0)
+        causal_mask = k_pos <= (q_pos + k_len - q_len)
+        attn_scores = attn_scores.masked_fill(~causal_mask, torch.finfo(attn_scores.dtype).min)
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            while attn_mask.ndim < attn_scores.ndim:
+                attn_mask = attn_mask.unsqueeze(0)
+            attn_scores = attn_scores.masked_fill(~attn_mask, torch.finfo(attn_scores.dtype).min)
+        else:
+            attn_scores = attn_scores + attn_mask.to(attn_scores.dtype)
+    attn_probs = torch.softmax(attn_scores, dim=-1)
+    return torch.matmul(attn_probs, value).to(orig_dtype)
+class MiniCPMLongRoPE(nn.Module):
+    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+    def __init__(self, config: MiniCPM4Config):
+        super().__init__()
+        self.config = config
+        self.dim = config.kv_channels if config.kv_channels else config.hidden_size // config.num_attention_heads
+        self.base = config.rope_theta
+        self.max_position_embeddings = config.max_position_embeddings
+        self.short_factor = config.rope_scaling.short_factor
+        self.long_factor = config.rope_scaling.long_factor
+        self.original_max_position_embeddings = config.rope_scaling.original_max_position_embeddings
+        scale = (self.max_position_embeddings / self.original_max_position_embeddings)
+        self.scaling_factor = math.sqrt(
+            1 + math.log(scale) / math.log(self.original_max_position_embeddings)
+        )
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.max_seq_len_cached = 0
+        self.register_buffer("cos_cached", torch.empty(0), persistent=False)
+        self.register_buffer("sin_cached", torch.empty(0), persistent=False)
+        self._set_cos_sin_cache(
+            seq_len=self.max_position_embeddings,
+            device=self.inv_freq.device,
+            dtype=torch.float32
+        )
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """设置cos和sin缓存"""
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        if seq_len > self.original_max_position_embeddings:
+            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=device)
+        else:
+            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=device)
+        freqs = torch.mul(
+            torch.outer(t, 1.0 / ext_factors).to(device=device),
+            self.inv_freq.to(device=device).to(dtype)
+        )
+        # 创建embeddings
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.cos_cached = emb.cos().to(dtype) * self.scaling_factor
+        self.sin_cached = emb.sin().to(dtype) * self.scaling_factor
+    def forward(self, position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            position_ids: Tensor(seq_len) 或 Tensor(batch_size, seq_len)
+        Returns:
+            Tensor(seq_len, head_dim), Tensor(seq_len, head_dim)
+        """
+        cos = self.cos_cached[position_ids]
+        sin = self.sin_cached[position_ids]
+        return cos, sin
+class MiniCPMAttention(nn.Module):
+    def __init__(self, config: MiniCPM4Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = 10000.0
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_emb: Tuple[torch.Tensor, torch.Tensor],
+        is_causal: bool,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        cos, sin = position_emb
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        # ref: https://github.com/pytorch/pytorch/issues/163597
+        # there is a bug in MPS for non-contiguous tensors, so we need to make them contiguous
+        query_states = query_states.contiguous()
+        key_states = key_states.contiguous()
+        value_states = value_states.contiguous()
+        if torch.onnx.is_in_onnx_export():
+            attn_output = scaled_dot_product_attention_gqa_compat(
+                query_states,
+                key_states,
+                value_states,
+                is_causal=is_causal,
+                enable_gqa=True,
+            )
+        else:
+            attn_output = torch.nn.functional.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                is_causal=is_causal,
+                enable_gqa=True,
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
+        attn_output = self.o_proj(attn_output)
+        past_key_value = (key_states, value_states)
+        return attn_output, past_key_value
+    def forward_step(
+        self,
+        hidden_states: torch.Tensor,
+        position_emb: Tuple[torch.Tensor, torch.Tensor],
+        position_id: int,
+        kv_cache: Tuple[torch.Tensor, torch.Tensor],
+    ) -> torch.Tensor:
+        bsz, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, 1, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, 1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, 1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        cos, sin = position_emb
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        key_cache, value_cache = kv_cache
+        key_cache[:, :, position_id, :] = key_states
+        value_cache[:, :, position_id, :] = value_states
+        attn_mask = torch.arange(key_cache.size(2), device=key_cache.device) <= position_id
+        # ref: https://github.com/pytorch/pytorch/issues/163597
+        # there is a bug in MPS for non-contiguous tensors, so we need to make them contiguous
+        query_states = query_states.unsqueeze(0)
+        key_cache = key_cache.unsqueeze(0)
+        value_cache = value_cache.unsqueeze(0)
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_cache,
+            value_cache,
+            attn_mask=attn_mask,
+            enable_gqa=True,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, self.num_heads * self.head_dim)
+        attn_output = self.o_proj(attn_output)
+        return attn_output
+class MiniCPMMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = nn.SiLU()
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+class MiniCPMDecoderLayer(nn.Module):
+    def __init__(self, config: MiniCPM4Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = MiniCPMAttention(config=config, layer_idx=layer_idx)
+        self.mlp = MiniCPMMLP(config)
+        self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.scale_depth = config.scale_depth
+        self.num_hidden_layers = config.num_hidden_layers
+        self.use_mup = config.use_mup
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_emb: Tuple[torch.Tensor, torch.Tensor],
+        is_causal: bool,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            position_ids (`torch.LongTensor`): position ids of shape `(batch_size, seq_len)`
+            is_causal (`bool`): whether the attention mask is causal
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            position_emb=position_emb,
+            is_causal=is_causal,
+        )
+        if self.use_mup:
+            hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
+        else:
+            hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        if self.use_mup:
+            hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
+        else:
+            hidden_states = residual + hidden_states
+        return hidden_states, present_key_value
+    def forward_step(
+        self,
+        hidden_states: torch.Tensor,
+        position_emb: Tuple[torch.Tensor, torch.Tensor],
+        position_id: torch.Tensor,
+        kv_cache: Tuple[torch.Tensor, torch.Tensor],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states = self.self_attn.forward_step(
+            hidden_states=hidden_states,
+            position_emb=position_emb,
+            position_id=position_id,
+            kv_cache=kv_cache,
+        )
+        if self.use_mup:
+            hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
+        else:
+            hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        if self.use_mup:
+            hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
+        else:
+            hidden_states = residual + hidden_states
+        return hidden_states
+class MiniCPMModel(nn.Module):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
+    Args:
+        config: MiniCPMConfig
+    """
+    def __init__(self, config: MiniCPM4Config):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.config = config
+        if config.vocab_size > 0:
+            self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        else:
+            self.embed_tokens = nn.Identity()
+        self.layers = nn.ModuleList(
+            [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rope_emb = MiniCPMLongRoPE(config)
+        self.kv_cache = None
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        is_causal: bool = True,
+    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+        """
+        Args:
+            inputs_embeds: Tensor(batch_size, seq_length, hidden_size)
+            is_causal: bool, whether the attention mask is causal
+        Returns:
+            hidden_states: Tensor(batch_size, seq_length, hidden_size)
+            next_decoder_cache: List[(batch_size, num_heads, seq_length, head_dim), (batch_size, num_heads, seq_length, head_dim)]
+        """
+        position_ids = torch.arange(0, inputs_embeds.size(1), dtype=torch.long, device=inputs_embeds.device)
+        position_emb = self.rope_emb(position_ids)
+        hidden_states = inputs_embeds
+        next_decoder_cache = []
+        for decoder_layer in self.layers:
+            hidden_states, this_cache = decoder_layer(
+                hidden_states,
+                position_emb,
+                is_causal,
+            )
+            next_decoder_cache.append(this_cache)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states, next_decoder_cache
+    def forward_step(
+        self,
+        inputs_embeds: torch.Tensor,
+        position_id: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            inputs_embeds: Tensor(batch_size, hidden_size)
+        Returns:
+            hidden_states: Tensor(batch_size, hidden_size)
+        """
+        assert self.kv_cache is not None, "KV cache is not setup"
+        position_emb = self.rope_emb(position_id)
+        hidden_states = inputs_embeds
+        for i, decoder_layer in enumerate(self.layers):
+            hidden_states = decoder_layer.forward_step(
+                hidden_states,
+                position_emb,
+                position_id,
+                self.kv_cache.get_layer_cache(i),
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+    def setup_cache(self, batch_size: int, max_length: int, device, dtype: torch.dtype):
+        self.kv_cache = StaticKVCache(
+            num_layers=self.config.num_hidden_layers,
+            num_kv_heads=self.config.num_key_value_heads,
+            dim_kv_head=self.config.hidden_size // self.config.num_attention_heads if self.config.kv_channels is None else self.config.kv_channels,
+            batch_size=batch_size,
+            device=device,
+            dtype=dtype,
+            max_length=max_length,
+        )

convert/src/voxcpm/utils/text_normalize.py ADDED Viewed

	@@ -0,0 +1,185 @@

+# some functions are copied from https://github.com/FunAudioLLM/CosyVoice/blob/main/cosyvoice/utils/frontend_utils.py
+import re
+import regex
+import inflect
+from functools import partial
+from wetext import Normalizer
+chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')
+# whether contain chinese character
+def contains_chinese(text):
+    return bool(chinese_char_pattern.search(text))
+# replace special symbol
+def replace_corner_mark(text):
+    text = text.replace('²', '平方')
+    text = text.replace('³', '立方')
+    text = text.replace('√', '根号')
+    text = text.replace('≈', '约等于')
+    text = text.replace('<', '小于')
+    return text
+# remove meaningless symbol
+def remove_bracket(text):
+    text = text.replace('（', ' ').replace('）', ' ')
+    text = text.replace('【', ' ').replace('】', ' ')
+    text = text.replace('`', '').replace('`', '')
+    text = text.replace("——", " ")
+    return text
+# spell Arabic numerals
+def spell_out_number(text: str, inflect_parser):
+    new_text = []
+    st = None
+    for i, c in enumerate(text):
+        if not c.isdigit():
+            if st is not None:
+                num_str = inflect_parser.number_to_words(text[st: i])
+                new_text.append(num_str)
+                st = None
+            new_text.append(c)
+        else:
+            if st is None:
+                st = i
+    if st is not None and st < len(text):
+        num_str = inflect_parser.number_to_words(text[st:])
+        new_text.append(num_str)
+    return ''.join(new_text)
+# split paragraph logic:
+# 1. each sentence is at most token_max_n and at least token_min_n long; the last sentence is merged into the previous one if it is shorter than merge_len
+# 2. sentence length is calculated according to lang
+# 3. sentences are split at punctuation
+def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False):
+    def calc_utt_length(_text: str):
+        if lang == "zh":
+            return len(_text)
+        else:
+            return len(tokenize(_text))
+    def should_merge(_text: str):
+        if lang == "zh":
+            return len(_text) < merge_len
+        else:
+            return len(tokenize(_text)) < merge_len
+    if lang == "zh":
+        pounc = ['。', '？', '！', '；', '：', '、', '.', '?', '!', ';']
+    else:
+        pounc = ['.', '?', '!', ';', ':']
+    if comma_split:
+        pounc.extend(['，', ','])
+    st = 0
+    utts = []
+    for i, c in enumerate(text):
+        if c in pounc:
+            if len(text[st: i]) > 0:
+                utts.append(text[st: i] + c)
+            if i + 1 < len(text) and text[i + 1] in ['"', '”']:
+                tmp = utts.pop(-1)
+                utts.append(tmp + text[i + 1])
+                st = i + 2
+            else:
+                st = i + 1
+    if len(utts) == 0:
+        if lang == "zh":
+            utts.append(text + '。')
+        else:
+            utts.append(text + '.')
+    final_utts = []
+    cur_utt = ""
+    for utt in utts:
+        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
+            final_utts.append(cur_utt)
+            cur_utt = ""
+        cur_utt = cur_utt + utt
+    if len(cur_utt) > 0:
+        if should_merge(cur_utt) and len(final_utts) != 0:
+            final_utts[-1] = final_utts[-1] + cur_utt
+        else:
+            final_utts.append(cur_utt)
+    return final_utts
+# remove blank between chinese character
+def replace_blank(text: str):
+    out_str = []
+    for i, c in enumerate(text):
+        if c == " ":
+            if ((text[i + 1].isascii() and text[i + 1] != " ") and
+                    (text[i - 1].isascii() and text[i - 1] != " ")):
+                out_str.append(c)
+        else:
+            out_str.append(c)
+    return "".join(out_str)
+def clean_markdown(md_text: str) -> str:
+    # 去除代码块 ``` ```（包括多行）
+    md_text = re.sub(r"```.*?```", "", md_text, flags=re.DOTALL)
+    # 去除内联代码 `code`
+    md_text = re.sub(r"`[^`]*`", "", md_text)
+    # 去除图片语法 ![alt](url)
+    md_text = re.sub(r"!\[[^\]]*\]\([^\)]+\)", "", md_text)
+    # 去除链接但保留文本 [text](url) -> text
+    md_text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", md_text)
+    # 替换无序列表符号
+    md_text = re.sub(r'^(\s*)-\s+', r'\1', md_text, flags=re.MULTILINE)
+    # 去除HTML标签
+    md_text = re.sub(r"<[^>]+>", "", md_text)
+    # 去除标题符号（#）
+    md_text = re.sub(r"^#{1,6}\s*", "", md_text, flags=re.MULTILINE)
+    # 去除多余空格和空行
+    md_text = re.sub(r"\n\s*\n", "\n", md_text)  # 多余空行
+    md_text = md_text.strip()
+    return md_text
+def clean_text(text):
+    # 去除 Markdown 语法
+    text = clean_markdown(text)
+    # 匹配并移除表情符号
+    text = regex.compile(r'\p{Emoji_Presentation}|\p{Emoji}\uFE0F', flags=regex.UNICODE).sub("",text)
+    # 去除换行符
+    text = text.replace("\n", " ")
+    text = text.replace("\t", " ")
+    text = text.replace('"', "\“")
+    return text
+class TextNormalizer:
+    """Normalizes Chinese or English text using wetext normalizers and inflect."""
+    def __init__(self, tokenizer=None):
+        # Stored only; no method of this class reads it.
+        self.tokenizer = tokenizer
+        self.zh_tn_model = Normalizer(lang="zh", operator="tn", remove_erhua=True)
+        self.en_tn_model = Normalizer(lang="en", operator="tn")
+        self.inflect_parser = inflect.engine()
+    def normalize(self, text, split=False):
+        """Clean ``text`` and run the language-specific normalization.
+        Args:
+            text: Raw input text.
+            split: Only ``False`` produces a result.
+        Returns:
+            The normalized text when ``split`` is ``False``.
+            NOTE(review): for any other value no return statement is reached,
+            so ``None`` is returned — confirm whether splitting was meant to be implemented.
+        """
+        # The language is decided on the raw input, before Markdown, emoji and line breaks are removed.
+        lang = "zh" if contains_chinese(text) else "en"
+        text = clean_text(text)
+        if lang == "zh":
+            text = text.replace("=", "等于") # Fixes "550 + 320 等于 870 千卡。" being wrongly normalized to "五百五十加三百二十等于八七十千卡."
+            if re.search(r'([\d$%^*_+≥≤≠×÷?=])', text): # Avoid English hyphens being wrongly normalized to "minus"
+                text = re.sub(r'(?<=[a-zA-Z0-9])-(?=\d)', ' - ', text) # Fixes "x-2" being normalized to "x negative 2"
+            text = self.zh_tn_model.normalize(text)
+            text = replace_blank(text)
+            text = replace_corner_mark(text)
+            text = remove_bracket(text)
+        else:
+            text = self.en_tn_model.normalize(text)
+            text = spell_out_number(text, self.inflect_parser)
+        if split is False:
+            return text

convert/src/voxcpm/zipenhancer.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""
+ZipEnhancer Module - Audio Denoising Enhancer
+Wraps the ModelScope ZipEnhancer pipeline for audio denoising.
+Importing this module loads the denoising dependencies (modelscope, torchaudio), so import it only when denoising is needed.
+"""
+import os
+import tempfile
+from typing import Optional, Union
+import torchaudio
+import torch
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+class ZipEnhancer:
+    """ZipEnhancer Audio Denoising Enhancer"""
+    def __init__(self, model_path: str = "iic/speech_zipenhancer_ans_multiloss_16k_base"):
+        """
+        Initialize ZipEnhancer
+        Args:
+            model_path: ModelScope model path or local path
+        """
+        self.model_path = model_path
+        self._pipeline = pipeline(
+                Tasks.acoustic_noise_suppression,
+                model=self.model_path
+            )
+    def _normalize_loudness(self, wav_path: str):
+        """
+        Audio loudness normalization
+        Args:
+            wav_path: Audio file path
+        """
+        audio, sr = torchaudio.load(wav_path)
+        loudness = torchaudio.functional.loudness(audio, sr)
+        normalized_audio = torchaudio.functional.gain(audio, -20-loudness)
+        torchaudio.save(wav_path, normalized_audio, sr)
+    def enhance(self, input_path: str, output_path: Optional[str] = None,
+                normalize_loudness: bool = True) -> str:
+        """
+        Audio denoising enhancement
+        Args:
+            input_path: Input audio file path
+            output_path: Output audio file path (optional, creates temp file by default)
+            normalize_loudness: Whether to perform loudness normalization
+        Returns:
+            str: Output audio file path
+        Raises:
+            RuntimeError: If pipeline is not initialized or processing fails
+        """
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(f"Input audio file does not exist: {input_path}")
+        # Create temporary file if no output path is specified
+        if output_path is None:
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
+                output_path = tmp_file.name
+        try:
+            # Perform denoising processing
+            self._pipeline(input_path, output_path=output_path)
+            # Loudness normalization
+            if normalize_loudness:
+                self._normalize_loudness(output_path)
+            return output_path
+        except Exception as e:
+            # Clean up possibly created temporary files
+            if output_path and os.path.exists(output_path):
+                try:
+                    os.unlink(output_path)
+                except OSError:
+                    pass
+            raise RuntimeError(f"Audio denoising processing failed: {e}")