diff --git "a/modeling_minicpmo.py" "b/modeling_minicpmo.py"
new file mode 100644
--- /dev/null
+++ "b/modeling_minicpmo.py"
@@ -0,0 +1,5072 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2026 The OpenBMB Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import math
+import os
+import tempfile
+import threading
+import time
+import types
+from copy import deepcopy
+from dataclasses import dataclass
+from functools import partial
+from threading import Thread
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.nn.utils.parametrize as P
+from torch import nn
+from torch.nn.init import trunc_normal_
+from torch.nn.utils.parametrizations import weight_norm
+from tqdm import tqdm
+
+# Optional kernel acceleration: when USE_FLAGOS=1, swap the transformers RMSNorm
+# classes for a FlagGems fused kernel BEFORE any model classes are instantiated.
+if os.getenv("USE_FLAGOS") == "1":
+    import importlib
+
+    flag_gems = importlib.import_module("flag_gems")  # noqa: F401
+    flag_gems_experimental = importlib.import_module("flag_gems.experimental_ops")
+    gems_rmsnorm = flag_gems_experimental.rmsnorm
+
+    class GemsRMSNorm(nn.Module):
+        """Drop-in replacement for Qwen3RMSNorm/LlamaRMSNorm backed by the FlagGems fused rmsnorm kernel."""
+
+        def __init__(self, hidden_size, eps=1e-6):
+            super().__init__()
+            # Same learnable scale + epsilon layout as the HF RMSNorm it replaces.
+            self.weight = nn.Parameter(torch.ones(hidden_size))
+            self.variance_epsilon = eps
+
+        def forward(self, hidden_states):
+            return gems_rmsnorm(hidden_states, self.weight, self.variance_epsilon)
+
+        def extra_repr(self):
+            return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+    from transformers.models.llama import modeling_llama
+    from transformers.models.qwen3 import modeling_qwen3
+
+    # Monkeypatch module-wide so every layer constructed afterwards uses the fused kernel.
+    modeling_qwen3.Qwen3RMSNorm = GemsRMSNorm
+    modeling_llama.LlamaRMSNorm = GemsRMSNorm
+
+from transformers import LlamaConfig
+from transformers import LlamaModel
+from transformers import PreTrainedModel
+from transformers import Qwen3ForCausalLM
+from transformers import Qwen3PreTrainedModel
+from transformers import TextIteratorStreamer
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache
+from transformers.cache_utils import DynamicCache
+from transformers.cache_utils import EncoderDecoderCache
+from transformers.cache_utils import StaticCache
+from transformers.generation.logits_process import TopKLogitsWarper
+from transformers.generation.logits_process import TopPLogitsWarper
+from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.modeling_outputs import ModelOutput
+from transformers.models.whisper.configuration_whisper import WhisperConfig
+from transformers.models.whisper.modeling_whisper import WhisperEncoder
+
+from .configuration_minicpmo import MiniCPMOConfig
+from .configuration_minicpmo import MiniCPMTTSConfig
+from .modeling_navit_siglip import SiglipVisionTransformer
+from .processing_minicpmo import MiniCPMOProcessor
+from .utils import as_dynamic_cache
+from .utils import ChunkPrefillChunkGenerate
+from .utils import drop_tokens_from_cache
+from .utils import DuplexWindowConfig
+from .utils import get_kv_cache_length
+from .utils import normalize_content
+from .utils import realign_rotary_suffix
+from .utils import SpeculativeSnapshot
+from .utils import streaming_token_decoder
+from .utils import StreamingWindowConfig
+from .utils import torch_clone_recursive
+from .utils import TTSSamplingParams
+from .utils import TTSStreamingGenerator
+
+logger = logging.getLogger(__name__)
+
+
+class MiniCPMOPreTrainedModel(Qwen3PreTrainedModel):
+    """Base class binding the MiniCPM-O config to HF's PreTrainedModel machinery."""
+
+    config_class = MiniCPMOConfig
+
+class MiniCPMO(MiniCPMOPreTrainedModel):
+    """Omni-modal model: vision/audio/text input, text (and optionally speech) output, around a Qwen3 LLM."""
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.llm = Qwen3ForCausalLM(config)
+        self.embed_dim = self.llm.config.hidden_size
+        # Override generation input prep; `prepare_inputs_for_generation` is presumably
+        # a module-level function defined elsewhere in this file — TODO confirm.
+        self.llm.prepare_inputs_for_generation = types.MethodType(prepare_inputs_for_generation, self.llm)  # patch llm
+
+        # init vision module (SigLIP tower + resampler down to LLM width)
+        if self.config.init_vision:
+            self.vpm = self.init_vision_module()
+            self.vision_dim = self.vpm.embed_dim
+            self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
+
+        # init audio module (Whisper encoder + pooling + projection)
+        if self.config.init_audio:
+            self.apm = self.init_audio_module()
+            audio_output_dim = int(self.apm.config.encoder_ffn_dim // 4)
+            self.audio_avg_pooler = nn.AvgPool1d(self.config.audio_pool_step, stride=self.config.audio_pool_step)
+            self.audio_projection_layer = MultiModalProjector(in_dim=audio_output_dim, out_dim=self.embed_dim)
+            # Which entry of encoder hidden_states to use (-1 == last).
+            self.audio_encoder_layer = -1
+
+        # init tts module
+        if self.config.init_tts:
+            self.tts = self.init_tts_module()
+
+        self.terminators = ["<|im_end|>", "<|endoftext|>"]
+
+        # NOTE(review): this is the literal 8-char string "\n\n\n\n" (escaped
+        # backslashes), not real newlines — looks like a think-tag template got
+        # mangled; confirm against the tokenizer's expected format.
+        self.think_str = ""
+        if self.llm.__class__.__name__ == "Qwen3ForCausalLM":
+            self.think_str = "\\n\\n\\n\\n"
+
+        # for streaming: initialize all per-session state
+        self.reset_session(reset_token2wav_cache=True)
+
+        # streaming audio processing constants
+        self.SAMPLE_RATE = 16000
+        self.CHUNK_MS = 1000  # regular chunk length (ms)
+        self.FIRST_CHUNK_MS = 1035  # first chunk length (ms)
+        self.CNN_REDUNDANCY_MS = 0  # CNN redundancy (ms)
+
+        # for sliding window
+        self.streaming_window_config = StreamingWindowConfig()
+        self.streaming_require_system_prompt = True
+        self.streaming_window_enabled = True
+        self.force_rope_reindex = False  # RoPE reindex testing switch
+
+    def init_streaming_processor(self):
+        """Prepare the processor for streaming input and reset its per-session state."""
+        self.prepare_processor(processor=None, tokenizer=None)
+
+        # Older processors may not support streaming; only configure when available.
+        if hasattr(self.processor, "set_streaming_mode"):
+            self.processor.set_streaming_mode(
+                mode="exact",
+                chunk_ms=self.CHUNK_MS,
+                first_chunk_ms=self.FIRST_CHUNK_MS,
+                cnn_redundancy_ms=self.CNN_REDUNDANCY_MS,
+                enable_sliding_window=True,
+                slide_trigger_seconds=30.0,
+                slide_stride_seconds=10.0,
+            )
+        self.processor.reset_streaming()
+        # Counter of streamed audio chunks consumed so far.
+        self.audio_chunk_idx = 0
+
+    def reset_session(self, reset_token2wav_cache=True):
+        """Clear all per-conversation streaming state (KV caches, flags, histories).
+
+        Args:
+            reset_token2wav_cache: also drop the token2wav vocoder cache; pass
+                False to keep it warm across sessions.
+        """
+        self.llm_past_key_values = None
+        self.audio_past_key_values = None
+        self.tts_last_turn_tokens = None
+        self.llm_generated = False  # last turn generated by llm or not
+        self.llm_generate_completed = False
+        self.new_user_msg = True
+
+        self.session_id = None
+
+        if reset_token2wav_cache:
+            self.token2wav_cache = None
+
+        # for sliding window
+        self.streaming_text_preserve = 0
+        self.streaming_position_offset = 0
+
+        # Memoized RoPE inverse-frequency tensors keyed by (dim, device).
+        self._rope_inv_freq_cache: Dict[Tuple[int, torch.device], torch.Tensor] = {}
+
+        self._next_round_id = 0
+        self._pending_round_id = None
+
+        self._omni_chunk_history: List[Dict[str, Union[str, int]]] = []
+        self._round_history: List[Dict[str, Union[int, str, torch.Tensor, Optional[int]]]] = []
+
+ def init_vision_module(self):
+ if self.config._attn_implementation == "flash_attention_2":
+ self.config.vision_config._attn_implementation = "flash_attention_2"
+ else:
+ self.config.vision_config._attn_implementation = "eager"
+ model = SiglipVisionTransformer(self.config.vision_config)
+ if self.config.drop_vision_last_layer:
+ model.encoder.layers = model.encoder.layers[:-1]
+
+ setattr(model, "embed_dim", model.embeddings.embed_dim)
+ setattr(model, "patch_size", model.embeddings.patch_size)
+
+ return model
+
+ def init_resampler(self, embed_dim, vision_dim):
+ return Resampler(
+ num_queries=self.config.query_num,
+ embed_dim=embed_dim,
+ num_heads=embed_dim // 128,
+ kv_dim=vision_dim,
+ adaptive=True,
+ )
+
+ def init_audio_module(self):
+ if self.config._attn_implementation == "eager":
+ self.config.audio_config._attn_implementation = "eager"
+ else:
+ # using flash_attention_2 will cause: RuntimeError: cu_seqlens_q must have shape (batch_size + 1)
+ self.config.audio_config._attn_implementation = "sdpa"
+
+ return MiniCPMWhisperEncoder(self.config.audio_config)
+
+ def init_tts_module(self):
+ if self.config._attn_implementation == "flash_attention_2":
+ self.config.tts_config.attn_implementation = "flash_attention_2"
+ else:
+ self.config.tts_config.attn_implementation = "eager"
+
+ return MiniCPMTTS(config=self.config.tts_config, audio_tokenizer=None)
+
+    def _ensure_asset_dir(self, asset_subpath: str, model_dir: Optional[str] = None) -> str:
+        """Ensure asset directory exists, downloading from HF if needed.
+
+        Args:
+            asset_subpath: Asset folder path relative to the model repo root.
+            model_dir: Explicit local directory; defaults to `<name_or_path>/<asset_subpath>`.
+
+        Returns:
+            Local path to the asset directory (asserted to exist).
+        """
+        model_dir = model_dir or os.path.join(self.config._name_or_path, asset_subpath)
+        if not os.path.exists(model_dir):
+            from huggingface_hub import snapshot_download
+
+            # Fetch only the needed subtree from the hub repo.
+            repo_dir = snapshot_download(
+                repo_id="openbmb/MiniCPM-o-4_5",
+                allow_patterns=[f"{asset_subpath}/**"],
+            )
+            model_dir = os.path.join(repo_dir, asset_subpath)
+        assert os.path.exists(model_dir), f"Asset directory not found: {model_dir}"
+        return model_dir
+
+    def init_tts(self, model_dir=None, enable_float16=False, n_timesteps=10, **kwargs):
+        """Attach the Token2wav vocoder to the TTS head, downloading assets if needed.
+
+        Args:
+            model_dir: Local token2wav asset dir; auto-resolved/downloaded when None.
+            enable_float16: Run the vocoder in fp16.
+            n_timesteps: Diffusion/flow steps for the vocoder.
+
+        Returns:
+            The constructed Token2wav audio tokenizer (also stored on `self.tts`).
+        """
+        # Force the expected tokenizer type; warn if the config disagreed.
+        if self.config.tts_config.audio_tokenizer_type != "s3tokenizer_step_audio":
+            logger.warning("audio tokenizer type is set to s3tokenizer_step_audio")
+            self.tts.config.audio_tokenizer_type = "s3tokenizer_step_audio"
+
+        try:
+            from stepaudio2 import Token2wav
+        except ImportError:
+            raise ImportError("Please install Token2wav via: pip install minicpmo-utils[all]")
+
+        model_dir = self._ensure_asset_dir("assets/token2wav", model_dir)
+        self.tts.audio_tokenizer = Token2wav(model_dir, float16=enable_float16, n_timesteps=n_timesteps)
+        return self.tts.audio_tokenizer
+
+ def get_input_embeddings(self):
+ return self.llm.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.llm.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.llm.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.llm.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.llm = decoder
+
+ def get_decoder(self):
+ return self.llm
+
+ @staticmethod
+ def get_sys_prompt(ref_audio=None, mode="default", language="en", ref_audio_max_ms=None):
+ if ref_audio is not None:
+ if isinstance(ref_audio, str):
+ import os
+
+ import librosa
+
+ if os.path.isfile(ref_audio):
+ duration = ref_audio_max_ms / 1000.0 if ref_audio_max_ms else None
+ ref_audio, _ = librosa.load(ref_audio, sr=16000, mono=True, duration=duration)
+ else:
+ logger.error(f"Could not find {ref_audio}")
+ ref_audio = None
+
+ assert isinstance(ref_audio, np.ndarray), "ref_audio error"
+
+ if mode == "omni":
+ if language == "zh":
+ sys_prompt = ""
+ vc_prompt_prefix = "模仿音频样本的音色并生成新的内容。"
+ vc_prompt_suffix = (
+ "请用这种声音风格来为用户提供帮助。 请认真、高质量地回复用户的问题。 请用高自然度的方式和用户聊天。"
+ )
+ else:
+ sys_prompt = ""
+ vc_prompt_prefix = sys_prompt + "Clone the voice in the provided audio prompt."
+ vc_prompt_suffix = "As an assistant, you will speak using this voice style."
+
+ if ref_audio is not None:
+ sys_msgs = {"role": "system", "content": [vc_prompt_prefix, ref_audio, vc_prompt_suffix]}
+ else:
+ sys_msgs = {"role": "system", "content": [sys_prompt]}
+
+ return sys_msgs
+ elif mode == "audio_assistant":
+ if language == "zh":
+ vc_prompt_prefix = "模仿音频样本的音色并生成新的内容。"
+ vc_prompt_suffix = "你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。"
+ else:
+ vc_prompt_prefix = "Clone the voice in the provided audio prompt."
+ vc_prompt_suffix = "Please assist users while maintaining this voice style. Please answer the user's questions seriously and in a high quality. Please chat with the user in a highly human-like and oral style. You are a helpful assistant developed by ModelBest: MiniCPM-Omni."
+
+ if ref_audio is not None:
+ sys_msgs = {"role": "system", "content": [vc_prompt_prefix, ref_audio, vc_prompt_suffix]}
+ else:
+ logger.warning(
+ "Warning: ref_audio is None, speech generation will be performed based on the default voice."
+ )
+ sys_msgs = {"role": "system", "content": ["Use the voice.", vc_prompt_suffix]}
+
+ return sys_msgs
+ elif mode == "audio_roleplay":
+ if language == "zh":
+ vc_prompt_prefix = "模仿输入音频中的声音特征。"
+ vc_prompt_suffix = "假装你是上述音频中的人物,与我进行对话。"
+ else:
+ vc_prompt_prefix = "Clone the voice in the provided audio prompt."
+ vc_prompt_suffix = "Try to role-play the character based on the audio prompt above."
+
+ if ref_audio is not None:
+ sys_msgs = {"role": "system", "content": [vc_prompt_prefix, ref_audio, vc_prompt_suffix]}
+ else:
+ sys_msgs = {"role": "system", "content": ["Use the voice.", vc_prompt_suffix]}
+
+ return sys_msgs
+ elif mode == "voice_cloning":
+ if language == "zh":
+ vc_prompt_prefix = "模仿输入音频中的声音特征。"
+ else:
+ vc_prompt_prefix = "Clone the voice in the provided audio prompt."
+
+ if ref_audio is not None:
+ sys_msgs = {"role": "system", "content": [vc_prompt_prefix, ref_audio]}
+ else:
+ raise ValueError("ref_audio con't be None in voice_cloning mode.")
+
+ return sys_msgs
+ else:
+ sys_prompt = "You are a helpful assistant. You can accept audio and text input and output voice and text."
+ sys_msgs = {"role": "system", "content": [sys_prompt]}
+
+ return sys_msgs
+
+ @staticmethod
+ def subsequent_chunk_mask(
+ size: int,
+ chunk_size: int,
+ num_left_chunks: int = -1,
+ device: torch.device = torch.device("cpu"),
+ num_lookhead: int = 0,
+ ) -> torch.Tensor:
+ """Create mask for subsequent steps (size, size) with chunk size,
+ this is for streaming encoder
+
+ Args:
+ size (int): size of mask
+ chunk_size (int): size of chunk
+ num_left_chunks (int): number of left chunks
+ <0: use full chunk
+ >=0: use num_left_chunks
+ device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+ num_lookhead:
+
+ Returns:
+ torch.Tensor: mask
+ """
+ ret = torch.zeros(size, size, device=device, dtype=torch.bool)
+ for i in range(size):
+ if num_left_chunks < 0:
+ start = 0
+ else:
+ start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
+ ending = min((i // chunk_size + 1) * chunk_size + num_lookhead, size)
+ ret[i, start:ending] = True
+ return ret
+
+ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+ """Computes the output length of the convolutional layers and the output length of the audio encoder"""
+ input_lengths_after_cnn = (input_lengths - 1) // 2 + 1
+ input_lengths_after_pooling = (
+ input_lengths_after_cnn - self.config.audio_pool_step
+ ) // self.config.audio_pool_step + 1
+ input_lengths_after_pooling = input_lengths_after_pooling.to(dtype=torch.int32)
+
+ return input_lengths_after_cnn, input_lengths_after_pooling
+
+    def get_vision_embedding(self, data):
+        """Encode all images in `data` through the vision tower + resampler.
+
+        Args:
+            data: dict with "pixel_values" (per-sample lists of image tensors),
+                "tgt_sizes" (per-image patch grids), and optionally a precomputed
+                "vision_hidden_states" which short-circuits the whole pass.
+
+        Returns:
+            Per-sample list of vision embeddings (empty list entries for samples
+            without images).
+        """
+        if "vision_hidden_states" not in data:
+            dtype = self.llm.model.embed_tokens.weight.dtype
+            device = self.llm.model.embed_tokens.weight.device
+            tgt_sizes = data["tgt_sizes"]
+            pixel_values_list = data["pixel_values"]
+            vision_hidden_states = []
+            all_pixel_values = []
+            # NOTE(review): this per-sample image-count list is never read; the
+            # name is later reused as an int in the slicing loop below.
+            img_cnt = []
+            for pixel_values in pixel_values_list:
+                img_cnt.append(len(pixel_values))
+                # Flatten each image to (patches, channels*patch_area) rows.
+                all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values])
+
+            # exist image
+            if all_pixel_values:
+                tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
+                tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)
+
+                max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])
+
+                # Pad all images to the same patch count so they batch together.
+                all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+                    all_pixel_values, batch_first=True, padding_value=0.0
+                )
+                B, L, _ = all_pixel_values.shape
+                all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+
+                # Mask out padded patches per image.
+                patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool, device=device)
+                for i in range(B):
+                    patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
+
+                vision_batch_size = self.config.vision_batch_size
+                all_pixel_values = all_pixel_values.type(dtype)
+                # Chunk through the vision tower to bound peak memory.
+                if B > vision_batch_size:
+                    hs = []
+                    for i in range(0, B, vision_batch_size):
+                        start_idx = i
+                        end_idx = i + vision_batch_size
+                        tmp_hs = self.vpm(
+                            all_pixel_values[start_idx:end_idx],
+                            patch_attention_mask=patch_attn_mask[start_idx:end_idx],
+                            tgt_sizes=tgt_sizes[start_idx:end_idx],
+                        ).last_hidden_state
+                        hs.append(tmp_hs)
+                    vision_embedding = torch.cat(hs, dim=0)
+                else:
+                    vision_embedding = self.vpm(
+                        all_pixel_values,
+                        patch_attention_mask=patch_attn_mask,
+                        tgt_sizes=tgt_sizes,
+                    ).last_hidden_state
+                vision_embedding = self.resampler(vision_embedding, tgt_sizes)
+
+                # Split the flat embedding batch back into per-sample groups.
+                start = 0
+                for pixel_values in pixel_values_list:
+                    img_cnt = len(pixel_values)
+                    if img_cnt > 0:
+                        vision_hidden_states.append(vision_embedding[start : start + img_cnt])
+                        start += img_cnt
+                    else:
+                        vision_hidden_states.append([])
+            else:  # no image
+                if self.training:
+                    # Dummy forward keeps vision params in the autograd graph
+                    # (needed e.g. for DDP/ZeRO when a batch has no images).
+                    dummy_image = torch.zeros((1, 3, 224, 224), device=device, dtype=dtype)
+                    tgt_sizes = torch.Tensor(
+                        [
+                            [
+                                (224 // self.config.patch_size),
+                                math.ceil(224 / self.config.patch_size),
+                            ]
+                        ]
+                    ).type(torch.int32)
+                    dummy_feature = self.resampler(self.vpm(dummy_image).last_hidden_state, tgt_sizes)
+                else:
+                    dummy_feature = []
+                for _ in range(len(pixel_values_list)):
+                    vision_hidden_states.append(dummy_feature)
+        else:
+            vision_hidden_states = data["vision_hidden_states"]
+
+        return vision_hidden_states
+
+    def get_vllm_embedding(self, data):
+        """Compute token embeddings and splice vision features into image slots.
+
+        Args:
+            data: dict with "input_ids", "image_bound" (per-sample [start, end)
+                index ranges for image placeholder tokens), and the vision inputs
+                consumed by `get_vision_embedding`.
+
+        Returns:
+            (vllm_embedding, vision_hidden_states): fused embeddings plus the raw
+            per-sample vision features.
+        """
+        vision_hidden_states = self.get_vision_embedding(data)
+
+        # Some LLM configs scale token embeddings by `scale_emb`.
+        if hasattr(self.llm.config, "scale_emb"):
+            vllm_embedding = self.llm.model.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb
+        else:
+            vllm_embedding = self.llm.model.embed_tokens(data["input_ids"])
+
+        vision_hidden_states = [
+            i.type(vllm_embedding.dtype) if isinstance(i, torch.Tensor) else i for i in vision_hidden_states
+        ]
+
+        bs = len(data["input_ids"])
+        for i in range(bs):
+            cur_vs_hs = vision_hidden_states[i]
+            if len(cur_vs_hs) > 0:
+                cur_vllm_emb = vllm_embedding[i]
+                cur_image_bound = data["image_bound"][i]
+                if len(cur_image_bound) > 0:
+                    # Expand each [start, end) bound into explicit indices, then
+                    # overwrite those embedding rows with the vision features.
+                    image_indices = torch.stack(
+                        [torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]
+                    ).to(vllm_embedding.device)
+
+                    cur_vllm_emb.scatter_(
+                        0,
+                        image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]),
+                        cur_vs_hs.view(-1, cur_vs_hs.shape[-1]),
+                    )
+                elif self.training:
+                    # Zero-contribution term keeps vision grads alive in training.
+                    cur_vllm_emb += cur_vs_hs[0].mean() * 0
+
+        return vllm_embedding, vision_hidden_states
+
+ def get_audio_embedding_streaming(
+ self,
+ data,
+ use_extra_context=False,
+ prefix_extra_frames=1,
+ suffix_extra_frames=1,
+ cnn_min_length=None,
+ ):
+ """Extract audio embeddings in a streaming manner using cached key-value pairs.
+
+ This method processes incoming audio features incrementally and stores/updates `past_key_values`
+ for faster inference on subsequent audio frames. It only supports batch_size=1 and is intended
+ for streaming scenarios.
+
+ Args:
+ data (dict):
+ - **"audio_features"** (`torch.FloatTensor`): Input mel-spectrograms of shape `(batch_size, 80, frames)`.
+ - **"audio_feature_lens"** (List[List[int]]): Lengths of each audio segment for each item in the batch.
+ use_extra_context (bool): If True, assumes input contains extra frames for CNN context.
+ prefix_extra_frames (int): Number of prefix extra frames.
+ suffix_extra_frames (int): Number of suffix extra frames.
+ cnn_min_length (int): Minimum length for CNN input padding.
+
+ Returns:
+ List[List[torch.Tensor]]: audio embeddings
+ """
+ wavforms = data.get("audio_features", []) # (bs, 80, frames) or [], multi audios need filled in advance
+ audio_feature_lens_raw = data.get("audio_feature_lens", []) # list, [[x1, x2], [y1], [z1]]
+
+ # exist audio
+ if len(wavforms) > 0:
+ audio_feature_lens = torch.hstack(audio_feature_lens_raw)
+ batch_size, _, max_mel_seq_len = wavforms.shape
+ assert batch_size == 1
+ max_seq_len = (max_mel_seq_len - 1) // 2 + 1
+
+ # whisper's past_key_values management (core)
+ if self.audio_past_key_values is not None:
+ cache_length = self.audio_past_key_values[0][0].shape[2]
+ apm_max_len = self.apm.embed_positions.weight.shape[0]
+ if cache_length + max_seq_len >= apm_max_len:
+ logger.warning(
+ f"audio_past_key_values length {cache_length + max_seq_len} exceed {apm_max_len}, reset."
+ )
+ self.audio_past_key_values = None
+
+ # build attention mask (bidirectional attention, same as offline mode)
+ batch_size, _, max_mel_seq_len = wavforms.shape
+ current_seq_len = (max_mel_seq_len - 1) // 2 + 1
+ # if use extra context, need to adjust sequence length
+ if use_extra_context:
+ # calculate actual sequence length after removing redundancy
+ # conv2's stride=2, so the mapping from mel frames to output frames is ceil(x/2)
+ prefix_to_remove = (prefix_extra_frames + 1) // 2 if prefix_extra_frames > 0 else 0
+ suffix_to_remove = (suffix_extra_frames + 1) // 2 if suffix_extra_frames > 0 else 0
+ current_seq_len = current_seq_len - prefix_to_remove - suffix_to_remove
+ # calculate history length (if there is KV cache)
+ if self.audio_past_key_values is not None:
+ past_len = self.audio_past_key_values[0][0].shape[2] # get history sequence length
+ total_seq_len = past_len + current_seq_len
+ else:
+ past_len = 0
+ total_seq_len = current_seq_len
+ # create bidirectional attention mask (full attention)
+ audio_attention_mask = torch.zeros(
+ (batch_size, 1, current_seq_len, total_seq_len),
+ dtype=self.apm.conv1.weight.dtype,
+ device=wavforms.device,
+ )
+
+ # Step 1: APM processing
+ audio_outputs = self.apm(
+ wavforms,
+ past_key_values=self.audio_past_key_values,
+ use_cache=True,
+ output_hidden_states=True,
+ attention_mask=audio_attention_mask,
+ use_extra_context=use_extra_context,
+ prefix_extra_frames=prefix_extra_frames,
+ suffix_extra_frames=suffix_extra_frames,
+ cnn_min_length=cnn_min_length,
+ )
+
+ if hasattr(self, "audio_encoder_layer"):
+ audio_states = audio_outputs.hidden_states[self.audio_encoder_layer]
+ else:
+ audio_states = audio_outputs.last_hidden_state
+
+ self.audio_past_key_values = audio_outputs.past_key_values
+
+ # Step 2: Projection
+ audio_embeds = self.audio_projection_layer(audio_states)
+
+ # Step 3: Pooling
+ audio_embeds = audio_embeds.transpose(1, 2)
+ audio_embeds = self.audio_avg_pooler(audio_embeds)
+ audio_embeds = audio_embeds.transpose(1, 2)
+
+ _, feature_lens_after_pooling = self._get_feat_extract_output_lengths(audio_feature_lens)
+
+ num_audio_tokens = feature_lens_after_pooling
+
+ final_audio_embeds = []
+ idx = 0
+ for i in range(len(audio_feature_lens_raw)):
+ target_audio_embeds = []
+ for _ in range(len(audio_feature_lens_raw[i])):
+ target_audio_embeds.append(audio_embeds[idx, : num_audio_tokens[idx], :])
+ idx += 1
+ final_audio_embeds.append(target_audio_embeds)
+
+ return final_audio_embeds
+ else:
+ return final_audio_embeds
+ else:
+ return []
+
+    def get_audio_embedding(self, data, chunk_length=-1, dummy=True):
+        """Offline (non-streaming) audio feature extraction.
+
+        Args:
+            data: dict with "audio_features" (bs, 80, frames) and
+                "audio_feature_lens" (nested per-sample length lists).
+            chunk_length: >0 enables chunked (streaming-style) attention inside
+                the Whisper encoder; -1 uses full attention.
+            dummy: in training, run a dummy forward when there is no audio so
+                audio params stay in the autograd graph.
+
+        Returns:
+            List[List[torch.Tensor]]: per-sample lists of audio embeddings.
+        """
+        wavforms = data.get("audio_features", [])  # (bs, 80, frames) or [], multi audios need filled in advance
+        audio_feature_lens_raw = data.get("audio_feature_lens", [])  # list, [[x1, x2], [y1], [z1]]
+
+        if len(wavforms) > 0:
+            audio_feature_lens = torch.hstack(audio_feature_lens_raw)
+            batch_size, _, max_mel_seq_len = wavforms.shape
+            max_seq_len = (max_mel_seq_len - 1) // 2 + 1
+
+            # Create a sequence tensor of shape (batch_size, max_seq_len)
+            seq_range = (
+                torch.arange(
+                    0,
+                    max_seq_len,
+                    dtype=audio_feature_lens.dtype,
+                    device=audio_feature_lens.device,
+                )
+                .unsqueeze(0)
+                .expand(batch_size, max_seq_len)
+            )
+            lengths_expand = audio_feature_lens.unsqueeze(1).expand(batch_size, max_seq_len)
+            # Create mask
+            padding_mask = seq_range >= lengths_expand  # 1 for padded values
+
+            audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
+                batch_size, 1, max_seq_len, max_seq_len
+            )
+            audio_attention_mask = audio_attention_mask_.to(
+                dtype=self.apm.conv1.weight.dtype, device=self.apm.conv1.weight.device
+            )
+
+            if chunk_length > 0:
+                # 50 encoder frames per second of audio (Whisper conv stride).
+                chunk_num_frame = int(chunk_length * 50)
+                chunk_mask = self.subsequent_chunk_mask(
+                    size=max_seq_len,
+                    chunk_size=chunk_num_frame,
+                    num_left_chunks=-1,
+                    device=audio_attention_mask_.device,
+                )
+                # Block attention wherever padding OR the chunk mask forbids it.
+                audio_attention_mask_ = torch.logical_or(audio_attention_mask_, torch.logical_not(chunk_mask))
+
+            # Convert boolean "blocked" positions to additive -inf bias.
+            audio_attention_mask[audio_attention_mask_] = float("-inf")
+            audio_states = self.apm(
+                wavforms, output_hidden_states=True, attention_mask=audio_attention_mask
+            ).hidden_states[self.audio_encoder_layer]
+            audio_embeds = self.audio_projection_layer(audio_states)
+
+            # Pool along time: (B, T, C) -> (B, C, T) -> pool -> (B, T', C).
+            audio_embeds = audio_embeds.transpose(1, 2)
+            audio_embeds = self.audio_avg_pooler(audio_embeds)
+            audio_embeds = audio_embeds.transpose(1, 2)
+
+            _, feature_lens_after_pooling = self._get_feat_extract_output_lengths(audio_feature_lens)
+
+            num_audio_tokens = feature_lens_after_pooling
+
+            # Regroup the flat batch according to the nested lens layout.
+            final_audio_embeds = []
+            idx = 0
+            for i in range(len(audio_feature_lens_raw)):
+                target_audio_embeds = []
+                for _ in range(len(audio_feature_lens_raw[i])):
+                    target_audio_embeds.append(audio_embeds[idx, : num_audio_tokens[idx], :])
+                    idx += 1
+                final_audio_embeds.append(target_audio_embeds)
+            return final_audio_embeds
+        elif self.training and dummy:
+            dtype = self.apm.embed_positions.weight.dtype
+            device = self.apm.embed_positions.weight.device
+
+            # Dummy forward keeps audio params in the autograd graph.
+            dummy_wavs = torch.zeros((1, 80, 100), device=device, dtype=dtype)
+            audio_states = self.apm(dummy_wavs, output_hidden_states=True).hidden_states[self.audio_encoder_layer]
+
+            audio_embeds = self.audio_projection_layer(audio_states)
+
+            audio_embeds = audio_embeds.transpose(1, 2)
+            audio_embeds = self.audio_avg_pooler(audio_embeds)
+            audio_embeds = audio_embeds.transpose(1, 2)
+            return [audio_embeds]
+        else:
+            return []
+
+    def get_omni_embedding(self, data, input_embeddings, chunk_length=-1, stream_input=False):
+        """Splice audio embeddings into the token embeddings at the audio bounds.
+
+        Args:
+            data: dict with audio inputs and "audio_bounds" (per-sample [start, end)
+                ranges of audio placeholder tokens).
+            input_embeddings: token embeddings to modify in place.
+            chunk_length: whisper use full attention or chunk attention
+            stream_input: use streaming audio embedding or not
+
+        Returns:
+            final embeddings with audio feature
+        """
+        if stream_input:
+            audio_embeddings = self.get_audio_embedding_streaming(data)
+        else:
+            audio_embeddings = self.get_audio_embedding(data, chunk_length)
+
+        bs = len(input_embeddings)
+        if len(data.get("audio_features", [])) > 0:
+            assert len(audio_embeddings) == len(input_embeddings)
+
+        if len(audio_embeddings) > 0:
+            audio_bounds = data["audio_bounds"]
+
+            if self.config.stream_input:
+                assert bs == 1, "audio stream_input mode only support batch size 1"
+                for i in range(bs):
+                    # Concatenate all segments, then distribute them across the
+                    # bounds sequentially.
+                    audio_embs = torch.cat(audio_embeddings[i], dim=0).to(
+                        device=input_embeddings.device, dtype=input_embeddings.dtype
+                    )
+                    audio_start_pos = 0
+                    for bound in audio_bounds[i]:
+                        audio_len = bound[1] - bound[0]
+                        input_embeddings[i, bound[0] : bound[1]] = audio_embs[
+                            audio_start_pos : audio_start_pos + audio_len, :
+                        ]
+                        audio_start_pos += audio_len
+            else:
+                for i in range(bs):
+                    audio_embs = audio_embeddings[i]
+                    bounds = audio_bounds[i]
+                    # One embedding tensor per bound; sizes must match exactly.
+                    for embs, bound in zip(audio_embs, bounds):
+                        audio_indices = torch.arange(bound[0], bound[1], dtype=torch.long).to(
+                            input_embeddings.device
+                        )
+
+                        if embs.shape[0] != len(audio_indices):
+                            raise ValueError(
+                                f"Shape mismatch: Trying to assign embeddings of shape {embs.shape} "
+                                f"to input indices of length {len(audio_indices)}"
+                            )
+                        input_embeddings[i, audio_indices] = embs.to(input_embeddings.dtype)
+        elif self.training:
+            for i in range(bs):
+                # dummy audio_embedings — zero-contribution term keeps audio grads
+                # alive. NOTE(review): indexes audio_embeddings[0] and adds to the
+                # whole tensor on every loop iteration; looks redundant — confirm.
+                input_embeddings += audio_embeddings[0].mean() * 0
+
+        return input_embeddings
+
+ def forward(self, data, **kwargs):
+ vllm_embedding, vision_hidden_states = self.get_vllm_embedding(data)
+ vllm_embedding = self.get_omni_embedding(
+ data,
+ input_embeddings=vllm_embedding,
+ chunk_length=self.config.audio_chunk_length,
+ )
+
+ position_ids = data["position_ids"]
+ if position_ids.dtype != torch.int64:
+ position_ids = position_ids.long()
+
+ return self.llm(
+ input_ids=None,
+ position_ids=position_ids,
+ inputs_embeds=vllm_embedding,
+ **kwargs,
+ )
+
+ def _decode(self, inputs_embeds, tokenizer, attention_mask, **kwargs):
+ terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+ outputs = self.llm.generate(
+ inputs_embeds=inputs_embeds,
+ pad_token_id=0,
+ eos_token_id=terminators,
+ attention_mask=attention_mask,
+ output_hidden_states=True,
+ return_dict_in_generate=True,
+ **kwargs,
+ )
+ return outputs
+
+ def _decode_stream(self, inputs_embeds, tokenizer, **kwargs):
+ terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+ streamer = TextIteratorStreamer(tokenizer=tokenizer)
+ generation_config = {
+ "inputs_embeds": inputs_embeds,
+ "pad_token_id": 0,
+ "eos_token_id": terminators,
+ "streamer": streamer,
+ }
+ generation_config.update(kwargs)
+ thread = Thread(target=self.llm.generate, kwargs=generation_config)
+ thread.start()
+ return streamer
+
+ def _decode_text(self, result_ids, tokenizer):
+ terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+ result_text = []
+ for result in result_ids:
+ result = result[result != 0]
+ if result[0] == tokenizer.bos_id:
+ result = result[1:]
+ if result[-1] in terminators:
+ result = result[:-1]
+ result_text.append(tokenizer.decode(result))
+ return result_text
+
+    @torch.inference_mode()
+    def generate(
+        self,
+        input_ids=None,
+        pixel_values=None,
+        tgt_sizes=None,
+        audio_features=None,
+        audio_feature_lens=None,
+        image_bound=None,
+        audio_bounds=None,
+        spk_bounds=None,
+        attention_mask=None,
+        tokenizer=None,
+        vision_hidden_states=None,
+        stream=False,
+        **kwargs,
+    ):
+        """Multimodal generation entry point.
+
+        Fuses vision/audio features into the input embeddings, then either
+        streams tokens (stream=True) or decodes the full output.
+
+        Returns:
+            (result, outputs): decoded text list and the raw generate() output,
+            or (TextIteratorStreamer, {}) when streaming.
+        """
+        assert input_ids is not None
+        # NOTE(review): requires pixel_values to be a list (possibly of empty
+        # lists) aligned with input_ids; a None pixel_values raises here.
+        assert len(input_ids) == len(pixel_values)
+
+        model_inputs = {
+            "input_ids": input_ids,
+            "audio_features": audio_features,
+            "audio_feature_lens": audio_feature_lens,
+            "image_bound": image_bound,
+            "audio_bounds": audio_bounds,
+            "spk_bounds": spk_bounds,
+        }
+
+        # Reuse precomputed vision features when provided; otherwise pass raw pixels.
+        if vision_hidden_states is None:
+            model_inputs["pixel_values"] = pixel_values
+            model_inputs["tgt_sizes"] = tgt_sizes
+        else:
+            model_inputs["vision_hidden_states"] = vision_hidden_states
+
+        with torch.inference_mode():
+            model_inputs["inputs_embeds"], vision_hidden_states = self.get_vllm_embedding(model_inputs)
+            model_inputs["inputs_embeds"] = self.get_omni_embedding(
+                model_inputs,
+                input_embeddings=model_inputs["inputs_embeds"],
+                chunk_length=self.config.audio_chunk_length,
+            )
+
+            if stream:
+                result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs)
+                outputs = {}  # if stream return TextIteratorStreamer and output is empty
+            else:
+                outputs = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, **kwargs)
+                result = self._decode_text(outputs.sequences, tokenizer)
+
+        return result, outputs
+
+ def _build_streaming_mask(self, tts_tokens_len):
+ tts_sequence_full_length = 1 + self.tts.streaming_text_reserved_len + 1
+ streaming_attention_mask = torch.zeros(tts_sequence_full_length, dtype=torch.int8)
+ streaming_attention_mask[0 : 1 + 1 + tts_tokens_len + 1] = 1
+ streaming_attention_mask[-1] = 1
+ return streaming_attention_mask
+
def _generate_mel_spec(self, inputs, outputs, text, output_chunk_size=25, tts_max_new_tokens=2048):
    """Drive the chunked streaming TTS decoder over the generated answer text.

    The text between the last ``<|tts_bos|>`` and the first following
    ``<|tts_eos|>`` is re-tokenized with the TTS text tokenizer and prefilled
    into the TTS model in ``streaming_text_chunk_size`` pieces; after each
    piece the decoder emits up to ``output_chunk_size`` audio tokens. A tail
    loop keeps decoding until the TTS reports EOS or ``tts_max_new_tokens``
    is exceeded.

    Args:
        inputs: LLM inputs, used to extract the last speaker embedding.
        outputs: LLM generation outputs (hidden states), same purpose.
        text: decoded LLM answer containing the TTS span markers.
        output_chunk_size: audio tokens generated per text chunk (25 token/s).
        tts_max_new_tokens: cap on audio tokens produced by the tail loop.

    NOTE(review): nothing is returned and the final ``audio_input_ids`` are
    only kept in locals — presumably ``self.tts.generate`` mutates shared
    state that a caller consumes; confirm against the caller.
    """
    spk_embeds = self._get_last_spk_embeds(inputs, outputs)

    # Keep only the span to be spoken: after <|tts_bos|>, before <|tts_eos|>.
    text = text.split("<|tts_bos|>")[-1]
    gen_text = text.split("<|tts_eos|>")[0]
    tts_text, tts_token_lens = self.prepare_tts_text(gen_text)
    tts_inputs = self.tts_processor.text_tokenizer.encode(tts_text, add_special_tokens=False)
    tts_input_ids = torch.Tensor(tts_inputs).unsqueeze(0).to(self.device, dtype=torch.long)
    streaming_tts_text_mask = self._build_streaming_mask(tts_token_lens).to(device=self.tts.device)

    logits_warpers, logits_processors = gen_logits(
        num_code=626,  # audio codebook size used for sampling constraints
        top_p=self.tts.top_p,
        top_k=self.tts.top_k,
        repetition_penalty=self.tts.repetition_penalty,
    )

    # Fixed-length TTS condition: [bos] + reserved text slots + [eos].
    condition_length = 1 + self.tts.streaming_text_reserved_len + 1

    dtype = self.tts.emb_text.weight.dtype
    emb = torch.zeros(1, condition_length, self.tts.num_vq, dtype=dtype, device=self.tts.device)
    # Pre-allocate per-layer (key, value) KV buffers sized to the full condition
    # minus one, so prefill can fill them incrementally.
    past_key_values = [
        (
            torch.zeros(
                1,
                self.tts.config.num_attention_heads,
                condition_length - 1,
                self.tts.config.hidden_size // self.tts.config.num_attention_heads,
                dtype=emb.dtype,
                device=self.tts.device,
            ),
            torch.zeros(
                1,
                self.tts.config.num_attention_heads,
                condition_length - 1,
                self.tts.config.hidden_size // self.tts.config.num_attention_heads,
                dtype=emb.dtype,
                device=self.tts.device,
            ),
        )
        for _ in range(self.tts.config.num_hidden_layers)
    ]

    # Accumulator for generated audio codes across all chunks.
    audio_input_ids = torch.zeros(
        1,
        condition_length,
        self.tts.num_vq,
        dtype=torch.long,
        device=self.tts.device,
    )

    eos_lab = False
    # emb.shape[1] == condition_length, so this walks the whole text window.
    for chunk_idx in range(math.ceil(emb.shape[1] / self.tts.streaming_text_chunk_size)):
        if chunk_idx == 0:
            # First chunk also covers position 0 (bos), hence one extra token.
            begin = chunk_idx * self.tts.streaming_text_chunk_size + 0
            end = (chunk_idx + 1) * self.tts.streaming_text_chunk_size + 1
        else:
            begin = chunk_idx * self.tts.streaming_text_chunk_size + 1
            # Clip to stay inside the condition window (last slot excluded).
            end = min(
                (chunk_idx + 1) * self.tts.streaming_text_chunk_size + 1,
                condition_length - 1,
            )

        if end - begin > 0:
            text_input_ids = tts_input_ids[:, begin:end]
            position_ids = torch.arange(begin, end, dtype=torch.long, device=self.tts.device).unsqueeze(0)

            if begin == 0:
                # Only the very first prefill carries the speaker embedding.
                past_key_values = self.tts.prefill_text(
                    input_ids=text_input_ids,
                    position_ids=position_ids,
                    past_key_values=past_key_values,
                    lm_spk_emb_last_hidden_states=spk_embeds,
                )
            else:
                past_key_values = self.tts.prefill_text(
                    input_ids=text_input_ids,
                    position_ids=position_ids,
                    past_key_values=past_key_values,
                )

            # Decode a bounded burst of audio tokens for this text chunk.
            outputs = self.tts.generate(
                input_ids=audio_input_ids,
                past_key_values=past_key_values,
                streaming_tts_text_mask=streaming_tts_text_mask,
                max_new_token=output_chunk_size,
                force_no_stop=self.force_no_stop,
                temperature=torch.tensor([0.1, 0.3, 0.1, 0.3], dtype=torch.float, device=self.tts.device),
                eos_token=torch.tensor([625], dtype=torch.long, device=self.tts.device),
                logits_warpers=logits_warpers,
                logits_processors=logits_processors,
            )
            audio_input_ids = outputs.audio_input_ids
            past_key_values = outputs.past_key_values

            if outputs.finished:
                eos_lab = True
                break

    if not eos_lab:
        # Tail loop: all text is prefilled but the TTS has not emitted EOS yet.
        while True:
            outputs = self.tts.generate(
                input_ids=audio_input_ids,
                past_key_values=past_key_values,
                streaming_tts_text_mask=streaming_tts_text_mask,
                max_new_token=output_chunk_size,
                force_no_stop=self.force_no_stop,
                temperature=torch.tensor([0.1, 0.3, 0.1, 0.3], dtype=torch.float, device=self.tts.device),
                eos_token=torch.tensor([625], dtype=torch.long, device=self.tts.device),
                logits_warpers=logits_warpers,
                logits_processors=logits_processors,
            )

            audio_input_ids = outputs.audio_input_ids
            past_key_values = outputs.past_key_values

            if outputs.finished:
                break
            # Safety valve against runaway generation.
            if outputs.new_ids.shape[1] > tts_max_new_tokens:
                break
+
+ @staticmethod
+ def prepare_generation_config(do_sample, max_new_tokens=50, min_new_tokens=0, **kwargs):
+ num_beams = kwargs.get("num_beams", 3)
+ generation_config = {
+ "num_beams": num_beams,
+ "top_p": 0.8,
+ "top_k": 100,
+ "temperature": 0.7,
+ "do_sample": True,
+ "repetition_penalty": 1.02,
+ }
+
+ if do_sample:
+ generation_config.update(
+ {
+ "top_p": 0.8,
+ "top_k": 100,
+ "temperature": 0.7,
+ "do_sample": True,
+ "repetition_penalty": 1.02,
+ }
+ )
+ elif num_beams > 1:
+ generation_config.update({"num_beams": num_beams, "repetition_penalty": 1.2, "do_sample": False})
+ else:
+ generation_config.update({"do_sample": False, "repetition_penalty": 1.02})
+
+ generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys())
+ generation_config["min_new_tokens"] = min_new_tokens
+ generation_config["max_new_tokens"] = max_new_tokens
+
+ return generation_config
+
def prepare_processor(self, processor=None, tokenizer=None):
    """Attach or refresh the multimodal processor and optionally its tokenizer.

    An explicitly supplied processor always wins; when none is held yet, the
    default one is loaded from the checkpoint path. A supplied tokenizer is
    installed onto whichever processor ends up active.
    """
    if processor is not None:
        self.processor = processor
    if getattr(self, "processor", None) is None:
        # Lazily load the processor shipped with the checkpoint.
        self.processor = MiniCPMOProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
    if tokenizer is not None:
        self.processor.tokenizer = tokenizer
+
@torch.inference_mode()
def chat(
    self,
    image=None,
    msgs=None,
    vision_hidden_states=None,
    max_new_tokens=4096,
    min_new_tokens=0,
    do_sample=True,
    max_inp_length=8192,
    max_slice_nums=None,
    use_image_id=None,
    enable_thinking=False,
    use_tts_template=False,
    generate_audio=False,
    output_audio_path=None,
    output_tts_inputs_embeds_path=None,
    omni_mode=False,
    teacher_forcing=False,
    return_prompt=False,
    tts_proj_layer=-1,
    tts_sampling_params: TTSSamplingParams = TTSSamplingParams(),
    merge_audio_from_same_content=True,
    stream=False,
    stream_input=False,
    tokenizer=None,
    processor=None,
    **kwargs,
):
    """High-level multimodal chat entry point (text / image / audio in, text and optional speech out).

    Builds prompts from ``msgs`` (single conversation or batch), runs LLM
    generation, and — when ``use_tts_template`` and ``generate_audio`` are set
    with an ``output_audio_path`` — synthesizes speech from the hidden states
    of the TTS span and writes a 24 kHz wav file.

    Args:
        image: single PIL image merged into the first message (non-batched only).
        msgs: list of role/content dicts, or a list of such lists for batching.
        stream: return a streamer over generated text instead of the final answer.
        teacher_forcing: generate audio for the given text (LLM generates 1 token).
        return_prompt: additionally return the rendered prompt string.

    Returns:
        The answer string (truncated at ``<|tts_eos|>``); with ``return_prompt``
        a ``(answer, prompt)`` tuple; with ``stream`` the streamer object.
    """
    from PIL import Image

    batched = isinstance(msgs[0], list)
    msgs_list = msgs
    images_list = image

    if not batched:
        images_list, msgs_list = [images_list], [msgs_list]
    else:
        assert images_list is None, "Please integrate image to msgs when using batch inference."
        images_list = [None] * len(msgs_list)
    assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same."

    self.prepare_processor(processor=processor, tokenizer=tokenizer)

    prompts_lists = []
    input_images_list = []
    input_audios_list = []
    audio_parts_list = []

    for image, msgs in zip(images_list, msgs_list):
        if isinstance(msgs, str):
            msgs = json.loads(msgs)
        copy_msgs = deepcopy(msgs)

        assert len(msgs) > 0, "msgs is empty"
        assert do_sample or not stream, "if use stream mode, make sure do_sample=True"

        if image is not None and isinstance(copy_msgs[0]["content"], str):
            copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]]

        images = []
        audios = []
        audio_parts = []
        for i, msg in enumerate(copy_msgs):
            role = msg["role"]
            content = msg["content"]
            assert role in ["system", "user", "assistant"]
            if i == 0:
                assert role in ["user", "system"], "The role of first msg should be user"
            # Normalize structured content (OpenAI format) to native format
            content = normalize_content(content)
            cur_msgs = []
            for c in content:
                if isinstance(c, Image.Image):
                    images.append(c)
                    cur_msgs.append("./")
                elif isinstance(c, np.ndarray):  # audio
                    audios.append(c)
                    audio_parts.append(i)
                    cur_msgs.append("")
                    use_tts_template = True
                elif isinstance(c, str):
                    cur_msgs.append(c)

            if omni_mode or stream_input:
                msg["content"] = "".join(cur_msgs)
            else:
                msg["content"] = "\n".join(cur_msgs)

        prompts_lists.append(
            self.processor.tokenizer.apply_chat_template(
                copy_msgs,
                tokenize=False,
                # teacher forcing supplies the assistant text, so no generation prompt
                add_generation_prompt=False if teacher_forcing else True,
                use_tts_template=use_tts_template,
                enable_thinking=enable_thinking,
            )
        )
        input_images_list.append(images)
        input_audios_list.append(audios)
        audio_parts_list.append(audio_parts)

    if not merge_audio_from_same_content:
        audio_parts_list = None

    inputs = self.processor(
        prompts_lists,
        input_images_list,
        input_audios_list,
        audio_parts_list,
        max_slice_nums=max_slice_nums,
        use_image_id=use_image_id,
        stream_input=stream_input,
        return_tensors="pt",
        max_length=max_inp_length,
    ).to(self.device)

    if stream:
        kwargs["num_beams"] = 1
    generation_config = self.prepare_generation_config(
        do_sample=do_sample, max_new_tokens=max_new_tokens, min_new_tokens=min_new_tokens, **kwargs
    )
    # max_new_tokens is passed explicitly to generate() below; avoid duplication.
    generation_config.pop("max_new_tokens", None)

    inputs.pop("image_sizes")

    # teacher_forcing = True => generate audio with given text
    with torch.inference_mode():
        res, outputs = self.generate(
            **inputs,
            tokenizer=self.processor.tokenizer,
            max_new_tokens=1 if teacher_forcing else max_new_tokens,
            vision_hidden_states=vision_hidden_states,
            stream=stream,
            **generation_config,
        )

    if stream:
        return res

    # spk bound and tts bound
    tts_bos_token = self.processor.tokenizer.convert_tokens_to_ids("<|tts_bos|>")
    tts_eos_token = self.processor.tokenizer.convert_tokens_to_ids("<|tts_eos|>")

    # Combine input_ids and generated sequences to get complete sequence
    input_ids = inputs["input_ids"][0]
    generated_ids = outputs.sequences[0]
    # Combine by concatenating input_ids with the new tokens from generated sequence
    full_sequence = torch.cat([input_ids, generated_ids])
    # Update the sequences in outputs
    full_sequences = full_sequence.unsqueeze(0)

    outputs["full_sequences"] = full_sequences

    tts_bos_indices = []
    tts_eos_indices = []
    for i, x in enumerate(full_sequences[0]):
        if x == tts_bos_token:
            # tts_bos + 1 is the position of the first tts, so that it is convenient to slice hidden states for tts
            tts_bos_indices.append(i + 1)
        elif x == tts_eos_token:
            if teacher_forcing and i == len(full_sequences[0]) - 1:
                continue
            tts_eos_indices.append(i)

    tts_bos_idx = tts_bos_indices[-1] if tts_bos_indices else -1
    # Use None instead of -1 when no EOS token found, so that slice [start:None]
    # means "to the end" rather than [start:-1] which excludes the last element
    tts_eos_idx = tts_eos_indices[-1] if tts_eos_indices else None

    tts_bound = (tts_bos_idx, tts_eos_idx)

    answer = res[0]
    if answer is not None:
        answer = answer.split("<|tts_eos|>")[0]

    if use_tts_template and generate_audio and output_audio_path:
        import soundfile as sf

        try:
            generated_waveform = self._generate_speech_non_streaming(
                outputs=outputs,
                tts_bound=tts_bound,
                tts_proj_layer=tts_proj_layer,
                audio_prompt=(
                    input_audios_list[0][0]
                    if len(input_audios_list) > 0 and len(input_audios_list[0]) > 0
                    else None
                ),
                output_tts_inputs_embeds_path=output_tts_inputs_embeds_path,
                tts_sampling_params=tts_sampling_params,
            )
            if isinstance(generated_waveform, torch.Tensor):
                sf.write(output_audio_path, generated_waveform.cpu().numpy(), samplerate=24000)
            elif isinstance(generated_waveform, np.ndarray):
                sf.write(output_audio_path, generated_waveform, samplerate=24000)
            logger.debug(f"audio saved to {output_audio_path}")
        # Fix: was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit.
        # Speech synthesis stays best-effort, but only for ordinary exceptions.
        except Exception:
            import traceback

            traceback.print_exc()

    if return_prompt:
        return answer, prompts_lists[0]
    else:
        return answer
+
@torch.inference_mode()
def _generate_speech_non_streaming(
    self,
    outputs,
    tts_bound,
    tts_proj_layer,
    audio_prompt,
    output_tts_inputs_embeds_path=None,
    tts_sampling_params: TTSSamplingParams = TTSSamplingParams(),
):
    """Synthesize a waveform from the LLM hidden states of the TTS span.

    Args:
        outputs: LLM generation outputs carrying ``hidden_states`` and
            ``full_sequences`` (set by ``chat``).
        tts_bound: (start, end) token indices delimiting the TTS span;
            ``end`` may be None meaning "to the end of the sequence".
        tts_proj_layer: which hidden-states layer to project (-1 = last).
        audio_prompt: optional 16 kHz reference audio (np.ndarray) used as a
            voice prompt for the audio tokenizer.
        output_tts_inputs_embeds_path: if set, the TTS condition embeddings
            are saved to this path for debugging.
        tts_sampling_params: sampling configuration forwarded to the TTS.

    Returns:
        A float32 waveform tensor decoded from the generated audio tokens.
        NOTE(review): when ``audio_prompt`` is None, ``prompt_wav_path`` stays
        None — presumably the audio tokenizer accepts that; confirm.
    """
    # Pick the requested layer from every generation step, then stack to
    # one (seq_len, hidden) matrix aligned with full_sequences.
    last_hidden_states = [hs[tts_proj_layer] for hs in outputs.hidden_states]
    last_hidden_states = torch.vstack([i[0] for i in last_hidden_states])

    # Empty speaker-embedding placeholder (0 rows) — concatenates as a no-op.
    spk_embeds = (
        torch.ones([0, self.tts.config.hidden_size]).to(last_hidden_states.device).to(last_hidden_states.dtype)
    )

    if self.tts.condition_type == "hidden_text_merge":
        # Condition = token embeddings of the TTS span + projected hidden states.
        llm_tokens = outputs["full_sequences"][0][tts_bound[0] : tts_bound[1]]
        llm_tokens = torch.tensor(llm_tokens, device=self.tts.emb_text.weight.device, dtype=torch.long)
        llm_embeds = self.tts.emb_text(llm_tokens)  # make sure emb_text is compatible with llm vocab size

        hidden_embeds = last_hidden_states[tts_bound[0] : tts_bound[1]]
        hidden_embeds = self.tts.projector_semantic(hidden_embeds)

        if self.tts.config.normalize_projected_hidden:
            hidden_embeds = F.normalize(hidden_embeds, p=2, dim=-1)

        tts_embeds = llm_embeds + hidden_embeds
    else:
        raise NotImplementedError

    audio_bos = [self.tts.audio_bos_token_id]
    audio_bos = torch.tensor(audio_bos, device=self.tts.emb_text.weight.device, dtype=torch.long)

    audio_bos_embeds = self.tts.emb_text(audio_bos)

    text_eos_embed = self.tts.emb_text(
        torch.tensor(
            [self.tts.config.text_eos_token_id],
            device=self.tts.emb_text.weight.device,
            dtype=torch.long,
        )
    )

    # Final TTS condition: [spk (empty)] + tts span + [text eos] + [audio bos].
    inputs_embeds = torch.cat([spk_embeds, tts_embeds, text_eos_embed, audio_bos_embeds], dim=0).unsqueeze(0)

    # save inputs_embeds to file
    if output_tts_inputs_embeds_path:
        torch.save(inputs_embeds, output_tts_inputs_embeds_path)

    outputs = self.tts.generate(
        inputs_embeds=inputs_embeds,
        sampling_params=tts_sampling_params,
        eos_token=torch.tensor(
            [self.tts.config.num_audio_tokens - 1],
            dtype=torch.long,
            device=self.tts.device,
        ),
    )

    import io

    import soundfile as sf

    generated_tokens = outputs.new_ids.squeeze(-1)
    reference_audio = audio_prompt
    prompt_wav_path = None
    if reference_audio is not None:
        logger.debug("use reference audio in data to generate waveform")
        # NOTE(review): delete=False means this temp wav is never removed —
        # confirm whether the audio tokenizer needs the file after this call.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
            prompt_wav_path = tmp_wav.name
            sf.write(prompt_wav_path, reference_audio, 16000)
    wav_bytes = self.tts.audio_tokenizer(
        generated_tokens.squeeze(0).tolist(),
        prompt_wav_path,
    )
    # convert wav bytes back to tensor for caller compatibility
    waveform, sr = sf.read(io.BytesIO(wav_bytes))
    return torch.tensor(waveform, dtype=torch.float32)
+
@torch.inference_mode()
def init_token2wav_cache(self, prompt_speech_16k):
    """Initialize the token-to-waveform streaming cache from a voice prompt.

    Two backends are supported, detected by capability:
    - audio tokenizers exposing ``set_stream_cache``: the 16 kHz prompt is
      written to a temporary wav and the returned flow/hift caches are
      deep-cloned into ``self.token2wav_cache``;
    - otherwise the CosyVoice-style frontend path is used to build the flow
      prompt tokens/features/embedding and prepare the streaming cache.

    Args:
        prompt_speech_16k: reference speech at 16 kHz (array-like) used to
            condition the vocoder.
    """
    import soundfile as sf

    if hasattr(self.tts.audio_tokenizer, "set_stream_cache"):
        # Reset any previous stream state before building a fresh cache.
        self.tts.audio_tokenizer.cache = None
        # NOTE(review): delete=False leaves the temp wav on disk — confirm
        # whether set_stream_cache needs the file after returning.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
            prompt_wav_path = tmp_wav.name
            sf.write(prompt_wav_path, prompt_speech_16k, 16000)
            flow_cache_base, hift_cache_base = self.tts.audio_tokenizer.set_stream_cache(prompt_wav_path)

        # Clone so later in-place mutation of the live cache cannot corrupt
        # the baseline we restore from.
        self.token2wav_cache = {
            "flow_cache_base": torch_clone_recursive(flow_cache_base),
            "hift_cache_base": torch_clone_recursive(hift_cache_base),
        }
    else:
        # Frontend path: derive prompt tokens/features/embedding directly.
        model_input = self.tts.audio_tokenizer.frontend.frontend_token2wav(
            speech_tokens=torch.zeros(1, 1, dtype=torch.long, device=self.tts.device),
            speech_16k=None,
            prompt_speech_16k=prompt_speech_16k,
            resample_rate=self.tts.audio_tokenizer.sample_rate,
            prompt_speech=None,
        )

        prompt_token = model_input["flow_prompt_speech_token"]
        prompt_feat = model_input["prompt_speech_feat"]
        embedding = model_input["flow_embedding"]

        # Match the vocoder's compute dtype when it runs in fp16.
        if self.tts.audio_tokenizer.fp16:
            prompt_feat = prompt_feat.to(torch.half)
            embedding = embedding.to(torch.half)

        prepared_cache = self.tts.audio_tokenizer.model.prepare_cache_from_prompt(
            prompt_token=prompt_token,
            prompt_feat=prompt_feat,
            embedding=embedding,
            n_timesteps=self.tts.config.s3_stream_n_timesteps,
            code_chunk_size=self.tts.config.s3_stream_chunk_size,
            chunk_prelook_size=self.tts.config.s3_stream_prelook_size,
            use_attn_idx=False,
        )

        self.token2wav_cache = prepared_cache
+
+ # for sliding window
+ def _ensure_dynamic_cache(self):
+ cache = self.llm_past_key_values
+ if cache is None:
+ return None
+
+ cache = as_dynamic_cache(cache)
+ if isinstance(cache, DynamicCache):
+ self.llm_past_key_values = cache
+ return cache
+
+ return None
+
def _get_kv_cache_length(self, cache=None):
    """Token length of `cache`, defaulting to the model's own LLM KV cache."""
    target = self.llm_past_key_values if cache is None else cache
    return get_kv_cache_length(target)
+
# todo: not-used del?
def _rebuild_cache_from_history(self):
    """Recompute the LLM KV cache by re-prefilling all preserved chunk ids.

    Collects the stored ``input_ids`` of every history entry, concatenates
    them, and runs a single forward pass to rebuild ``llm_past_key_values``.
    Resets the streaming position offset and the RoPE inverse-frequency cache
    either way.
    """
    preserved_ids: List[torch.Tensor] = []
    for entry in self._omni_chunk_history:
        ids = entry.get("input_ids")
        # Entries without usable ids (None / non-tensor / empty) are skipped.
        if ids is None or not isinstance(ids, torch.Tensor) or ids.numel() == 0:
            continue
        preserved_ids.append(ids.to(self.device))
    if not preserved_ids:
        # Nothing to rebuild from: drop the cache entirely.
        self.llm_past_key_values = None
        self.streaming_position_offset = 0
        self._rope_inv_freq_cache.clear()
        return

    # Re-prefill the whole preserved history in one shot.
    concat_ids = torch.cat(preserved_ids, dim=1)
    attention_mask = torch.ones((1, concat_ids.shape[1]), dtype=torch.bool, device=self.device)
    outputs = self.llm(
        input_ids=concat_ids,
        attention_mask=attention_mask,
        use_cache=True,
        return_dict=True,
    )
    self.llm_past_key_values = outputs.past_key_values
    self.streaming_position_offset = 0
    self._rope_inv_freq_cache.clear()
+
+ def _get_rope_theta(self) -> float:
+ return float(getattr(self.llm.config, "rope_theta", 10000.0))
+
def _realign_rotary_suffix(
    self,
    suffix_keys: torch.Tensor,
    old_positions: torch.Tensor,
    new_positions: torch.Tensor,
) -> torch.Tensor:
    """Re-apply RoPE so cached suffix keys move from `old_positions` to `new_positions`."""
    theta = self._get_rope_theta()
    return realign_rotary_suffix(
        suffix_keys,
        old_positions,
        new_positions,
        rope_theta=theta,
        inv_freq_cache=self._rope_inv_freq_cache,
    )
+
+ def _encode_text(self, tokenizer, text) -> Optional[torch.Tensor]:
+ if tokenizer is None or not text:
+ return None
+ ids = tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"]
+ return ids.to(self.device)
+
+ @staticmethod
+ def _safe_decode(tokenizer, input_ids):
+ if tokenizer is None or input_ids is None:
+ return None
+ if isinstance(input_ids, torch.Tensor):
+ ids = input_ids.cpu().tolist()
+ if ids and isinstance(ids[0], list):
+ ids = ids[0]
+ else:
+ ids = input_ids
+ try:
+ return tokenizer.decode(ids, skip_special_tokens=False)
+ except Exception:
+ return None
+
+ def _finalize_round(
+ self, round_id: Optional[int], cache_before: int, assistant_input_ids: Optional[torch.Tensor] = None
+ ):
+ if round_id is None:
+ self._pending_round_id = None
+ return
+ cache_after = self._get_kv_cache_length()
+ if assistant_input_ids is not None:
+ assistant_len = assistant_input_ids.shape[1]
+ else:
+ assistant_len = max(cache_after - cache_before, 0)
+ if assistant_len > 0:
+ self._register_chunk(
+ assistant_len,
+ "assistant",
+ round_id=round_id,
+ input_ids=assistant_input_ids,
+ tokenizer=self.processor.tokenizer if hasattr(self, "processor") else None,
+ )
+
+ self._pending_round_id = None
+ self._next_round_id += 1
+
+ def _register_chunk(
+ self,
+ seq_len: int,
+ chunk_type: str,
+ *,
+ round_id: int,
+ input_ids=None,
+ tokenizer=None,
+ ) -> None:
+ if seq_len <= 0:
+ return
+ entry = {"length": int(seq_len), "type": chunk_type, "round": round_id}
+ if input_ids is not None:
+ entry["input_ids"] = input_ids.clone().detach()
+ entry["decoded"] = self._safe_decode(tokenizer, entry["input_ids"])
+ else:
+ entry["input_ids"] = None
+ entry["decoded"] = None
+ self._omni_chunk_history.append(entry)
+
+ if chunk_type == "system":
+ self.streaming_text_preserve = max(self.streaming_text_preserve, entry["length"])
+
def _drop_tokens_from_cache(self, length: int, cache: DynamicCache) -> bool:
    """Evict `length` tokens from `cache`, keeping the preserved system prefix.

    Delegates to the shared utility; on success the streaming position offset
    is advanced to the value the utility reports.
    """
    _, updated_offset, ok = drop_tokens_from_cache(
        cache=cache,
        length=length,
        preserve=self.streaming_text_preserve,
        position_offset=self.streaming_position_offset,
        rope_theta=self._get_rope_theta(),
        inv_freq_cache=self._rope_inv_freq_cache,
    )
    if ok:
        self.streaming_position_offset = updated_offset
    return ok
+
def _drop_next_round(self, cache: DynamicCache) -> bool:
    """Evict the oldest evictable round from the cache; True if one was dropped.

    Rounds are scanned in history order; any round containing a system chunk
    is skipped so the system prompt is never evicted.
    """
    visited = set()
    for entry in self._omni_chunk_history:
        rid = entry.get("round")
        if rid is None or rid in visited:
            continue
        visited.add(rid)
        members = [e for e in self._omni_chunk_history if e.get("round") == rid]
        if any(m.get("type") == "system" for m in members):
            continue
        if self._drop_round(rid, cache):
            return True
    return False
+
def _drop_round(self, round_id: int, cache: DynamicCache) -> bool:
    """Remove every chunk of `round_id` from the history and the KV cache.

    Returns True only when cache tokens were actually evicted; zero-length
    bookkeeping entries are purged from history but count as no eviction.
    """
    members = [e for e in self._omni_chunk_history if e.get("round") == round_id]
    if not members:
        return False
    span = sum(e["length"] for e in members)
    if span <= 0:
        for e in members:
            self._omni_chunk_history.remove(e)
        return False
    if not self._drop_tokens_from_cache(span, cache):
        return False
    for e in members:
        self._omni_chunk_history.remove(e)
    return True
+
+ def _enforce_text_window(self) -> None:
+ if not self.streaming_window_enabled:
+ return
+ cache = self._ensure_dynamic_cache()
+ if cache is None:
+ return
+ high_limit = max(0, int(self.streaming_window_config.text_window_high_tokens))
+ low_limit = max(0, int(self.streaming_window_config.text_window_low_tokens))
+ if high_limit <= 0:
+ return
+ target = max(0, low_limit)
+ total_len = self._get_kv_cache_length(cache)
+ if total_len <= high_limit:
+ return
+ dropped_any = False
+ while total_len > target:
+ if not self._drop_next_round(cache):
+ break
+ dropped_any = True
+ total_len = self._get_kv_cache_length(cache)
+
# snapshot, vad
def save_speculative_snapshot(self) -> SpeculativeSnapshot:
    """Internal method: save speculative snapshot.

    Called at the start of streaming_generate; NOTE(review): this method
    *returns* the snapshot — the caller is expected to store it on
    self._speculative_snapshot; confirm against streaming_generate.

    Save strategy:
    - LLM KV Cache: only record length (restore by truncation, zero extra VRAM)
    - Audio KV Cache: deep clone (as generate sets it to None)
    - Mel processor: full state snapshot (including buffer)
    """
    # get LLM cache information
    llm_cache_length = self._get_kv_cache_length()
    llm_cache_checksum = None
    if self.llm_past_key_values is not None and hasattr(self.llm_past_key_values, "key_cache"):
        if len(self.llm_past_key_values.key_cache) > 0:
            # Checksum of layer-0 keys, for debug comparison after restore.
            llm_cache_checksum = self.llm_past_key_values.key_cache[0].sum().item()

    # get audio cache length and clone audio_past_key_values
    audio_cache_length = 0
    audio_cache_checksum = None
    audio_past_key_values_clone = None
    if self.audio_past_key_values is not None:
        # handle DynamicCache format (Whisper encoder may return this format)
        if isinstance(self.audio_past_key_values, DynamicCache):
            if hasattr(self.audio_past_key_values, "key_cache") and len(self.audio_past_key_values.key_cache) > 0:
                audio_cache_length = self.audio_past_key_values.key_cache[0].shape[2]
                audio_cache_checksum = self.audio_past_key_values.key_cache[0].sum().item()
                # deep clone DynamicCache
                cloned_cache = DynamicCache()
                for k, v in zip(self.audio_past_key_values.key_cache, self.audio_past_key_values.value_cache):
                    cloned_cache.update(k.clone(), v.clone(), layer_idx=len(cloned_cache.key_cache))
                audio_past_key_values_clone = cloned_cache

        # handle EncoderDecoderCache format
        elif isinstance(self.audio_past_key_values, EncoderDecoderCache):
            self_attn_cache = self.audio_past_key_values.self_attention_cache
            if hasattr(self_attn_cache, "key_cache") and len(self_attn_cache.key_cache) > 0:
                audio_cache_length = self_attn_cache.key_cache[0].shape[2]
                audio_cache_checksum = self_attn_cache.key_cache[0].sum().item()
            # deep clone EncoderDecoderCache
            cloned_self_attn = DynamicCache()
            if hasattr(self_attn_cache, "key_cache"):
                for k, v in zip(self_attn_cache.key_cache, self_attn_cache.value_cache):
                    cloned_self_attn.update(k.clone(), v.clone(), layer_idx=len(cloned_self_attn.key_cache))
            cross_attn_cache = self.audio_past_key_values.cross_attention_cache
            cloned_cross_attn = DynamicCache()
            if hasattr(cross_attn_cache, "key_cache"):
                for k, v in zip(cross_attn_cache.key_cache, cross_attn_cache.value_cache):
                    cloned_cross_attn.update(k.clone(), v.clone(), layer_idx=len(cloned_cross_attn.key_cache))
            audio_past_key_values_clone = EncoderDecoderCache(cloned_self_attn, cloned_cross_attn)

        # handle tuple format (compatible with old format)
        elif isinstance(self.audio_past_key_values, tuple) and len(self.audio_past_key_values) > 0:
            audio_cache_length = self.audio_past_key_values[0][0].shape[2]
            audio_cache_checksum = self.audio_past_key_values[0][0].sum().item()
            # deep clone audio_past_key_values (tuple of tuples of tensors)
            audio_past_key_values_clone = tuple(
                tuple(t.clone() for t in layer_cache) for layer_cache in self.audio_past_key_values
            )

    # get mel processor snapshot
    mel_processor_snapshot = None
    mel_buffer_checksum = None
    if hasattr(self, "processor") and self.processor is not None:
        mel_processor_snapshot = self.processor.get_streaming_snapshot()
        if mel_processor_snapshot:
            buf = mel_processor_snapshot.get("buffer")
            if buf is not None and len(buf) > 0:
                mel_buffer_checksum = float(buf.sum())

    # save RNG state (important: for deterministic dithering and other random operations after restoration)
    rng_state_cpu = torch.get_rng_state()
    rng_state_cuda = None
    if torch.cuda.is_available() and self.device.type == "cuda":
        rng_state_cuda = torch.cuda.get_rng_state(self.device)

    # create snapshot
    snapshot = SpeculativeSnapshot(
        llm_cache_length=llm_cache_length,
        audio_cache_length=audio_cache_length,
        new_user_msg=self.new_user_msg,
        llm_generated=self.llm_generated,
        llm_generate_completed=self.llm_generate_completed,
        next_round_id=self._next_round_id,
        pending_round_id=self._pending_round_id,
        omni_chunk_history_length=len(self._omni_chunk_history),
        tts_last_turn_tokens=self.tts_last_turn_tokens.clone() if self.tts_last_turn_tokens is not None else None,
        audio_chunk_idx=self.audio_chunk_idx,
        mel_processor_snapshot=mel_processor_snapshot,
        audio_past_key_values=audio_past_key_values_clone,
        timestamp=time.time(),
        # debug fields
        llm_cache_checksum=llm_cache_checksum,
        audio_cache_checksum=audio_cache_checksum,
        mel_buffer_checksum=mel_buffer_checksum,
        # RNG state
        rng_state_cpu=rng_state_cpu,
        rng_state_cuda=rng_state_cuda,
    )

    return snapshot
+
def restore_speculative_snapshot(self, snapshot=None) -> bool:
    """Restore speculative snapshot - called when VAD speculation fails.

    Restores model state to before streaming_generate was called,
    allowing continued streaming_prefill for newly arrived audio.

    Notes:
        - Snapshot is saved when streaming_generate is called with enable_speculative_snapshot=True
        - This method uses the most recent snapshot for restoration
        - Snapshot is cleared after restore, cannot be called repeatedly

    Args:
        snapshot: explicit snapshot to restore; defaults to the stored
            ``self._speculative_snapshot``.

    Returns:
        bool: Whether restoration was successful
    """
    snapshot = snapshot or getattr(self, "_speculative_snapshot", None)

    if snapshot is None:
        return False

    try:
        current_cache_length = self._get_kv_cache_length()
        current_history_length = len(self._omni_chunk_history)

        # 1. truncate LLM KV Cache
        if current_cache_length > snapshot.llm_cache_length:
            self._truncate_llm_cache(snapshot.llm_cache_length)

        # 2. restore Audio KV Cache (important: restore from cloned copy)
        # because streaming_generate will set audio_past_key_values to None
        self.audio_past_key_values = snapshot.audio_past_key_values

        # 3. restore session state
        self.new_user_msg = snapshot.new_user_msg
        self.llm_generated = snapshot.llm_generated
        self.llm_generate_completed = snapshot.llm_generate_completed

        # 4. restore Round management
        self._next_round_id = snapshot.next_round_id
        self._pending_round_id = snapshot.pending_round_id

        # 5. truncate chunk history
        if current_history_length > snapshot.omni_chunk_history_length:
            self._omni_chunk_history = self._omni_chunk_history[: snapshot.omni_chunk_history_length]

        # 6. restore TTS state
        self.tts_last_turn_tokens = snapshot.tts_last_turn_tokens

        # 7. restore streaming processor state
        self.audio_chunk_idx = snapshot.audio_chunk_idx

        # 8. restore mel processor state (important: otherwise subsequent prefill will fail due to frame number mismatch)
        if (
            snapshot.mel_processor_snapshot is not None
            and hasattr(self, "processor")
            and self.processor is not None
        ):
            self.processor.restore_streaming_snapshot(snapshot.mel_processor_snapshot)

        # 9. restore RNG state (important: ensure determinism of dithering and other random operations after restoration)
        if snapshot.rng_state_cpu is not None:
            torch.set_rng_state(snapshot.rng_state_cpu)
        if snapshot.rng_state_cuda is not None and torch.cuda.is_available():
            torch.cuda.set_rng_state(snapshot.rng_state_cuda, self.device)

        # 10. clean up temporary states generated during generation
        if hasattr(self, "_streaming_generated_token_ids"):
            del self._streaming_generated_token_ids
        if hasattr(self, "_last_streaming_text"):
            del self._last_streaming_text

        # 11. clear snapshot (can only be restored once)
        self._speculative_snapshot = None

        return True
    except Exception as e:
        # Best-effort restore: log the full traceback and report failure.
        import traceback

        logger.error(traceback.format_exc())
        return False
+
def has_speculative_snapshot(self) -> bool:
    """Whether a restorable speculative snapshot is currently held."""
    snapshot = getattr(self, "_speculative_snapshot", None)
    return snapshot is not None
+
def clear_speculative_snapshot(self) -> None:
    """Discard any held speculative snapshot (no-op when none was ever set)."""
    if getattr(self, "_speculative_snapshot", None) is not None:
        self._speculative_snapshot = None
+
def _truncate_llm_cache(self, target_length: int) -> None:
    """Shrink the LLM KV cache to at most `target_length` tokens.

    No-op when there is no cache, the cache cannot be made dynamic, or it is
    already short enough. Used by speculative-snapshot restore to roll back
    tokens appended after the snapshot was taken.
    """
    if self.llm_past_key_values is None:
        return

    cache = self._ensure_dynamic_cache()
    if cache is None:
        return

    current_length = self._get_kv_cache_length(cache)
    if current_length <= target_length:
        return

    # truncate each layer of cache
    for layer_idx in range(len(cache.key_cache)):
        if cache.key_cache[layer_idx].numel() > 0:
            cache.key_cache[layer_idx] = cache.key_cache[layer_idx][:, :, :target_length, :].contiguous()
            cache.value_cache[layer_idx] = cache.value_cache[layer_idx][:, :, :target_length, :].contiguous()

    # update cache metadata
    # NOTE(review): crop() also slices the per-layer tensors — the manual
    # slicing above looks redundant with it; confirm against the installed
    # transformers version before simplifying.
    cache.crop(target_length)
    # _seen_tokens is a private transformers field; kept in sync explicitly.
    cache._seen_tokens = target_length
+
@torch.inference_mode()
def streaming_prefill(
    self,
    session_id,
    msgs,
    omni_mode=True,
    max_slice_nums=None,
    use_tts_template=True,
    enable_thinking=False,
    is_last_chunk=False,  # for audio chunk, if is the last chunk, set to True
    tokenizer=None,
    processor=None,
    **kwargs,
):
    """Prefill one streaming chunk (text / image / audio) into the LLM KV cache.

    Maintains per-session streaming state: a new ``session_id`` resets the
    session, role transitions insert the proper chat-template markers, and
    each prefilled chunk is registered in the round history so the sliding
    text window can evict old rounds.

    Args:
        session_id: identifier of the streaming session; switching ids resets state.
        msgs: a single-element list with one role/content message; content items
            may be PIL images, np.ndarray audio chunks, or strings.
        omni_mode: join content pieces with "" instead of "\\n".
        is_last_chunk: marks the final audio chunk of the current utterance.

    Returns:
        The prompt string that was prefilled (for logging/debugging).
    """
    from PIL import Image

    assert session_id is not None, "session_id cannot be None"
    # A changed/unknown session id means this is the first prefill of a session.
    self.is_first = self.session_id is None or session_id != self.session_id

    self.prepare_processor(processor=processor, tokenizer=tokenizer)

    images = []
    audios = []

    assert len(msgs) == 1
    copy_msgs = deepcopy(msgs)
    msg = copy_msgs[0]

    assert msg["role"] in ["system", "user", "assistant"]
    is_not_system_prefill = msg["role"] != "system"

    # Split the message content into media payloads and text placeholders.
    content = msg["content"]
    cur_msgs = []
    for j, c in enumerate(content):
        if isinstance(c, Image.Image):
            images.append(c)
            cur_msgs.append("./")
        elif isinstance(c, np.ndarray):
            audios.append(c)
            cur_msgs.append("")
        elif isinstance(c, str):
            cur_msgs.append(c)
        else:
            logger.error(f"Invalid content type: {c}, ignore it.")

    cur_contents = "".join(cur_msgs) if omni_mode else "\n".join(cur_msgs)

    if msg["role"] in ["system", "assistant"]:
        # A system/assistant turn ends any in-flight user turn and resets the
        # audio encoder cache.
        self.new_user_msg = True
        self.audio_past_key_values = None

    if self.is_first:
        self.reset_session(reset_token2wav_cache=False)
        self.session_id = session_id

        self.init_streaming_processor()

        if msg["role"] == "user":
            # no system prefill, the first segment of the first user turn
            # do not use apply_chat_template, manually build prompt to avoid automatic addition of <|im_end|>
            prompt = "<|im_start|>user\n" + cur_contents
            self.new_user_msg = False  # mark subsequent segments do not need to add user prefix anymore
        else:
            # system or assistant prefill, use apply_chat_template
            msg["content"] = cur_contents
            prompt = self.processor.tokenizer.apply_chat_template(
                copy_msgs,
                tokenize=False,
                add_generation_prompt=False,
                use_tts_template=use_tts_template,
                enable_thinking=enable_thinking,
            )
        add_special_tokens = True  # add bos
    else:
        # non-first prefill
        if self.new_user_msg and msg["role"] == "user":
            # the first segment of the new user turn
            if self.llm_generated:
                if self.llm_generate_completed:
                    prompt = "<|im_end|>\n<|im_start|>user\n" + cur_contents
                else:
                    # Generation was interrupted: close the TTS span first.
                    prompt = "<|tts_eos|><|im_end|>\n<|im_start|>user\n" + cur_contents
            else:
                prompt = "<|im_start|>user\n" + cur_contents
            self.new_user_msg = False
        else:
            # subsequent segments of the same turn, directly use content
            prompt = cur_contents
        add_special_tokens = False

    # when first user audio prefill, ensure audio length satisfies FIRST_CHUNK_MS requirements
    if is_not_system_prefill and len(audios) > 0 and self.audio_chunk_idx == 0:
        assert len(audios) == 1, f"streaming mode only supports single audio, currently {len(audios)}"
        first_chunk_samples = int(self.FIRST_CHUNK_MS * self.SAMPLE_RATE / 1000)
        if len(audios[0]) < first_chunk_samples:
            # Left-pad with silence so the first chunk reaches the minimum size.
            pad_len = first_chunk_samples - len(audios[0])
            audios[0] = np.concatenate([np.zeros(pad_len, dtype=audios[0].dtype), audios[0]])

    model_inputs = self.processor(
        [prompt],
        [images],
        [audios],
        max_slice_nums=1 if max_slice_nums is None else max_slice_nums,
        use_image_id=False,
        chunk_input=True,
        return_tensors="pt",
        max_length=None,
        sampling_rate=16000,
        add_special_tokens=add_special_tokens,
        online_streaming=is_not_system_prefill,
        audio_chunk_idx=self.audio_chunk_idx,
        is_last_chunk=is_last_chunk,
    ).to(self.device)

    if len(audios) > 0 and is_not_system_prefill:
        self.audio_chunk_idx += 1

    # 1. prepare input embeddings
    model_inputs["inputs_embeds"], _ = self.get_vllm_embedding(model_inputs)
    # get audio embedding with audio_past_key_values
    inputs_embeds = self.get_omni_embedding(
        model_inputs, input_embeddings=model_inputs["inputs_embeds"], stream_input=is_not_system_prefill
    )

    if self.is_first:
        self.audio_past_key_values = None

    round_id = self._next_round_id
    self._pending_round_id = round_id
    chunk_type = "system" if msg["role"] == "system" else ("user" if msg["role"] == "user" else "assistant")
    seq_len = inputs_embeds.shape[1]
    # Evict old rounds before growing the cache, then size the mask to
    # cover existing cache plus this chunk.
    self._enforce_text_window()
    cache_length = self._get_kv_cache_length()

    attention_mask = torch.ones((1, cache_length + inputs_embeds.shape[1]), dtype=torch.bool, device=self.device)

    # 2. do prefill
    outputs = self.llm(
        past_key_values=self.llm_past_key_values,
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
        position_ids=None,
        use_cache=True,
        return_dict=True,
    )

    self.llm_past_key_values = as_dynamic_cache(outputs["past_key_values"])
    self._register_chunk(
        seq_len,
        chunk_type,
        round_id=round_id,
        input_ids=model_inputs["input_ids"],
        tokenizer=self.processor.tokenizer,
    )
    self._enforce_text_window()
    if self.force_rope_reindex:
        self._force_reindex_all_cache()

    return prompt
+
+ @torch.inference_mode()
+ def streaming_generate(
+ self,
+ session_id,
+ bos_input=None,
+ generate_audio=True,
+ audio_token_chunk_size=25, # 25 token/s
+ tts_sampling_params: TTSSamplingParams = TTSSamplingParams(),
+ max_new_tokens=256,
+ enable_thinking=False,
+ use_tts_template=True,
+ do_sample=True,
+ enable_speculative_snapshot=False,
+ tokenizer=None,
+ processor=None,
+ # Teacher forcing (only for the "text → hidden → TTS condition" pipeline in streaming_generate)
+ # When enabled: instead of letting the LLM auto-regressively generate the text to be spoken,
+ # it forces the tokens from teacher_forcing_text to be fed in, using the hidden states
+ # corresponding to these tokens to construct the TTS condition, ensuring the output audio matches the input text.
+ teacher_forcing: bool = False,
+ teacher_forcing_text: str = "",
+ **kwargs,
+ ):
+ # save speculative snapshot (before modifying any state)
+ # for VAD speculative snapshot: if speculative snapshot fails, can call restore_speculative_snapshot() to restore
+ # enable_speculative_snapshot=True when enabled, skip (save some overhead) when disabled
+ if enable_speculative_snapshot:
+ self._speculative_snapshot = self.save_speculative_snapshot()
+
+ # reset buf
+ self.new_user_msg = True
+ self.llm_generated = True
+ self.llm_generate_completed = False
+ self.audio_past_key_values = None
+
+ self.prepare_processor(processor=processor, tokenizer=tokenizer)
+
+ # reset current turn generated token IDs
+ if hasattr(self, "_streaming_generated_token_ids"):
+ del self._streaming_generated_token_ids
+ # reset full generated text
+ if hasattr(self, "_last_streaming_text"):
+ del self._last_streaming_text
+
+ cache = self._ensure_dynamic_cache()
+ cache_length = self._get_kv_cache_length(cache)
+ host_round_id = self._pending_round_id
+
+ ## in single-turn streaming, each call to streaming_generate needs to reinitialize the streaming_processor, enter the next turn
+ self.init_streaming_processor()
+
+ # 1) llm generate token and hidden states per chunk=10, 2) tts generate audio token chunk per chunk=25, 3) yield 1 chunk audio token
+ def audio_chunk_generator(
+ bos_input,
+ tokenizer,
+ generate_audio,
+ tts_sampling_params,
+ max_new_tokens,
+ do_sample,
+ teacher_forcing=False,
+ teacher_forcing_text="",
+ **kwargs,
+ ):
+ generate_chunk_size = 10
+
+ if bos_input is None:
+ bos_input = "".join(
+ [
+ "<|im_end|>\n<|im_start|>assistant\n",
+ "" if enable_thinking else self.think_str.replace("\\n", "\n"),
+ "<|tts_bos|>" if use_tts_template else "",
+ ]
+ )
+
+ bos_input_ids = tokenizer.encode(bos_input)
+ bos_input_ids = torch.tensor(bos_input_ids, dtype=torch.long, device=self.device).unsqueeze(0)
+
+ bos_input_embeds = self.llm.get_input_embeddings()(bos_input_ids)
+
+ generation_inputs_embeds = bos_input_embeds
+ generated_ids = torch.empty((1, 0), dtype=torch.long, device=self.device)
+
+ num_chunks_decode = (max_new_tokens + generate_chunk_size - 1) // generate_chunk_size
+
+ conditions = []
+
+ # generate chunk by chunk, each chunk has 10 tokens, each chunk takes last hidden states, and pass tokens to tts
+ llm_streaming_generator = ChunkPrefillChunkGenerate(
+ model=self.llm,
+ tokenizer=tokenizer,
+ terminators=["<|tts_eos|>", "<|im_end|>", ""],
+ )
+
+ if generate_audio:
+ logits_warpers, logits_processors = gen_logits(
+ num_code=self.tts.config.num_audio_tokens,
+ repetition_penalty=tts_sampling_params.repetition_penalty,
+ top_p=tts_sampling_params.top_p,
+ top_k=tts_sampling_params.top_k,
+ )
+
+ tts_streaming_generator = TTSStreamingGenerator(
+ model=self.tts,
+ temperature=tts_sampling_params.temperature,
+ eos_token=torch.tensor(
+ [self.tts.config.num_audio_tokens - 1],
+ dtype=torch.long,
+ device=self.tts.device,
+ ),
+ chunk_size=audio_token_chunk_size, # s3tokenizer 1s = 25token
+ tts_last_turn_tokens=self.tts_last_turn_tokens,
+ logits_processors=logits_processors,
+ logits_warpers=logits_warpers,
+ )
+
+ # Teacher forcing branch
+ # This branch does not rely on ChunkPrefillChunkGenerate's sampling logic, instead:
+ # 1) First prefill bos_input (assistant + tts_bos) into llm_past_key_values
+ # 2) Tokenize teacher_forcing_text into token ids
+ # 3) Feed tokens one by one into the LLM (teacher forcing), obtaining the last_hidden_states for each token
+ # 4) Use (token_ids, hidden_states) to construct tts condition, then feed it to TTSStreamingGenerator
+ if teacher_forcing:
+ # --- 1) prefill bos_input,延续 streaming_prefill 的 KV cache ---
+ bos_outputs = self.llm(
+ inputs_embeds=generation_inputs_embeds,
+ past_key_values=self.llm_past_key_values,
+ use_cache=True,
+ output_hidden_states=True,
+ return_dict=True,
+ )
+ self.llm_past_key_values = bos_outputs.past_key_values
+
+ if generate_audio:
+ # Give a length-0 tensor as speaker embedding (no speaker embedding)
+ spk_emb = torch.empty(
+ (bos_input_embeds.shape[0], 0, bos_input_embeds.shape[2]),
+ dtype=bos_input_embeds.dtype,
+ device=bos_input_embeds.device,
+ )
+ tts_streaming_generator.spk_emb = spk_emb
+
+ # --- 2) tokenize teacher_forcing_text ---
+ tf_text = teacher_forcing_text or ""
+ try:
+ forced_input_ids = tokenizer(tf_text, add_special_tokens=False, return_tensors="pt")["input_ids"]
+ except Exception:
+ # Compatible with rare tokenizer return object attributes
+ forced_input_ids = tokenizer(tf_text, add_special_tokens=False, return_tensors="pt").input_ids
+ forced_input_ids = forced_input_ids.to(self.device)
+
+ total_len = int(forced_input_ids.shape[1])
+ ptr = 0
+
+ # Special case: empty text should also let TTS finish (text_finished=True will automatically concatenate text_eos_embed)
+ if total_len == 0:
+ if not generate_audio:
+ yield forced_input_ids, True
+ return
+ empty_tts_embeds = torch.empty(
+ (1, 0, self.tts.config.hidden_size),
+ dtype=bos_input_embeds.dtype,
+ device=self.device,
+ )
+ if not hasattr(self, "_streaming_generated_token_ids"):
+ self._streaming_generated_token_ids = []
+ tts_generator = tts_streaming_generator.generate_with_buffer(
+ condition=empty_tts_embeds,
+ text_finished=True,
+ )
+ for audio_token_chunk, is_last_audio_chunk in tts_generator:
+ yield audio_token_chunk, is_last_audio_chunk
+ self.tts_last_turn_tokens = tts_streaming_generator.tts_last_turn_tokens
+ self._last_streaming_text = ""
+ yield None, None
+ return
+
+ # --- 3) chunk-by-chunk teacher forcing ---
+ while ptr < total_len:
+ end = min(ptr + generate_chunk_size, total_len)
+ chunk_ids = forced_input_ids[:, ptr:end] # [1, chunk_len]
+ chunk_hidden_list = []
+
+ for j in range(chunk_ids.shape[1]):
+ tok = chunk_ids[:, j : j + 1] # [1, 1]
+ tok_emb = self.llm.get_input_embeddings()(tok)
+ out = self.llm(
+ inputs_embeds=tok_emb,
+ past_key_values=self.llm_past_key_values,
+ use_cache=True,
+ output_hidden_states=True,
+ return_dict=True,
+ )
+ self.llm_past_key_values = out.past_key_values
+ chunk_hidden_list.append(out.hidden_states[-1]) # [1, 1, hidden]
+
+ chunk_hidden = torch.cat(chunk_hidden_list, dim=1) # [1, chunk_len, hidden]
+ text_finished = end >= total_len
+
+ # Save token IDs cache (external eval script will use _last_streaming_text to write generated_text)
+ if not hasattr(self, "_streaming_generated_token_ids"):
+ self._streaming_generated_token_ids = []
+ self._streaming_generated_token_ids.extend(chunk_ids[0].tolist())
+
+ if not generate_audio:
+ yield chunk_ids, text_finished
+ else:
+ llm_embeds = self.tts.emb_text(chunk_ids)
+ hidden_embeds = self.tts.projector_semantic(chunk_hidden)
+ if self.tts.config.normalize_projected_hidden:
+ hidden_embeds = F.normalize(hidden_embeds, p=2, dim=-1)
+ tts_embeds = llm_embeds + hidden_embeds
+
+ tts_generator = tts_streaming_generator.generate_with_buffer(
+ condition=tts_embeds,
+ text_finished=text_finished,
+ )
+ for audio_token_chunk, is_last_audio_chunk in tts_generator:
+ yield audio_token_chunk, is_last_audio_chunk
+
+ ptr = end
+ if text_finished:
+ if generate_audio:
+ self.tts_last_turn_tokens = tts_streaming_generator.tts_last_turn_tokens
+ break
+
+ # Finish: decode this round of text
+ if hasattr(self, "_streaming_generated_token_ids"):
+ try:
+ self._last_streaming_text = tokenizer.decode(self._streaming_generated_token_ids)
+ assistant_input_ids = self._encode_text(tokenizer=tokenizer, text=self._last_streaming_text)
+ self._finalize_round(
+ round_id=host_round_id, cache_before=cache_length, assistant_input_ids=assistant_input_ids
+ )
+ except Exception:
+ self._last_streaming_text = None
+ else:
+ self._last_streaming_text = None
+
+ # Finally send the end signal
+ if generate_audio:
+ yield None, None
+ else:
+ return
+ return
+
+ # LLM chunk generate outer loop
+ for chunk_idx in range(num_chunks_decode):
+ is_first_generate_chunk = chunk_idx == 0
+
+ output = llm_streaming_generator.chunk_generate(
+ inputs_embeds=generation_inputs_embeds,
+ past_key_values=self.llm_past_key_values,
+ is_first_generate_chunk=is_first_generate_chunk,
+ return_hidden_states=True,
+ chunk_size=generate_chunk_size + 1 * is_first_generate_chunk,
+ do_sample=do_sample,
+ temperature=kwargs.get("temperature", 0.7),
+ top_p=kwargs.get("top_p", 0.8),
+ top_k=kwargs.get("top_k", 100),
+ repetition_penalty=kwargs.get("repetition_penalty", 1.02),
+ length_penalty=kwargs.get("length_penalty", 1.0),
+ all_input_ids=generated_ids,
+ )
+
+ if output.chunk_token_ids is None:
+ break
+
+ if is_first_generate_chunk:
+ if generate_audio:
+ spk_emb = torch.empty(
+ (bos_input_embeds.shape[0], 0, bos_input_embeds.shape[2]),
+ dtype=bos_input_embeds.dtype,
+ device=bos_input_embeds.device,
+ )
+ tts_streaming_generator.spk_emb = spk_emb
+
+ if output.finished:
+ yield_chunk_token_ids = output.chunk_token_ids
+ else:
+ # the first chunk generated chunk_size + 1 tokens, we only take the first chunk_size tokens,
+ # the last token is not prefilled, and last hidden states is not obtained
+ yield_chunk_token_ids = output.chunk_token_ids[:, :-1]
+
+ elif output.finished:
+ yield_chunk_token_ids = torch.cat([generated_ids[:, -1:], output.chunk_token_ids], dim=1)
+ else:
+ # in the chunk that is not the first chunk, we need to add the token at the end of the previous chunk,
+ # it is not prefilled into the model to get last hidden states
+ # similarly, the last generated token of subsequent chunks is not prefilled, and last hidden states is not obtained,
+ # so it is not passed out
+ yield_chunk_token_ids = torch.cat([generated_ids[:, -1:], output.chunk_token_ids[:, :-1]], dim=1)
+
+ if not generate_audio:
+ chunk_generated_text = tokenizer.decode(yield_chunk_token_ids[0])
+ yield yield_chunk_token_ids, output.finished
+ else:
+ # TTS inner loop
+ # dense connection here is hardcoded to use text-hidden merged as condition
+ llm_embeds = self.tts.emb_text(yield_chunk_token_ids)
+ hidden_embeds = output.last_hidden_states
+ hidden_embeds = self.tts.projector_semantic(hidden_embeds)
+ if self.tts.config.normalize_projected_hidden: # default should be opened
+ hidden_embeds = F.normalize(hidden_embeds, p=2, dim=-1)
+
+ tts_embeds = llm_embeds + hidden_embeds
+ conditions.append(tts_embeds)
+
+ # Store token IDs instead of decoded text to avoid UTF-8 multi-byte character truncation
+ if not hasattr(self, "_streaming_generated_token_ids"):
+ self._streaming_generated_token_ids = []
+ self._streaming_generated_token_ids.extend(yield_chunk_token_ids[0].tolist())
+
+ # there is buffer generated, each time exactly returns 25 audio tokens,
+ # the last audio chunk returns audio tokens of variable length, length [0, 25]
+ tts_generator = tts_streaming_generator.generate_with_buffer(
+ condition=tts_embeds, text_finished=output.finished
+ )
+
+ for audio_token_chunk, is_last_audio_chunk in tts_generator:
+ yield audio_token_chunk, is_last_audio_chunk
+
+ generated_ids = torch.cat([generated_ids, output.chunk_token_ids], dim=1)
+ generation_inputs_embeds = output.current_inputs_embeds
+ self.llm_past_key_values = output.past_key_values
+
+ if output.finished:
+ if generate_audio:
+ self.tts_last_turn_tokens = tts_streaming_generator.tts_last_turn_tokens
+ break
+
+ # IMPORTANT: Flush remaining TTS buffer when LLM generation ends
+ # This handles BOTH cases:
+ # 1. LLM finished with terminator (output.finished=True) - buffer may still have tokens
+ # 2. LLM hit max chunks limit (output.finished=False) - buffer definitely has tokens
+ if generate_audio:
+ if len(tts_streaming_generator._token_buffer) > 0:
+ batch = torch.cat(tts_streaming_generator._token_buffer, dim=1)
+ yield batch, True
+ tts_streaming_generator._token_buffer = []
+
+ if generate_audio:
+ if hasattr(self, "_streaming_generated_token_ids"):
+ try:
+ self._last_streaming_text = tokenizer.decode(self._streaming_generated_token_ids)
+ assistant_input_ids = self._encode_text(tokenizer=tokenizer, text=self._last_streaming_text)
+ self._finalize_round(
+ round_id=host_round_id, cache_before=cache_length, assistant_input_ids=assistant_input_ids
+ )
+ except Exception:
+ self._last_streaming_text = None
+ else:
+ self._last_streaming_text = None
+
+ yield None, None
+ else:
+ return
+
+ # iter for generating text chunk and audio chunk
+ audio_chunk_generator_iter = audio_chunk_generator(
+ bos_input=bos_input,
+ tokenizer=self.processor.tokenizer,
+ generate_audio=generate_audio,
+ tts_sampling_params=tts_sampling_params,
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ teacher_forcing=teacher_forcing,
+ teacher_forcing_text=teacher_forcing_text,
+ **kwargs,
+ )
+
+ if generate_audio:
+ if self.tts.config.audio_tokenizer_type == "s3tokenizer_step_audio":
+ self.tts.audio_tokenizer.stream_cache = torch_clone_recursive(self.token2wav_cache["flow_cache_base"])
+ self.tts.audio_tokenizer.hift_cache_dict = torch_clone_recursive(
+ self.token2wav_cache["hift_cache_base"]
+ )
+
+ # pre-insert 3-5 prefix 4218 silence tokens, each token corresponds to 0.04s,
+ # adding 5 tokens means introducing 0.2s of silence
+ buffer = [4218] * 3
+ pre_lookahead = 3
+ CHUNK_SIZE = 25
+ chunk_idx = 0
+ prev_text_len = 0 # track text position for streaming text output
+ for audio_token_chunk, is_last_audio_chunk in audio_chunk_generator_iter:
+ if audio_token_chunk is None:
+ break
+
+ buffer += audio_token_chunk.reshape(-1).tolist()
+
+ if len(buffer) >= CHUNK_SIZE + pre_lookahead:
+ waveform_chunk = self.tts.audio_tokenizer.stream(
+ buffer[: CHUNK_SIZE + pre_lookahead],
+ prompt_wav=None,
+ last_chunk=is_last_audio_chunk,
+ return_waveform=True,
+ )
+
+ waveform_chunk = torch.from_numpy(waveform_chunk)
+
+ # get new text chunk corresponding to this waveform
+ # Decode from accumulated token IDs to avoid UTF-8 multi-byte truncation
+ new_text = ""
+ if hasattr(self, "_streaming_generated_token_ids"):
+ current_text = self.processor.tokenizer.decode(self._streaming_generated_token_ids)
+ # Filter out trailing replacement characters (incomplete UTF-8 sequences)
+ safe_end = len(current_text)
+ while safe_end > 0 and current_text[safe_end - 1] == "\ufffd":
+ safe_end -= 1
+ safe_text = current_text[:safe_end]
+ new_text = safe_text[prev_text_len:]
+ prev_text_len = len(safe_text)
+
+ yield waveform_chunk, new_text
+
+ buffer = buffer[CHUNK_SIZE:]
+ chunk_idx += 1
+
+ # flush rest
+ if len(buffer) > 0:
+ waveform_chunk = self.tts.audio_tokenizer.stream(
+ buffer,
+ prompt_wav=None,
+ last_chunk=True,
+ return_waveform=True,
+ )
+
+ waveform_chunk = torch.from_numpy(waveform_chunk)
+
+ # get remaining new text for the final chunk
+ # Final chunk: decode all remaining text without filtering
+ new_text = ""
+ if hasattr(self, "_streaming_generated_token_ids"):
+ current_text = self.processor.tokenizer.decode(self._streaming_generated_token_ids)
+ new_text = current_text[prev_text_len:]
+ prev_text_len = len(current_text)
+
+ yield waveform_chunk, new_text
+
+ # maybe the buffer is empty, and text is not empty, should we flush text without wave?
+ else:
+ raise NotImplementedError(f"not supported audio tokenizer: {self.tts.config.audio_tokenizer_type}")
+ else:
+ # For text-only generation, decode tokens and handle partial multi-byte characters
+ yield from streaming_token_decoder(
+ audio_chunk_generator_iter,
+ self.processor.tokenizer,
+ skip_special_tokens=False,
+ )
+
+ def as_duplex(self, device: Optional[str] = None, **kwargs) -> "MiniCPMODuplex":
+ """Convert this MiniCPMO instance to MiniCPMODuplex for full-duplex streaming."""
+ return MiniCPMODuplex.from_existing_model(
+ model=self,
+ device=device,
+ **kwargs,
+ )
+
+
class MiniCPMODuplex:
    """MiniCPMODuplex model with full-duplex streaming capabilities.

    This is a wrapper class that provides duplex streaming functionality.
    Use MiniCPMO.as_duplex() to create from an existing model without reloading.
    """

    # Default duplex parameters. from_existing_model() falls back to these for
    # any keyword argument the caller does not supply.
    _default_duplex_params = {
        # overall behavior
        "generate_audio": True,
        "ls_mode": "explicit",
        # LLM text-generation sampling
        "max_new_speak_tokens_per_chunk": 20,
        "text_repetition_penalty": 1.05,
        "temperature": 0.7,
        "top_k": 100,
        "top_p": 0.8,
        "text_repetition_window_size": 512,
        # listen/speak control
        "listen_prob_scale": 1.0,
        "force_listen_count": 0,
        # TTS sampling and initialization (enable_float16 / n_timesteps are
        # passed to model.init_tts in from_existing_model)
        "tts_temperature": 0.8,
        "tts_repetition_penalty": 1.05,
        "enable_float16": False,
        "n_timesteps": 10,
        # streaming audio input timing (milliseconds) and sample rate (Hz)
        "chunk_ms": 1000,
        "first_chunk_ms": 1035,
        "cnn_redundancy_ms": 20,
        "sample_rate": 16000,
        # KV-cache sliding-window configuration (consumed via DuplexWindowConfig)
        "sliding_window_mode": "off",
        "basic_window_high_tokens": 8000,
        "basic_window_low_tokens": 6000,
        "context_previous_max_tokens": 500,
        "context_max_units": 24,
    }
+
    @classmethod
    def from_existing_model(
        cls,
        model: "MiniCPMO",
        device: Optional[str] = None,
        **kwargs,
    ) -> "MiniCPMODuplex":
        """Create MiniCPMODuplex from an existing MiniCPMO instance.

        Reuses ``model``'s weights, processor, and tokenizer without reloading.
        Any keyword argument overrides the matching entry in
        ``_default_duplex_params``. Also initializes TTS (via
        ``model.init_tts``), the break/stop events, the special token ids, the
        StreamDecoder, and its sliding-window configuration.

        Args:
            model: loaded MiniCPMO model to wrap.
            device: target device string; when None, inferred from the model's
                parameters (falls back to "cuda" if it has none).
            **kwargs: overrides for ``_default_duplex_params`` entries.

        Returns:
            A fully wired MiniCPMODuplex instance (``__init__`` is bypassed).
        """
        # Create instance without calling __init__
        instance = cls.__new__(cls)

        instance.name_or_path = getattr(model.config, "_name_or_path", "")

        # Get default params helper
        def get_param(name):
            if name in kwargs:
                return kwargs[name]
            return cls._default_duplex_params.get(name)

        instance.generate_audio = get_param("generate_audio")
        instance.ls_mode = get_param("ls_mode")

        # Determine device
        if device is not None:
            instance.device = device
        else:
            try:
                instance.device = str(next(model.parameters()).device)
            except StopIteration:
                instance.device = "cuda"

        # Reuse the existing model - THIS IS THE KEY: no reloading!
        instance.model = model
        instance.processor = getattr(model, "processor", None)
        instance.tokenizer = getattr(instance.processor, "tokenizer", None) if instance.processor else None

        # Lazily load tokenizer/processor from the checkpoint path if the
        # wrapped model does not already carry them.
        if instance.tokenizer is None:
            from transformers import AutoTokenizer

            instance.tokenizer = AutoTokenizer.from_pretrained(instance.name_or_path, trust_remote_code=True)

        if instance.processor is None:
            from .processing_minicpmo import MiniCPMOProcessor

            instance.processor = MiniCPMOProcessor.from_pretrained(instance.name_or_path, trust_remote_code=True)
            instance.processor.tokenizer = instance.tokenizer

        # Ensure model has processor reference (same as __init__)
        instance.model.processor = instance.processor

        # Initialize TTS (same as __init__)
        enable_float16 = get_param("enable_float16")
        n_timesteps = get_param("n_timesteps")
        instance.model.init_tts(enable_float16=enable_float16, n_timesteps=n_timesteps)

        instance.break_event = threading.Event()
        instance.session_stop_event = threading.Event()

        # LLM generation config
        instance.max_new_speak_tokens_per_chunk = get_param("max_new_speak_tokens_per_chunk")
        instance.text_repetition_penalty = get_param("text_repetition_penalty")
        instance.temperature = get_param("temperature")
        instance.top_k = get_param("top_k")
        instance.top_p = get_param("top_p")
        instance.text_repetition_window_size = get_param("text_repetition_window_size")
        instance.listen_prob_scale = get_param("listen_prob_scale")
        instance.force_listen_count = get_param("force_listen_count")

        # TTS generation config
        tts_temp_value = get_param("tts_temperature")
        instance.tts_temperature = torch.tensor([tts_temp_value], dtype=torch.float, device=instance.device)
        instance.tts_repetition_penalty = get_param("tts_repetition_penalty")

        # Stream config (mirrored onto the wrapped model below)
        instance.CHUNK_MS = get_param("chunk_ms")
        instance.FIRST_CHUNK_MS = get_param("first_chunk_ms")
        instance.CNN_REDUNDANCY_MS = get_param("cnn_redundancy_ms")
        instance.SAMPLE_RATE = get_param("sample_rate")

        instance.model.CHUNK_MS = instance.CHUNK_MS
        instance.model.FIRST_CHUNK_MS = instance.FIRST_CHUNK_MS
        instance.model.CNN_REDUNDANCY_MS = instance.CNN_REDUNDANCY_MS
        instance.model.SAMPLE_RATE = instance.SAMPLE_RATE

        # Special tokens
        # NOTE(review): the following five lookups use empty token strings; for
        # most tokenizers convert_tokens_to_ids("") yields the unk id — it looks
        # like the intended special-token literals were lost. Verify against the
        # tokenizer's special-token map.
        instance.unit_token_id = instance.tokenizer.convert_tokens_to_ids("")
        instance.image_start_token_id = instance.tokenizer.convert_tokens_to_ids("")
        instance.image_end_token_id = instance.tokenizer.convert_tokens_to_ids("")
        instance.slice_start_token_id = instance.tokenizer.convert_tokens_to_ids("")
        instance.slice_end_token_id = instance.tokenizer.convert_tokens_to_ids("")

        instance.listen_token_id = instance.tokenizer.convert_tokens_to_ids("<|listen|>")
        instance.speak_token_id = instance.tokenizer.convert_tokens_to_ids("<|speak|>")
        instance.tts_bos_token_id = instance.tokenizer.convert_tokens_to_ids("<|tts_bos|>")
        instance.tts_eos_token_id = instance.tokenizer.convert_tokens_to_ids("<|tts_eos|>")

        instance.chunk_eos_token_id = instance.tokenizer.convert_tokens_to_ids("<|chunk_eos|>")
        instance.chunk_tts_eos_token_id = instance.tokenizer.convert_tokens_to_ids("<|chunk_tts_eos|>")
        instance.turn_eos_token_id = instance.tokenizer.convert_tokens_to_ids("<|turn_eos|>")

        instance.chunk_terminator_token_ids = [
            instance.listen_token_id,
            instance.chunk_eos_token_id,
            instance.chunk_tts_eos_token_id,
        ]
        instance.turn_terminator_token_ids = [instance.turn_eos_token_id]
        instance.chunk_speak_token_ids = [instance.speak_token_id]

        instance.tts_pad_id = instance.tokenizer.convert_tokens_to_ids("<|tts_pad|>")
        bad_token_ids = getattr(instance.tokenizer, "bad_token_ids", [])
        instance.forbidden_token_ids = [instance.tts_pad_id] + list(bad_token_ids)

        from .utils import StreamDecoder

        instance.decoder = StreamDecoder(
            llm=instance.model.llm, tokenizer=instance.tokenizer, forbidden_token_ids=instance.forbidden_token_ids
        )

        # Sliding window config
        sliding_window_mode = get_param("sliding_window_mode")
        basic_window_high_tokens = get_param("basic_window_high_tokens")
        basic_window_low_tokens = get_param("basic_window_low_tokens")
        context_previous_max_tokens = get_param("context_previous_max_tokens")
        context_max_units = get_param("context_max_units")

        instance.decoder.set_window_config(
            DuplexWindowConfig(
                sliding_window_mode=sliding_window_mode,
                basic_window_high_tokens=basic_window_high_tokens,
                basic_window_low_tokens=basic_window_low_tokens,
                context_previous_max_tokens=context_previous_max_tokens,
                context_max_units=context_max_units,
            )
        )
        window_enabled = sliding_window_mode != "off"
        instance.decoder.set_window_enabled(window_enabled)

        # TTS logits machinery is only needed when audio output is enabled.
        instance.tts_logits_processors = None
        instance.tts_eos_token = None
        if instance.generate_audio:
            instance.tts_logits_processors = gen_logits(
                num_code=instance.model.tts.config.num_audio_tokens,
                repetition_penalty=instance.tts_repetition_penalty,
            )
            instance.tts_eos_token = torch.tensor(
                [instance.model.tts.config.num_audio_tokens - 1],
                dtype=torch.long,
                device=instance.device,
            )

        instance._reset_streaming_state()

        return instance
+
+ def set_break_event(self):
+ self.break_event.set()
+
+ def clear_break_event(self):
+ self.break_event.clear()
+
+ def set_session_stop(self):
+ self.session_stop_event.set()
+ self.break_event.set()
+
+ def clear_session_stop(self):
+ self.session_stop_event.clear()
+
+ def is_break_set(self) -> bool:
+ return self.break_event.is_set()
+
+ def is_session_stop_set(self) -> bool:
+ return self.session_stop_event.is_set()
+
+ def _init_token2wav_cache(self, prompt_wav_path: str):
+ self.model.tts.audio_tokenizer.cache = None
+ flow_cache, hift_cache = self.model.tts.audio_tokenizer.set_stream_cache(prompt_wav_path)
+ self.flow_cache_base = torch_clone_recursive(flow_cache)
+ self.hift_cache_base = torch_clone_recursive(hift_cache)
+ self.pre_lookahead = int(self.model.tts.audio_tokenizer.flow.pre_lookahead_len)
+ self.token2wav_initialized = True
+
+ def _reset_token2wav_for_new_turn(self):
+ if self.token2wav_initialized:
+ self.model.tts.audio_tokenizer.stream_cache = torch_clone_recursive(self.flow_cache_base)
+ self.model.tts.audio_tokenizer.hift_cache_dict = torch_clone_recursive(self.hift_cache_base)
+ self.token2wav_buffer = [4218] * 3 # silence token prefix
+
+ def _reset_streaming_state(self):
+ self.audio_chunk_idx = 0
+ self.current_turn_ended = True
+ self.speak_count = 0
+ self.res_ids = []
+ self.total_ids = []
+ self.total_hidden = []
+
+ # TTS state
+ self.tts_text_start_pos = 0
+ self.tts_past_key_values = None
+ self.tts_current_turn_start_time = None
+
+ # token2wav state
+ self.token2wav_initialized = False
+ self.token2wav_buffer = []
+ self.flow_cache_base = None
+ self.hift_cache_base = None
+
+ # Audio prefill state
+ self.audio_buffer = np.array([], dtype=np.float32)
+ self.pending_logits: Optional[torch.Tensor] = None
+ self.current_mode: Optional[str] = None
+
+ # Force listen state
+ self._streaming_generate_count = 0
+
+ # Schema tracking: record the complete prefill + generate token sequence
+ # prefill_schema_tokens: each element is a list of prefill tokens for a unit
+ # format: [[unit0_prefill_tokens], [unit1_prefill_tokens], ...]
+ self.prefill_schema_tokens = []
+ self._current_unit_prefill_tokens = []
+
    def prepare(
        self,
        prefix_system_prompt: Optional[str] = None,
        ref_audio: Optional[np.ndarray] = None,
        prompt_wav_path: Optional[str] = None,
        context_previous_marker: str = "\n\nprevious: ",
        **kwargs,
    ):
        """Reset per-session state and prefill the system turn into the decoder.

        Args:
            prefix_system_prompt: system prompt text; defaults to
                "Streaming Omni Conversation." when falsy.
            ref_audio: optional reference audio; its embedding is wrapped in
                <|audio_start|>/<|audio_end|> inside the system turn.
            prompt_wav_path: optional prompt wav used to initialize the
                token2wav caches (only when audio generation is enabled).
            context_previous_marker: marker string used by context-preserve
                sliding-window mode for preserved context after the first slide.
            **kwargs: accepted for API compatibility (unused here).

        Returns:
            The full system prompt string that was prefilled, with any audio
            embedding shown as "[audio embedding]"; "" when nothing was fed.
        """
        prefix_system_prompt = prefix_system_prompt or "Streaming Omni Conversation."

        prefix_system_prompt = "<|im_start|>system\n" + prefix_system_prompt
        suffix_system_prompt = "<|im_end|>"
        if isinstance(ref_audio, np.ndarray):
            prefix_system_prompt += "\n<|audio_start|>"
            suffix_system_prompt = "<|audio_end|>" + suffix_system_prompt

        self.clear_break_event()
        self.clear_session_stop()

        self._reset_streaming_state()
        self.decoder.reset()

        self.model.init_streaming_processor()

        if prompt_wav_path is not None and prompt_wav_path and self.generate_audio:
            self._init_token2wav_cache(prompt_wav_path)
            self._reset_token2wav_for_new_turn()

        # Prefill system prompt prefix (token by token)
        if prefix_system_prompt:
            tokens = self.tokenizer.encode(prefix_system_prompt, add_special_tokens=False)
            for token_id in tokens:
                self.decoder.feed(self.decoder.embed_token(token_id))

        # Prefill reference audio (flatten the nested per-group embeddings)
        if ref_audio is not None:
            data = self.processor.process_audio([ref_audio])
            embeds_nested = self.model.get_audio_embedding(data, chunk_length=self.model.config.audio_chunk_length)
            embeds = torch.cat([t for g in embeds_nested for t in g], dim=0) if embeds_nested else None
            if embeds is not None:
                self.decoder.feed(embeds)

        # register system prompt protection length (protect this part from being removed when sliding window is enabled)
        if prefix_system_prompt or suffix_system_prompt or ref_audio is not None:
            if self.decoder._window_config.sliding_window_mode == "context":
                # Context preserve mode:
                # initial layout: [prefix] [suffix] [units...]
                # after the first sliding window: [prefix] [context_previous_marker + content] [suffix] [units...]
                # register prefix length first, then feed suffix
                self._prefix_system_prompt = prefix_system_prompt
                self._suffix_system_prompt = suffix_system_prompt
                self._ref_audio = ref_audio

                suffix_token_ids = []
                if suffix_system_prompt:
                    suffix_token_ids = self.tokenizer.encode(suffix_system_prompt, add_special_tokens=False)

                # register (when cache only has prefix, no suffix, no previous)
                self.decoder.register_system_prompt_with_context(
                    suffix_token_ids=suffix_token_ids,
                    context_previous_marker=context_previous_marker,  # dynamically added after the first sliding window
                )

                # now feed suffix
                for token_id in suffix_token_ids:
                    self.decoder.feed(self.decoder.embed_token(token_id))
            else:
                # non-context preserve mode: first feed suffix, then register total length
                if suffix_system_prompt:
                    tokens = self.tokenizer.encode(suffix_system_prompt, add_special_tokens=False)
                    for token_id in tokens:
                        self.decoder.feed(self.decoder.embed_token(token_id))
                self.decoder.register_system_prompt()

        # Build the human-readable prompt string that mirrors what was prefilled.
        if prefix_system_prompt or suffix_system_prompt:
            if ref_audio is not None:
                full_prompt = (prefix_system_prompt or "") + "[audio embedding]" + (suffix_system_prompt or "")
            else:
                full_prompt = (prefix_system_prompt or "") + (suffix_system_prompt or "")

            return full_prompt

        return ""
+
    @torch.no_grad()
    def streaming_prefill(
        self,
        audio_waveform: Optional[np.ndarray] = None,
        frame_list: Optional[list] = None,
        text_list: Optional[list] = None,
        max_slice_nums: Union[int, List[int]] = 1,
        batch_vision_feed: bool = False,
    ):
        """Streaming prefill - called once per second, processing audio/video/text data.

        Args:
            audio_waveform: audio waveform data (1-D float array; accumulated into
                ``self.audio_buffer`` until a full streaming chunk is available)
            frame_list: image frame list
            text_list: list of text strings (concatenated and tokenized)
            max_slice_nums: maximum number of slices for HD image encoding (default 1, no slicing)
                Can be an int (same for all images) or a list matching frame_list length
            batch_vision_feed: if True, batch all vision embeddings into a single feed call for better performance.
                if False (default), feed each embedding individually (original behavior).

        Process:
            0. determine mode based on input: AUDIO / VISION / OMNI / TEXT
            1. feed unit-start token
            2. get and feed image embed (if frame_list) - produces pending logits in VISION mode
            3. get and feed audio embed (if audio_waveform) - produces pending logits in AUDIO/OMNI mode
            4. feed text tokens (if text_list) - produces pending logits in TEXT mode

        Returns:
            dict with keys:
                - success: bool
                - reason: str (failure description; "; "-joined if multiple)
                - cost_vision_process: float (image processing time)
                - cost_vision_embed: float (vision embedding time)
                - cost_vision_feed: float (vision feed time)
                - cost_audio_process: float (audio processing time)
                - cost_audio_embed: float (audio embedding time)
                - cost_audio_feed: float (audio feed time)
                - cost_all: float (total time)
        """
        start_time = time.time()
        # per-stage timing accumulators, all reported even on early failure
        cost_vision_process = 0.0
        cost_vision_embed = 0.0
        cost_vision_feed = 0.0
        cost_audio_process = 0.0
        cost_audio_embed = 0.0
        cost_audio_feed = 0.0

        def _make_result(success, reasons=""):
            # Build the uniform result dict; `reasons` may be a single string
            # or a list of strings (joined with "; ").
            reason = reasons
            if isinstance(reasons, list):
                reason = "; ".join(reasons)

            return {
                "success": success,
                "reason": reason,
                "cost_vision_process": cost_vision_process,
                "cost_vision_embed": cost_vision_embed,
                "cost_vision_feed": cost_vision_feed,
                "cost_audio_process": cost_audio_process,
                "cost_audio_embed": cost_audio_embed,
                "cost_audio_feed": cost_audio_feed,
                "cost_all": time.time() - start_time,
            }

        # bail out early if the session was stopped or interrupted
        if self.is_session_stop_set() or self.is_break_set():
            return _make_result(False)

        has_frames = frame_list is not None and len(frame_list) > 0
        has_audio = audio_waveform is not None and len(audio_waveform) > 0
        has_text = text_list is not None and len(text_list) > 0

        # Step 0: determine which modality combination this call carries
        if has_frames and has_audio:
            mode = "OMNI"
        elif has_frames:
            mode = "VISION"
        elif has_audio:
            mode = "AUDIO"
        elif has_text:
            mode = "TEXT"
        else:
            return _make_result(False)

        # clear any logits left over from a previous prefill
        self.pending_logits = None

        # sliding window: record unit start position
        self.decoder.register_unit_start()

        # Schema tracking: start new unit, record prefill tokens
        self._current_unit_prefill_tokens = []

        # Step 1: Feed the unit-start token
        self.decoder.feed(self.decoder.embed_token(self.unit_token_id))
        self._current_unit_prefill_tokens.append(self.unit_token_id)

        # Step 2: process image
        if has_frames:
            t0 = time.time()

            # normalize max_slice_nums to a list matching frame_list length
            if isinstance(max_slice_nums, int):
                max_slice_nums_list = [max_slice_nums] * len(frame_list)
            else:
                max_slice_nums_list = list(max_slice_nums)
                if len(max_slice_nums_list) != len(frame_list):
                    raise ValueError(
                        f"max_slice_nums list length ({len(max_slice_nums_list)}) "
                        f"must match frame_list length ({len(frame_list)})"
                    )

            # check if all max_slice_nums are the same (can use batch processing)
            all_same = len(set(max_slice_nums_list)) == 1

            if all_same:
                # all images use the same max_slice_nums, use batch processing
                processed_frames = self.processor.process_image(frame_list, max_slice_nums=max_slice_nums_list[0])
                # NOTE(review): relies on truthiness of self.device — a
                # torch.device is always truthy, None skips the move.
                if self.device:
                    processed_frames = processed_frames.to(self.device)
            else:
                # different max_slice_nums per image, process individually and merge
                all_pixel_values = []
                all_tgt_sizes = []
                for frame, max_slices in zip(frame_list, max_slice_nums_list):
                    pf = self.processor.process_image([frame], max_slice_nums=max_slices)
                    if self.device:
                        pf = pf.to(self.device)
                    # pf["pixel_values"][0] is the list of slices for this image
                    all_pixel_values.extend(pf["pixel_values"][0])
                    # pf["tgt_sizes"][0] is the array of target sizes for this image's slices
                    if hasattr(pf["tgt_sizes"][0], "tolist"):
                        all_tgt_sizes.extend(pf["tgt_sizes"][0].tolist())
                    else:
                        all_tgt_sizes.extend(list(pf["tgt_sizes"][0]))

                # reconstruct processed_frames with merged data
                processed_frames = {
                    "pixel_values": [all_pixel_values],
                    "tgt_sizes": [torch.tensor(all_tgt_sizes) if all_tgt_sizes else []],
                }

            cost_vision_process = time.time() - t0

            t0 = time.time()
            # get vision embeddings for all images (each may have multiple slices)
            # vision_hidden_states is a list, one entry per input image
            # each entry contains embeddings for [source_image, slice_1, slice_2, ...]
            vision_hidden_states = self.model.get_vision_embedding(processed_frames)
            cost_vision_embed = time.time() - t0

            if vision_hidden_states is not None and len(vision_hidden_states) > 0:
                t0 = time.time()

                # vision_hidden_states[0] contains ALL slices from ALL images (flattened)
                # shape: [total_slices, 64, D] where total_slices = sum of slices across all images
                # we need to know how many slices each image has to correctly group them

                # calculate slice counts for each image using get_sliced_grid (lightweight, no actual slicing)
                slice_counts = []  # e.g., [5, 9] means img1 has 5 slices (1 source + 4 HD), img2 has 9 slices
                for frame_idx, frame in enumerate(frame_list):
                    max_slices = max_slice_nums_list[frame_idx]
                    if hasattr(frame, "size"):
                        # get_sliced_grid returns [M, N] grid or None if no slicing needed
                        # total images = 1 (source) + M * N (HD slices)
                        grid = self.processor.image_processor.get_sliced_grid(
                            frame.size, max_slices, nerver_split=False
                        )
                        if grid is not None:
                            slice_counts.append(1 + grid[0] * grid[1])  # 1 source + M*N slices
                        else:
                            slice_counts.append(1)  # no slicing, only source image
                    else:
                        slice_counts.append(1)  # default: single image, no slicing

                # get the flattened embeddings tensor
                # vision_hidden_states is a list with one element (the batch)
                # vision_hidden_states[0] shape: [total_slices, 64, D]
                all_embeds = vision_hidden_states[0]

                # collect all feed operations first, then execute
                # this allows us to identify the last token for VISION mode logits
                feed_operations = []  # List of (embed, is_last_for_vision_mode, token_id_or_none)

                embed_idx = 0  # current index in all_embeds
                for img_idx, num_slices in enumerate(slice_counts):
                    if num_slices == 0:
                        continue

                    # the first embedding is always the source image (downsampled overview)
                    # Feed the image-start token
                    feed_operations.append(
                        (self.decoder.embed_token(self.image_start_token_id), False, self.image_start_token_id)
                    )
                    # Feed source image embedding (shape: [64, D]) - use None to indicate embedding
                    feed_operations.append((all_embeds[embed_idx], False, None))
                    # Feed the image-end token
                    feed_operations.append(
                        (self.decoder.embed_token(self.image_end_token_id), False, self.image_end_token_id)
                    )
                    embed_idx += 1

                    # remaining embeddings are HD slices (if num_slices > 1)
                    if num_slices > 1:
                        for slice_i in range(1, num_slices):
                            # Feed the slice-start token
                            feed_operations.append(
                                (self.decoder.embed_token(self.slice_start_token_id), False, self.slice_start_token_id)
                            )
                            # Feed slice embedding (shape: [64, D])
                            feed_operations.append((all_embeds[embed_idx], False, None))
                            # Feed the slice-end token
                            feed_operations.append(
                                (self.decoder.embed_token(self.slice_end_token_id), False, self.slice_end_token_id)
                            )
                            embed_idx += 1

                # mark the last operation for VISION mode logits
                if feed_operations:
                    feed_operations[-1] = (feed_operations[-1][0], True, feed_operations[-1][2])

                # execute feed operations
                if batch_vision_feed and feed_operations:
                    # batch mode: concatenate all embeddings and feed at once
                    # this reduces LLM forward passes from N to 1
                    #
                    # NOTE: batch mode may have slight numerical differences compared to for-loop mode
                    # due to floating-point precision in attention computation. This is expected behavior
                    # for causal attention with incremental vs batch computation.

                    all_embeds_list = []
                    for embed, is_last, token_id in feed_operations:
                        # ensure all embeddings have shape [L, H]
                        if embed.dim() == 1:
                            embed = embed.unsqueeze(0)
                        all_embeds_list.append(embed)

                    # concatenate all embeddings
                    # torch.cat requires consistent dtype; embeddings should already be same dtype
                    all_embeds_to_feed = torch.cat(all_embeds_list, dim=0)  # [total_L, H]

                    if mode == "VISION":
                        # vision mode needs logits from the last token
                        self.pending_logits, _ = self.decoder.feed(all_embeds_to_feed, return_logits=True)
                    else:
                        # omni mode: just feed, wait for audio to get logits
                        self.decoder.feed(all_embeds_to_feed)

                    # schema tracking: record all token IDs and embedding markers
                    for embed, is_last, token_id in feed_operations:
                        if token_id is not None:
                            self._current_unit_prefill_tokens.append(token_id)
                        else:
                            embed_dim = embed.shape[0] if len(embed.shape) > 1 else 1
                            self._current_unit_prefill_tokens.append(("img", embed_dim))
                else:
                    for embed, is_last, token_id in feed_operations:
                        if mode == "VISION" and is_last:
                            # get logits from the last token
                            self.pending_logits, _ = self.decoder.feed(embed, return_logits=True)
                        else:
                            self.decoder.feed(embed)
                        # schema tracking: record token ID or embedding marker
                        if token_id is not None:
                            self._current_unit_prefill_tokens.append(token_id)
                        else:
                            # use tuple to mark image embedding: ("img", dim)
                            embed_dim = embed.shape[0] if len(embed.shape) > 1 else 1
                            self._current_unit_prefill_tokens.append(("img", embed_dim))
                # for omni mode, no pending logits needed here (wait for audio)

                cost_vision_feed = time.time() - t0

        # Step 3: process audio (if any)
        if has_audio:
            # accumulate audio to buffer
            self.audio_buffer = np.concatenate([self.audio_buffer, audio_waveform])

            # calculate required audio length; the first chunk is left-padded
            # with zeros to reach FIRST_CHUNK_MS
            if self.audio_chunk_idx == 0:
                required_samples = int(self.FIRST_CHUNK_MS * self.SAMPLE_RATE / 1000)
                if len(self.audio_buffer) < required_samples:
                    padding_samples = required_samples - len(self.audio_buffer)
                    padding = np.zeros(padding_samples, dtype=np.float32)
                    self.audio_buffer = np.concatenate([padding, self.audio_buffer])
            else:
                # NOTE(review): required_samples is unused in this branch —
                # need_samples below governs the availability check.
                required_samples = int(self.CHUNK_MS * self.SAMPLE_RATE / 1000)

            need_samples = self.processor.get_streaming_chunk_size()
            if len(self.audio_buffer) < need_samples:
                return _make_result(
                    False, f"audio not enough: need {need_samples} samples, only {len(self.audio_buffer)}"
                )

            audio_chunk = self.audio_buffer[:need_samples]

            t0 = time.time()
            batch_feature = self.processor.process_audio_streaming(
                audio_chunk,
                reset=False,
                return_batch_feature=True,
            )

            if batch_feature is None or batch_feature.audio_features.shape[-1] == 0:
                return _make_result(False, "streaming audio processing returned empty")

            # metadata: the first chunk has no left context, later chunks keep
            # 2 extra frames of context on each side
            batch_feature.chunk_idx = self.audio_chunk_idx
            batch_feature.use_extra_context = True
            batch_feature.prefix_extra_frames = 0 if self.audio_chunk_idx == 0 else 2
            batch_feature.suffix_extra_frames = 2

            batch_feature = batch_feature.to(self.device)
            cost_audio_process = time.time() - t0

            t0 = time.time()
            embeds_nested = self.model.get_audio_embedding_streaming(
                batch_feature,
                use_extra_context=batch_feature.use_extra_context,
                prefix_extra_frames=batch_feature.prefix_extra_frames,
                suffix_extra_frames=batch_feature.suffix_extra_frames,
            )
            # flatten the nested [groups][tensors] structure into one [L, D] tensor
            audio_embeds = torch.cat([t for g in embeds_nested for t in g], dim=0)
            cost_audio_embed = time.time() - t0

            t0 = time.time()
            # audio is always the last modality fed, so its logits become pending
            self.pending_logits, _ = self.decoder.feed(audio_embeds, return_logits=True)
            cost_audio_feed = time.time() - t0

            # schema tracking: use tuple to mark audio embedding: ("audio", dim)
            embed_dim = audio_embeds.shape[0] if len(audio_embeds.shape) > 1 else 1
            self._current_unit_prefill_tokens.append(("audio", embed_dim))

            # advance the buffer by exactly what the mel processor consumed
            if self.audio_chunk_idx == 0:
                cfg = self.processor._streaming_mel_processor.get_config()
                consumed_ms = int(cfg.get("effective_first_chunk_ms", self.FIRST_CHUNK_MS))
                consumed_samples = int(consumed_ms * self.SAMPLE_RATE / 1000)
            else:
                consumed_samples = int(self.CHUNK_MS * self.SAMPLE_RATE / 1000)

            self.audio_buffer = self.audio_buffer[consumed_samples:]

            self.audio_chunk_idx += 1

        # Step 4: process text
        if has_text:
            # concatenate all text items
            text_content = "".join(text_list) if isinstance(text_list, list) else str(text_list)

            # tokenize text
            text_token_ids = self.tokenizer.encode(text_content, add_special_tokens=False)

            if len(text_token_ids) > 0:
                # get token embeddings
                text_token_ids_tensor = torch.tensor(text_token_ids, dtype=torch.long, device=self.device)
                text_embeds = self.decoder.embed_token(text_token_ids_tensor)

                # feed to decoder
                if mode == "TEXT":
                    # text-only mode: get logits from the last token
                    self.pending_logits, _ = self.decoder.feed(text_embeds, return_logits=True)
                else:
                    # mixed mode: just feed, let other modality get logits
                    self.decoder.feed(text_embeds)

                # schema tracking: record text token IDs
                for token_id in text_token_ids:
                    self._current_unit_prefill_tokens.append(token_id)

        self.current_mode = mode

        # vision-only units still advance the logical clock by one chunk
        if mode == "VISION":
            self.audio_chunk_idx += 1

        # schema tracking: save current unit's prefill tokens
        self.prefill_schema_tokens.append(self._current_unit_prefill_tokens)

        return _make_result(True)
+
    @torch.no_grad()
    def streaming_generate(
        self,
        prompt_wav_path=None,
        max_new_speak_tokens_per_chunk=20,
        decode_mode: str = "sampling",
        temperature=0.7,
        top_k=100,
        top_p=0.8,
        listen_prob_scale=1.0,
        listen_top_k=None,
        text_repetition_penalty=1.05,
        text_repetition_window_size=512,
    ):
        """Generate one unit of response from the logits pending after streaming_prefill.

        Decodes up to ``max_new_speak_tokens_per_chunk`` text tokens, decides
        listen vs. speak, optionally runs the TTS + token2wav pipeline, and
        maintains the sliding-window KV cache bookkeeping.

        Args:
            prompt_wav_path: optional reference wav path for token2wav voice prompting.
            max_new_speak_tokens_per_chunk: hard cap on tokens decoded this unit.
            decode_mode: decoding strategy passed to ``self.decoder.decode``.
            temperature / top_k / top_p: sampling parameters.
            listen_prob_scale / listen_top_k: listen-token biasing controls.
            text_repetition_penalty / text_repetition_window_size: repetition controls.

        Returns:
            dict with is_listen, text, audio_waveform, end_of_turn, current_time,
            per-stage cost_* timings, n_tokens, and n_tts_tokens.
        """
        start_time = time.time()

        # session stopped/interrupted: emit a silent "listen" unit and end the turn
        if self.is_session_stop_set() or self.is_break_set():
            return {
                "is_listen": True,
                "text": "",
                "audio_waveform": self._generate_silence_waveform(),
                "end_of_turn": True,
                "current_time": self.audio_chunk_idx,
                "cost_llm": 0.0,
                "cost_tts_prep": 0.0,
                "cost_tts": 0.0,
                "cost_token2wav": 0.0,
                "cost_all": time.time() - start_time,
                "n_tokens": 0,
                "n_tts_tokens": 0,
            }

        # check if there are pending logits to process (set by streaming_prefill)
        if not hasattr(self, "pending_logits") or self.pending_logits is None:
            return {
                "is_listen": True,
                "text": "",
                "audio_waveform": self._generate_silence_waveform(),
                "end_of_turn": False,
                "current_time": self.audio_chunk_idx,
                "cost_llm": 0.0,
                "cost_tts_prep": 0.0,
                "cost_tts": 0.0,
                "cost_token2wav": 0.0,
                "cost_all": time.time() - start_time,
                "n_tokens": 0,
                "n_tts_tokens": 0,
            }

        # use pending logits generated in streaming_prefill (consume them once)
        logits = self.pending_logits
        self.pending_logits = None

        # Force listen: check if we should force listen for first N calls
        force_listen = self._streaming_generate_count < self.force_listen_count
        self._streaming_generate_count += 1

        total_hidden_in_unit = []
        total_ids_in_unit = []
        current_time = self.audio_chunk_idx
        is_listen = False
        end_of_turn = False

        llm_start_time = time.time()

        for j in range(max_new_speak_tokens_per_chunk):
            # budget exhausted: close the chunk explicitly (if in explicit
            # listen/speak mode) and stop decoding
            if j == max_new_speak_tokens_per_chunk - 1:
                if self.ls_mode == "explicit":
                    self.decoder.feed(self.decoder.embed_token(self.chunk_eos_token_id))
                    self.total_ids.append(self.chunk_eos_token_id)
                break

            if force_listen:
                last_id = torch.tensor([self.listen_token_id], dtype=torch.long, device=self.device)
            else:
                last_id = self.decoder.decode(
                    logits=logits,
                    mode=decode_mode,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                    listen_top_k=listen_top_k,
                    listen_prob_scale=listen_prob_scale,
                    text_repetition_penalty=text_repetition_penalty,
                    text_repetition_window_size=text_repetition_window_size,
                )

                # if current turn not ended, not allowed to listen (only check when not force_listen)
                if last_id.item() == self.listen_token_id and (not self.current_turn_ended):
                    last_id = torch.tensor([self.tts_bos_token_id], dtype=torch.long, device=self.device)

            self.total_ids.append(last_id.item())

            is_listen = last_id.item() == self.listen_token_id

            # termination condition detection
            if last_id.item() in self.chunk_terminator_token_ids:
                if self.ls_mode == "explicit":
                    logits, _ = self.decoder.feed(self.decoder.embed_token(last_id.item()), return_logits=True)
                break
            else:
                # normal speak: a speak token reopens the turn
                self.current_turn_ended = False

                # chunk-level speak markers are not part of the response text
                if last_id.item() in self.chunk_speak_token_ids:
                    pass
                else:
                    self.res_ids.append(last_id.item())
                    self.speak_count += 1

                logits, hidden = self.decoder.feed(self.decoder.embed_token(last_id.item()), return_logits=True)

                # hidden must be [1, 1, D]: one layer output for one new token
                assert len(hidden.shape) == 3
                assert hidden.shape[0] == 1
                assert hidden.shape[1] == 1

                end_of_turn = last_id.item() in self.turn_terminator_token_ids

                if end_of_turn:
                    self.current_turn_ended = True

                # skip j == 0 (the listen/speak decision token carries no content)
                if j != 0:
                    total_hidden_in_unit.append([last_id.item(), hidden, end_of_turn])
                    total_ids_in_unit.append(last_id.item())

        # Feed the unit-end token to close this unit in the KV cache.
        # NOTE(review): the empty token string below looks like the special
        # unit-end token literal may have been stripped from this file —
        # verify the intended token; convert_tokens_to_ids("") yields the
        # unk id on most tokenizers.
        unit_end_id = self.tokenizer.convert_tokens_to_ids("")
        self.decoder.feed(self.decoder.embed_token(unit_end_id))
        self.total_ids.append(unit_end_id)

        # calculate generated text (for sliding window context preserve, filter out special tokens)
        generated_text = self.tokenizer.decode(total_ids_in_unit, skip_special_tokens=True) if total_ids_in_unit else ""

        # sliding window: register unit end, and check if sliding window is needed
        input_type = self.current_mode.lower() if self.current_mode else "audio"

        self.decoder.register_unit_end(
            input_type=input_type,
            generated_tokens=total_ids_in_unit,
            is_listen=is_listen,
            generated_text=generated_text,
        )
        # select sliding window method based on sliding window mode
        if self.decoder._window_config.sliding_window_mode == "context":
            self.decoder.enforce_window_with_context()
        elif self.decoder._window_config.sliding_window_mode == "basic":
            self.decoder.enforce_window()

        llm_end_time = time.time()

        # listen unit: no text, no TTS — return silence
        if is_listen:
            self.total_hidden.append([])
            return {
                "is_listen": True,
                "text": "",
                "audio_waveform": self._generate_silence_waveform(),
                "end_of_turn": False,
                "current_time": current_time,
                "cost_llm": llm_end_time - llm_start_time,
                "cost_tts_prep": 0.0,
                "cost_tts": 0.0,
                "cost_token2wav": 0.0,
                "cost_all": time.time() - start_time,
                "n_tokens": len(total_ids_in_unit),
                "n_tts_tokens": 0,
            }

        self.total_hidden.append(total_hidden_in_unit)
        text = generated_text  # reuse already calculated text

        # text-only configuration: skip the whole TTS pipeline
        if not self.generate_audio:
            return {
                "is_listen": False,
                "text": text,
                "audio_waveform": None,
                "end_of_turn": end_of_turn,
                "current_time": current_time,
                "cost_llm": llm_end_time - llm_start_time,
                "cost_tts_prep": 0.0,
                "cost_tts": 0.0,
                "cost_token2wav": 0.0,
                "cost_all": time.time() - start_time,
                "n_tokens": len(total_ids_in_unit),
                "n_tts_tokens": 0,
            }

        # TTS generate
        tts_start_time = time.time()
        tts_prep_start_time = time.time()
        tts_condition = self._convert_results_to_tts_input(total_hidden_in_unit)
        tts_prep_end_time = time.time()

        # 25 audio tokens per second plus the bos/eos slot
        max_token_per_chunk = 25 + 1
        min_token_per_chunk = 25 + 1

        if end_of_turn:
            min_token_per_chunk = 0
        force_flush = False
        if self.tts_text_start_pos == 0:  # this is the start of the turn
            min_token_per_chunk = 0  # allow decoding <1s audio
            force_flush = True

        if self.tts_current_turn_start_time is None:
            self.tts_current_turn_start_time = current_time

        new_tokens, old_kv = self.model.tts.generate_chunk(
            inputs_embeds=tts_condition,
            temperature=self.tts_temperature,
            repetition_penalty=self.tts_repetition_penalty,
            eos_token=self.tts_eos_token,
            force_no_stop=False,
            max_new_token=max_token_per_chunk,
            min_new_tokens=min_token_per_chunk,
            past_key_values=self.tts_past_key_values,
            logits_processors=self.tts_logits_processors,
            text_start_pos=self.tts_text_start_pos,
        )

        tts_end_time = time.time()

        # update TTS state (note: token2wav reset must be after audio generation, otherwise tokens in buffer will be lost)
        if end_of_turn:
            self.tts_text_start_pos = 0
            self.tts_past_key_values = None
            self.tts_current_turn_start_time = None
        else:
            self.tts_past_key_values = old_kv
            self.tts_text_start_pos += tts_condition.shape[1] + new_tokens.shape[1]

        # token2wav generation (must be before reset, otherwise tokens in the last but second chunk will be lost)
        token2wav_start_time = time.time()
        audio_waveform = self._generate_waveform_from_tokens(
            new_tokens, prompt_wav_path, end_of_turn, force_flush=force_flush
        )
        token2wav_end_time = time.time()

        # reset token2wav state after audio generation, ensure all tokens in buffer are processed
        if end_of_turn:
            self._reset_token2wav_for_new_turn()

        end_time = time.time()

        return {
            "is_listen": False,
            "text": text,
            "audio_waveform": audio_waveform,
            "end_of_turn": end_of_turn,
            "current_time": current_time,
            "cost_llm": llm_end_time - llm_start_time,
            "cost_tts_prep": tts_prep_end_time - tts_prep_start_time,
            "cost_tts": tts_end_time - tts_start_time,
            "cost_token2wav": token2wav_end_time - token2wav_start_time,
            "cost_all": end_time - start_time,
            "n_tokens": len(total_ids_in_unit),
            "n_tts_tokens": new_tokens.numel(),
        }
+
+ def get_session_schema(self, include_embeddings: bool = True) -> str:
+ """get complete schema for current session (includes prefill and generate stages)
+
+ Args:
+ include_embeddings: whether to include embedding placeholders (e.g. [img_embed_64], [audio_embed_50])
+
+ Returns:
+ complete schema string, each unit format:
+ [img_embed_64][audio_embed_50]<|listen|or|speak|>generated_content
+ """
+ if not hasattr(self, "prefill_schema_tokens") or not hasattr(self, "total_ids"):
+ return ""
+
+ # get token id for splitting generate tokens
+ unit_end_token_id = self.tokenizer.convert_tokens_to_ids("")
+
+ # split generate tokens into each unit
+ generate_units = []
+ current_unit = []
+ for tid in self.total_ids:
+ current_unit.append(tid)
+ if tid == unit_end_token_id:
+ generate_units.append(current_unit)
+ current_unit = []
+
+ # build complete schema
+ full_schema_parts = []
+ num_units = max(len(self.prefill_schema_tokens), len(generate_units))
+
+ for unit_idx in range(num_units):
+ unit_schema = ""
+
+ # prefill part
+ if unit_idx < len(self.prefill_schema_tokens):
+ prefill_tokens = self.prefill_schema_tokens[unit_idx]
+ for item in prefill_tokens:
+ if isinstance(item, tuple):
+ # tuple represents embedding: ("img", dim) or ("audio", dim)
+ embed_type, embed_dim = item
+ if include_embeddings:
+ unit_schema += f"[{embed_type}_embed_{embed_dim}]"
+ else:
+ # normal token ID
+ unit_schema += self.tokenizer.decode([item], skip_special_tokens=False)
+
+ # generate part
+ if unit_idx < len(generate_units):
+ unit_schema += self.tokenizer.decode(generate_units[unit_idx], skip_special_tokens=False)
+
+ full_schema_parts.append(unit_schema)
+
+ return "".join(full_schema_parts)
+
+ def get_unit_schemas(self, include_embeddings: bool = True) -> list:
+ """get list of schema for each unit
+
+ Returns:
+ list of schema strings for each unit
+ """
+ if not hasattr(self, "prefill_schema_tokens") or not hasattr(self, "total_ids"):
+ return []
+
+ unit_end_token_id = self.tokenizer.convert_tokens_to_ids("")
+
+ # split generate tokens into each unit
+ generate_units = []
+ current_unit = []
+ for tid in self.total_ids:
+ current_unit.append(tid)
+ if tid == unit_end_token_id:
+ generate_units.append(current_unit)
+ current_unit = []
+
+ # build schema for each unit
+ unit_schemas = []
+ num_units = max(len(self.prefill_schema_tokens), len(generate_units))
+
+ for unit_idx in range(num_units):
+ unit_schema = ""
+
+ # prefill part
+ if unit_idx < len(self.prefill_schema_tokens):
+ prefill_tokens = self.prefill_schema_tokens[unit_idx]
+ for item in prefill_tokens:
+ if isinstance(item, tuple):
+ # tuple represents embedding: ("img", dim) or ("audio", dim)
+ embed_type, embed_dim = item
+ if include_embeddings:
+ unit_schema += f"[{embed_type}_embed_{embed_dim}]"
+ else:
+ # normal token ID
+ unit_schema += self.tokenizer.decode([item], skip_special_tokens=False)
+
+ # generate part
+ if unit_idx < len(generate_units):
+ unit_schema += self.tokenizer.decode(generate_units[unit_idx], skip_special_tokens=False)
+
+ unit_schemas.append(unit_schema)
+
+ return unit_schemas
+
    def _convert_results_to_tts_input(self, results):
        """Convert LLM hidden states to TTS input embeddings.

        Args:
            results: list of ``[token_id, hidden, end_of_turn]`` entries as
                collected by streaming_generate, where ``hidden`` is a
                [1, 1, D] tensor for that token.

        Returns:
            [1, L+1, D_tts] tensor: per-token (text embedding + normalized
            projected hidden state), followed by the audio-BOS embedding.
            With empty ``results`` only the audio-BOS embedding is returned.
        """
        if len(results) == 0:
            # no speak tokens this unit: condition TTS on audio-BOS alone
            audio_bos = self.model.tts.emb_text(
                torch.tensor(
                    [self.model.tts.audio_bos_token_id],
                    device=self.model.tts.emb_text.weight.device,
                    dtype=torch.long,
                )
            )
            return audio_bos.unsqueeze(0)

        llm_tokens = []
        llm_hidden = []
        for hidden in results:
            llm_tokens.append(hidden[0])
            llm_hidden.append(hidden[1].squeeze(0))

        # NOTE(review): torch.Tensor(...) builds a float tensor that is then
        # cast to long by .to(); torch.tensor(..., dtype=torch.long) would be
        # the direct spelling — behavior is the same for integer token ids.
        llm_tokens_tensor = torch.Tensor(llm_tokens).to(self.device, dtype=torch.long)
        llm_embeds = self.model.tts.emb_text(llm_tokens_tensor)

        # project LLM hidden states into the TTS space and L2-normalize
        llm_hidden_tensor = torch.cat(llm_hidden, dim=0)
        llm_hidden_tensor = self.model.tts.projector_semantic(llm_hidden_tensor)
        llm_hidden_tensor = torch.nn.functional.normalize(llm_hidden_tensor, p=2, dim=-1)

        # fuse token identity and semantic hidden state additively
        tts_embeds = llm_embeds + llm_hidden_tensor

        audio_bos = self.model.tts.emb_text(
            torch.tensor(
                [self.model.tts.audio_bos_token_id],
                device=self.model.tts.emb_text.weight.device,
                dtype=torch.long,
            )
        )

        tts_embeds = torch.cat([tts_embeds, audio_bos], dim=0)
        return tts_embeds.unsqueeze(0)
+
+ def _generate_waveform_from_tokens(
+ self,
+ new_tokens: torch.Tensor,
+ prompt_wav_path: Optional[str],
+ is_last_chunk: bool = False,
+ force_flush: bool = False,
+ ) -> Optional[np.ndarray]:
+ if not self.token2wav_initialized:
+ logger.warning("token2wav_initialized is uninitialized")
+ return None
+
+ CHUNK_SIZE = 25
+
+ token_ids = torch.reshape(new_tokens, (-1,)).tolist()
+ self.token2wav_buffer += token_ids
+
+ has_chunk_eos = any(tid in self.chunk_terminator_token_ids for tid in token_ids)
+
+ pcm_bytes_list = []
+
+ # process enough tokens
+ # if there is chunk_eos, try to flush more content
+ if has_chunk_eos or force_flush:
+ # when there is chunk_eos, try to flush more content
+ while len(self.token2wav_buffer) >= self.pre_lookahead + 5: # at least keep some lookahead
+ chunk_to_process = min(CHUNK_SIZE + self.pre_lookahead, len(self.token2wav_buffer))
+ pcm_bytes = self.model.tts.audio_tokenizer.stream(
+ self.token2wav_buffer[:chunk_to_process],
+ prompt_wav=prompt_wav_path,
+ )
+ pcm_bytes_list.append(pcm_bytes)
+ self.token2wav_buffer = self.token2wav_buffer[min(CHUNK_SIZE, chunk_to_process - self.pre_lookahead) :]
+ else:
+ while len(self.token2wav_buffer) >= CHUNK_SIZE + self.pre_lookahead:
+ pcm_bytes = self.model.tts.audio_tokenizer.stream(
+ self.token2wav_buffer[: CHUNK_SIZE + self.pre_lookahead],
+ prompt_wav=prompt_wav_path,
+ )
+ pcm_bytes_list.append(pcm_bytes)
+ self.token2wav_buffer = self.token2wav_buffer[CHUNK_SIZE:]
+
+ # if is the last chunk, flush remaining tokens
+ if is_last_chunk and len(self.token2wav_buffer) > 0:
+ pcm_bytes = self.model.tts.audio_tokenizer.stream(
+ self.token2wav_buffer,
+ prompt_wav=prompt_wav_path,
+ last_chunk=True,
+ )
+ pcm_bytes_list.append(pcm_bytes)
+ self.token2wav_buffer = []
+
+ if not pcm_bytes_list:
+ return None
+
+ # merge PCM and convert to numpy array (24kHz, int16 -> float32)
+ all_pcm = b"".join(pcm_bytes_list)
+ if len(all_pcm) == 0:
+ return None
+
+ pcm_np = np.frombuffer(all_pcm, dtype=" np.ndarray:
+ """generate silence waveform (24kHz)"""
+ sample_rate = 24000
+ num_samples = int(duration_sec * sample_rate)
+ return np.zeros(num_samples, dtype=np.float32)
+
+ def get_generated_text(self) -> str:
+ return self.tokenizer.decode(self.res_ids)
+
+ def get_current_time(self) -> int:
+ return self.audio_chunk_idx
+
+ def as_simplex(self, reset_session: bool = True, reset_token2wav_cache: bool = False) -> "MiniCPMO":
+ """Convert this MiniCPMODuplex instance back to MiniCPMO for simplex mode.
+
+ Args:
+ reset_session: If True, reset streaming session state (KV cache, etc.).
+ Recommended when switching from duplex to simplex mode.
+
+ Returns the underlying MiniCPMO model instance without reloading.
+ """
+ if reset_session:
+ self.model.reset_session(reset_token2wav_cache=reset_token2wav_cache)
+ return self.model
+
+
def get_2d_sincos_pos_embed(embed_dim, image_size):
    """Build a 2D sine-cosine positional embedding table.

    Args:
        embed_dim: embedding dimension (must be even).
        image_size: int (square grid) or (image_height, image_width).

    Returns:
        pos_embed: ndarray of shape [image_height, image_width, embed_dim].
    """
    if isinstance(image_size, int):
        height = width = image_size
    else:
        height, width = image_size[0], image_size[1]

    rows = np.arange(height, dtype=np.float32)
    cols = np.arange(width, dtype=np.float32)
    # meshgrid with the width axis first, matching the downstream indexing
    grid = np.stack(np.meshgrid(cols, rows), axis=0)

    return get_2d_sincos_pos_embed_from_grid(embed_dim, grid)


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Encode a 2-channel coordinate grid, half the channels per axis.

    Args:
        embed_dim: total embedding dimension (must be even).
        grid: ndarray [2, H, W]; grid[0]/grid[1] hold per-axis coordinates.

    Returns:
        ndarray of shape (H, W, embed_dim).
    """
    assert embed_dim % 2 == 0

    half = embed_dim // 2
    first_axis = get_1d_sincos_pos_embed_from_grid_new(half, grid[0])  # (H, W, D/2)
    second_axis = get_1d_sincos_pos_embed_from_grid_new(half, grid[1])  # (H, W, D/2)

    return np.concatenate([first_axis, second_axis], axis=-1)  # (H, W, D)


def get_1d_sincos_pos_embed_from_grid_new(embed_dim, pos):
    """Sine-cosine encode a 2D array of scalar positions.

    Args:
        embed_dim: output dimension for each position (must be even).
        pos: positions to encode, shape (H, W).

    Returns:
        ndarray of shape (H, W, embed_dim): sines then cosines.
    """
    assert embed_dim % 2 == 0

    # inverse frequencies: 1 / 10000^(2k / embed_dim)
    exponents = np.arange(embed_dim // 2, dtype=np.float32) / (embed_dim / 2.0)
    inv_freq = 1.0 / 10000**exponents  # (D/2,)

    angles = np.einsum("hw,d->hwd", pos, inv_freq)  # (H, W, D/2), outer product

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)  # (H, W, D)
+
+
class Resampler(nn.Module):
    """
    A 2D perceiver-resampler network with one cross attention layers by
    given learnable queries and 2d sincos pos_emb

    Compresses a variable number of vision patches into a fixed number of
    query tokens via a single cross-attention layer; the 2D sin-cos position
    embedding is cached and grown lazily as larger inputs arrive.

    Outputs:
        A tensor with the shape of (batch_size, num_queries, embed_dim)
    """

    def __init__(
        self,
        num_queries,
        embed_dim,
        num_heads,
        kv_dim=None,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        adaptive=False,
        max_size=(70, 70),
    ):
        # num_queries: fixed number of learnable query tokens (output length)
        # kv_dim: input patch dim; projected to embed_dim when different
        # max_size: initial (H, W) capacity of the position-embedding cache
        super().__init__()
        self.num_queries = num_queries
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.adaptive = adaptive
        self.max_size = max_size

        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))

        # project keys/values only when input dim differs from embed_dim
        if kv_dim is not None and kv_dim != embed_dim:
            self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
        else:
            self.kv_proj = nn.Identity()

        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.ln_q = norm_layer(embed_dim)
        self.ln_kv = norm_layer(embed_dim)

        self.ln_post = norm_layer(embed_dim)
        # output projection, scaled init (1/sqrt(d))
        self.proj = nn.Parameter((embed_dim**-0.5) * torch.randn(embed_dim, embed_dim))

        self._set_2d_pos_cache(self.max_size)

    def _set_2d_pos_cache(self, max_size, device="cpu"):
        """(Re)build the cached 2D sin-cos position embedding of shape (H, W, D)."""
        # NOTE(review): is_deepspeed_zero3_enabled comes from outside this
        # block; under ZeRO-3 the cache is created directly on CUDA.
        if is_deepspeed_zero3_enabled():
            device = "cuda"
        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float().to(device)
        # non-persistent: recomputed on load rather than stored in checkpoints
        self.register_buffer("pos_embed", pos_embed, persistent=False)

    def _adjust_pos_cache(self, tgt_sizes, device):
        """Grow the position cache when an input exceeds the current (H, W) capacity."""
        max_h = torch.max(tgt_sizes[:, 0])
        max_w = torch.max(tgt_sizes[:, 1])
        if max_h > self.max_size[0] or max_w > self.max_size[1]:
            self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])]
            self._set_2d_pos_cache(self.max_size, device)

    def _init_weights(self, m):
        """Weight init hook: trunc-normal linears, unit LayerNorm."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x, tgt_sizes=None):
        """Cross-attend the learnable queries over padded patch sequences.

        Args:
            x: patch features, one row per image (padded along the patch axis).
            tgt_sizes: per-image (h, w) patch-grid sizes; row i must describe x[i].

        Returns:
            (batch_size, num_queries, embed_dim) tensor.
        """
        assert x.shape[0] == tgt_sizes.shape[0]
        bs = x.shape[0]

        device = x.device
        dtype = x.dtype

        # true (unpadded) number of patches per image
        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]

        self._adjust_pos_cache(tgt_sizes, device=device)

        max_patch_len = torch.max(patch_len)
        key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool, device=device)

        pos_embed = []
        for i in range(bs):
            tgt_h, tgt_w = tgt_sizes[i]
            # slice the cached table to this image's grid and flatten to (patches, D)
            pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype))  # patches * D
            # mask out the padded tail so attention ignores it
            key_padding_mask[i, patch_len[i] :] = True

        pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
            1, 0, 2
        )  # BLD => L * B * D

        x = self.kv_proj(x)  # B * L * D
        # nn.MultiheadAttention default layout is (L, B, D)
        x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D

        q = self.ln_q(self.query)  # Q * D

        # position information is added to keys only; values stay positionless
        out = self.attn(
            self._repeat(q, bs),  # Q * B * D
            x + pos_embed,  # L * B * D + L * B * D
            x,
            key_padding_mask=key_padding_mask,
        )[0]
        # out: Q * B * D
        x = out.permute(1, 0, 2)  # B * Q * D

        x = self.ln_post(x)
        x = x @ self.proj
        return x

    def _repeat(self, query, N: int):
        # broadcast the shared queries across the batch: (Q, D) -> (Q, N, D)
        return query.unsqueeze(1).repeat(1, N, 1)
+
+
class MiniCPMWhisperEncoderLayer(nn.Module):
    """One Whisper encoder layer (pre-LayerNorm self-attention + FFN),
    extended to accept an optional KV cache (``past_key_values``/``use_cache``)
    for streaming inference."""

    def __init__(self, config: WhisperConfig, layer_idx: int = None):
        super().__init__()
        self.embed_dim = config.d_model
        try:
            # compatible old transformers: newer releases expose a registry
            # keyed by the configured attention implementation.
            from transformers.models.whisper.modeling_whisper import WHISPER_ATTENTION_CLASSES

            self.self_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
                embed_dim=self.embed_dim,
                num_heads=config.encoder_attention_heads,
                dropout=config.attention_dropout,
                config=config,
                layer_idx=layer_idx,
            )
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit. `Exception` still covers the intended fallback cases
        # (ImportError for missing registry, KeyError for unknown impl,
        # TypeError for signature differences).
        except Exception:
            from transformers.models.whisper.modeling_whisper import WhisperAttention

            self.self_attn = WhisperAttention(
                embed_dim=self.embed_dim,
                num_heads=config.encoder_attention_heads,
                dropout=config.attention_dropout,
                config=config,
                layer_idx=layer_idx,
            )

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_head_mask: torch.Tensor,
        output_attentions: bool = False,
        past_key_values: Optional[EncoderDecoderCache] = None,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, ...]:
        """Run one encoder layer.

        Returns:
            Tuple ``(hidden_states[, attn_weights][, past_key_values])`` —
            the optional entries are present when ``output_attentions`` /
            ``use_cache`` are set.  (Fixed annotation: the original claimed
            a bare ``torch.Tensor`` but always returned a tuple.)
        """
        # Pre-norm self-attention with residual connection.
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights, past_key_values = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            past_key_value=past_key_values,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Pre-norm feed-forward with residual connection.
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # fp16 overflow guard, as in upstream Whisper.
        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        if use_cache:
            outputs += (past_key_values,)

        return outputs
+
+
# Copied from from transformers.models.whisper.modeling_whisper.WhisperEncoder and add use_cache for streaming inference
class MiniCPMWhisperEncoder(WhisperEncoder):
    """Whisper encoder extended with KV-cache support for streaming inference,
    optional CNN minimum-length padding, and optional trimming of extra
    context frames around each chunk."""

    def __init__(self, config: WhisperConfig):
        super().__init__(config)
        # Replace the stock layers with cache-aware MiniCPM layers.
        self.layers = nn.ModuleList(
            [MiniCPMWhisperEncoderLayer(config, layer_idx=i) for i in range(config.encoder_layers)]
        )

    def forward(
        self,
        input_features,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        use_cache: Optional[bool] = None,
        use_extra_context: Optional[bool] = False,
        prefix_extra_frames: Optional[int] = 1,
        suffix_extra_frames: Optional[int] = 1,
        cnn_min_length: Optional[int] = None,
    ):
        """Encode log-mel ``input_features``.

        Streaming extras over upstream WhisperEncoder:
            past_key_values / use_cache: self-attention KV cache so successive
                chunks continue positional embeddings where the last chunk ended.
            use_extra_context: the input carries ``prefix_extra_frames`` /
                ``suffix_extra_frames`` context frames that are trimmed again
                after the convolutions (conv2 stride 2 halves them, rounding up).
            cnn_min_length: zero-pad short inputs to this length before the
                convolutions, then crop the conv output back to the true length.

        Returns:
            ``BaseModelOutputWithPast`` (or a tuple when ``return_dict`` is
            falsy) with last_hidden_state, optional hidden_states/attentions,
            and the updated cache.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Ignore copy
        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)

        # Optional: pad short input to minimum length for CNN computation consistency
        original_length = input_features.shape[2]
        padded_for_cnn = False
        if cnn_min_length is not None and original_length < cnn_min_length:
            padded_features = torch.zeros(
                input_features.shape[0],
                input_features.shape[1],
                cnn_min_length,
                dtype=input_features.dtype,
                device=input_features.device,
            )
            padded_features[:, :, :original_length] = input_features
            input_features = padded_features
            padded_for_cnn = True

        conv1_output = self.conv1(input_features)
        inputs_embeds = nn.functional.gelu(conv1_output)
        conv2_output = self.conv2(inputs_embeds)
        inputs_embeds = nn.functional.gelu(conv2_output)
        # If padding was done before, now need to remove the effect of padding
        if padded_for_cnn:
            # Conv1: stride=1, output length=input length
            # Conv2: stride=2, output length=(input length+1)//2
            actual_cnn_output_length = (original_length + 1) // 2
            inputs_embeds = inputs_embeds[:, :, :actual_cnn_output_length]

        # If extra context is used, CNN operations need to remove redundant frames
        # conv2 stride=2, so the redundant frames in the input will be halved (upward rounding)
        if use_extra_context:
            # Input has prefix_extra_frames prefix frames and suffix_extra_frames suffix frames
            # conv2 stride=2, output length = ceil(input length / 2)
            prefix_to_remove = (prefix_extra_frames + 1) // 2 if prefix_extra_frames > 0 else 0
            suffix_to_remove = (suffix_extra_frames + 1) // 2 if suffix_extra_frames > 0 else 0

            # Remove redundant frames before and after (batch, channels, time)
            if prefix_to_remove > 0:
                inputs_embeds = inputs_embeds[:, :, prefix_to_remove:]
            if 0 < suffix_to_remove < inputs_embeds.shape[2]:
                inputs_embeds = inputs_embeds[:, :, :-suffix_to_remove]

        inputs_embeds = inputs_embeds.permute(0, 2, 1)

        embed_pos = self.embed_positions.weight
        past_key_values_length = 0
        if use_cache:
            # Normalize whatever cache form the caller passed into an
            # EncoderDecoderCache, then offset positional embeddings by the
            # number of frames already encoded.
            if past_key_values is None:
                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
            elif isinstance(past_key_values, list):
                past_key_values = EncoderDecoderCache(DynamicCache.from_legacy_cache(past_key_values), DynamicCache())
            elif isinstance(past_key_values, DynamicCache):
                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
            else:
                pass
            past_key_values_length = past_key_values.self_attention_cache.get_usable_length(inputs_embeds.shape[1])
            if inputs_embeds.shape[1] + past_key_values_length > embed_pos.shape[0]:
                logger.warning("seems the audio is longer than 30s. repeating the last part of the audio")
                embed_pos_front = embed_pos[past_key_values_length:, :]
                embed_pos = torch.cat(
                    (
                        embed_pos_front,
                        torch.repeat_interleave(
                            embed_pos[-1, :].unsqueeze(0),
                            inputs_embeds.shape[1] - embed_pos.shape[0] + past_key_values_length,
                            dim=0,
                        ),
                    )
                )
            else:
                embed_pos = embed_pos[past_key_values_length : inputs_embeds.shape[1] + past_key_values_length, :]
        else:
            embed_pos = embed_pos[: inputs_embeds.shape[1], :]

        hidden_states = inputs_embeds + embed_pos
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            assert head_mask.size()[0] == (
                len(self.layers)
            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."

        # Fix: initialize outside the loop so it is defined even when
        # `self.layers` is empty or the last layer is dropped.
        next_encoder_cache = None

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop: # skip the layer
                    to_drop = True

            # Ignore copy
            if to_drop:
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        attention_mask,
                        (head_mask[idx] if head_mask is not None else None),
                        output_attentions,
                        past_key_values,
                        use_cache,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask,
                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                        output_attentions=output_attentions,
                        past_key_values=past_key_values,
                        use_cache=use_cache,
                    )

            # Fix vs. original: only consume the layer's outputs when the layer
            # actually ran. The original unconditionally did
            # `hidden_states = layer_outputs[0]`, which set hidden_states to
            # None whenever LayerDrop skipped a layer during training.
            if not to_drop:
                hidden_states = layer_outputs[0]
                if use_cache:
                    next_encoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            result = tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
            return result
        result = BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
            past_key_values=next_encoder_cache,
        )

        return result
+
+
class MultiModalProjector(nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) mapping audio features from
    ``in_dim`` to ``out_dim``."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        # Submodule names (linear1/relu/linear2) and creation order are kept
        # identical to the original so checkpoints and RNG state still match.
        self.linear1 = nn.Linear(in_features=in_dim, out_features=out_dim, bias=True)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=out_dim, out_features=out_dim, bias=True)

    def forward(self, audio_features):
        """Project ``audio_features`` (..., in_dim) -> (..., out_dim)."""
        projected = self.linear1(audio_features)
        return self.linear2(self.relu(projected))
+
+
class MiniCPMMLP(nn.Module):
    """Gated (SwiGLU-style) MLP projecting LLM hidden states
    (``config.llm_hidden_size``) to the TTS hidden size (``config.hidden_size``)
    through ``config.llm_intermediate_size``, with the activation chosen by
    ``config.hidden_act``."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.in_dim = config.llm_hidden_size
        self.out_dim = config.hidden_size
        self.intermediate_size = config.llm_intermediate_size
        # Attribute names and creation order match the original exactly so
        # state_dict keys and parameter initialization stay identical.
        self.gate_proj = nn.Linear(self.in_dim, self.intermediate_size, bias=True)
        self.up_proj = nn.Linear(self.in_dim, self.intermediate_size, bias=True)
        self.down_proj = nn.Linear(self.intermediate_size, self.out_dim, bias=True)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """down_proj(act(gate_proj(x)) * up_proj(x))."""
        gated = self.act_fn(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
+
+
@dataclass
class MiniCPMTTSGenerationOutput(ModelOutput):
    """
    Output class for MiniCPMTTS generation.

    Args:
        new_ids (torch.LongTensor): Newly generated audio code sequence, shape (batch_size, sequence_length, num_vq).
        audio_input_ids (torch.LongTensor): Updated input IDs including condition and generated audio codes, shape (batch_size, full_sequence_length, num_vq).
        past_key_values (Tuple[Tuple[torch.FloatTensor]]): Tuple containing pre-computed keys and values used for attention mechanism. Each element has shape (batch_size, num_heads, sequence_length, embed_size_per_head).
        past_input_ids (torch.LongTensor): Input IDs consumed by previous generation steps; left as None by producers that do not track them.
        finished (bool): Boolean indicating whether generation is complete.
    """

    # All fields default to None; producers fill only the ones they track
    # (e.g. MiniCPMTTS.generate returns new_ids/finished and leaves the rest None).
    new_ids: Optional[torch.LongTensor] = None
    audio_input_ids: Optional[torch.LongTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_input_ids: Optional[torch.LongTensor] = None
    finished: Optional[bool] = None
+
+
def make_streaming_chunk_mask_inference(
    tts_text_scope: List[int],
    tts_text_mask: torch.Tensor,
    streaming_audio_chunk_size: int = 50,
    dtype: torch.dtype = torch.bfloat16,
    device: torch.device = torch.device("cuda"),
    max_sequence_length: int = 4096,
    num_text_tokens_per_audio_chunk: int = 10,
):
    """Build the 4D additive attention mask for chunked streaming TTS inference.

    Audio positions may only attend to text tokens that have been "revealed":
    each audio chunk of ``streaming_audio_chunk_size`` positions unlocks
    ``num_text_tokens_per_audio_chunk`` additional text tokens (capped at the
    number of valid text tokens, excluding the trailing [Ptts] token, which is
    always visible). Text padding positions are masked for every query row.

    Args:
        tts_text_scope: ``[start, end)`` span of the text region in the sequence.
        tts_text_mask: int8 mask over that span; 1 = valid token, 0 = padding.
            Must be int8 (asserted).
        streaming_audio_chunk_size: audio tokens per streaming chunk.
        dtype: mask dtype; masked entries hold ``torch.finfo(dtype).min``.
        device: device on which the mask is allocated.
        max_sequence_length: side length of the (square) mask; must be > 1.
        num_text_tokens_per_audio_chunk: text tokens revealed per audio chunk
            (generalized from the previously hard-coded value of 10; the
            default preserves the old behavior).

    Returns:
        Tensor of shape ``(1, 1, max_sequence_length, max_sequence_length)``
        with 0 at visible positions and ``finfo(dtype).min`` at masked ones.

    Example:
        Input sequence:
        [t1, t2, t3, t4, t5, [Ptts], a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, ...]
        Output 4D causal mask:
        ------- text positions -------
        [0] <- here is [Stts]
        [0, 0] <- here is [spk_emb] * N
        [0, 0, 0]
        [0, 0, 0, 0]
        [0, 0, 0, 0, 0]
        ------- audio positions --------
        [0, 0, -inf, -inf, -inf, 0] <- here is [Ptts], [Ptts]'s last hidden state should predict the first audio token
        v- here is [Ptts]
        [0, 0, -inf, -inf, -inf, 0, 0]
        [0, 0, -inf, -inf, -inf, 0, 0, 0]
        [0, 0, -inf, -inf, -inf, 0, 0, 0, 0]
        [0, 0, -inf, -inf, -inf, 0, 0, 0, 0, 0]
        [0, 0, -inf, -inf, -inf, 0, 0, 0, 0, 0, 0] # end of first 1s audio chunk
        [0, 0, 0 , -inf, -inf, 0, 0, 0, 0, 0, 0, 0]
        [0, 0, 0 , -inf, -inf, 0, 0, 0, 0, 0, 0, 0, 0]
        [0, 0, 0 , -inf, -inf, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        [0, 0, 0 , -inf, -inf, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        [0, 0, 0 , -inf, -inf, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    """

    # Create a complete attention mask for input embeds [batch_size, seq_len], without considering audio mask as audio is always at the end

    assert tts_text_mask.dtype == torch.int8

    padding_mask = torch.ones(max_sequence_length, dtype=torch.int8, device=device)
    padding_mask[tts_text_scope[0] : tts_text_scope[1]] = tts_text_mask

    # Initialize a standard upper triangular causal mask
    min_dtype = torch.finfo(dtype).min

    causal_mask = torch.full(
        (max_sequence_length, max_sequence_length),
        fill_value=min_dtype,
        dtype=dtype,
        device=device,
    )
    if max_sequence_length != 1:
        causal_mask = torch.triu(causal_mask, diagonal=1)
    else:
        raise ValueError("max_sequence_length of tts could not be 1.")

    audio_token_start = tts_text_scope[1]
    audio_duration = max_sequence_length - tts_text_scope[1]

    # Record which text chunk the current audio chunk can see up to
    text_pivot = 0
    num_valid_text_tokens = torch.sum(tts_text_mask).item() - 1 # [Ptts] excluded

    # For each chunk of audio
    for chunk_idx in range(math.ceil(audio_duration / streaming_audio_chunk_size)):
        audio_chunk_start = audio_token_start + chunk_idx * streaming_audio_chunk_size
        audio_chunk_end = audio_token_start + (chunk_idx + 1) * streaming_audio_chunk_size
        # New text seen by this new audio chunk
        new_text_this_chunk = num_text_tokens_per_audio_chunk
        # The right bound of visible text tokens
        text_pivot = min(new_text_this_chunk + text_pivot, num_valid_text_tokens)
        # Mask all text chunks after the visible ones
        # -> [text_pivot, len(tts_text_scope)-1] excluding [Ptts]
        causal_mask[
            audio_chunk_start - 1 : audio_chunk_end - 1,
            tts_text_scope[0] + text_pivot : tts_text_scope[1] - 1,
        ] = min_dtype

    # Mask the padding parts in tts_text_masks (no position will attend to it)
    causal_mask[:, padding_mask == 0] = min_dtype

    # Add extra dimensions, [seq_len, seq_len] -> [1, 1, seq_len, seq_len]
    causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)

    return causal_mask
+
+
+class MiniCPMTTS(PreTrainedModel):
+ config_class = MiniCPMTTSConfig
+
    def __init__(self, config: MiniCPMTTSConfig, audio_tokenizer: Optional[object]):
        """Build the TTS head: a Llama backbone over text embeddings plus
        per-codebook audio-code embeddings and weight-normed output heads.

        Args:
            config: MiniCPMTTSConfig with backbone, streaming and VQ settings.
            audio_tokenizer: external audio codec/tokenizer; stored as-is and
                not used inside this constructor.
        """
        super().__init__(config)

        self.use_llm_hidden_state = config.use_llm_hidden_state

        self.use_text = config.use_text
        self.streaming = config.streaming
        self.streaming_text_chunk_min = config.streaming_text_chunk_min
        self.streaming_text_chunk_max = config.streaming_text_chunk_max
        self.streaming_audio_chunk_size = config.streaming_audio_chunk_size
        self.streaming_text_reserved_len = config.streaming_text_reserved_len
        # streaming tts
        self.streaming_text_chunk_size = config.streaming_text_chunk_max
        self.audio_bos_token_id = config.audio_bos_token_id
        self.num_mel_bins = config.num_mel_bins
        self.num_vq = config.num_vq
        self.num_audio_tokens = config.num_audio_tokens

        # Default sampling parameters (callers may override per-call).
        self.top_p = config.top_p
        self.top_k = config.top_k
        self.repetition_penalty = config.repetition_penalty

        self.interleaved = config.interleaved
        self.attention_type = config.attention_type
        self.recomputed_chunks = config.recomputed_chunks

        # Two different window size concepts:
        # 1. chunk_window_size: number of chunks for sliding_recompute mode (default 2)
        # 2. token_window_size: number of tokens for sliding_window mode (default 300)
        self.chunk_window_size = config.window_size # chunk-level window for sliding_recompute
        self.token_window_size = (
            config.streaming_sliding_window_audio_window_size
        ) # token-level window for sliding_window

        # Legacy aliases (for backward compatibility with existing code)
        self.window_size = self.chunk_window_size # used in generate_streaming for sliding_recompute
        self.sliding_window_size = self.token_window_size # used in TTSStreamingGenerator for sliding_window

        if self.attention_type == "sliding_recompute" and self.chunk_window_size <= self.recomputed_chunks:
            raise ValueError(
                f"sliding_recompute requires chunk_window_size > recomputed_chunks, "
                f"but got chunk_window_size={self.chunk_window_size} and recomputed_chunks={self.recomputed_chunks}"
            )

        if config.backbone_model == "llama":
            # Backbone transformer that consumes text/audio-code embeddings.
            model_config = LlamaConfig(
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                num_attention_heads=config.num_attention_heads,
                num_hidden_layers=config.num_hidden_layers,
                num_key_value_heads=config.num_key_value_heads,
                max_position_embeddings=config.max_position_embeddings,
                attn_implementation=config.attn_implementation,
            )

            self.emb_text = nn.Embedding(config.num_text_tokens, config.hidden_size)

            model = LlamaModel(model_config)
            self.model = model
        else:
            raise ValueError(f"Unsupported backbone model: {config.backbone_model}")

        # Projectors mapping LLM-side features (speaker / semantic) into the
        # TTS hidden space; the flavor is chosen by config.projector_type.
        self.projector_spk = self.create_projector(config)
        self.projector_semantic = self.create_projector(config)

        self.audio_tokenizer = audio_tokenizer

        # One embedding table and one weight-normed output head per VQ codebook.
        self.emb_code = nn.ModuleList(
            [nn.Embedding(config.num_audio_tokens, config.hidden_size) for _ in range(config.num_vq)]
        )

        self.head_code = nn.ModuleList(
            [
                weight_norm(
                    nn.Linear(config.hidden_size, config.num_audio_tokens, bias=False),
                    name="weight",
                )
                for _ in range(config.num_vq)
            ]
        )

        self.condition_type = config.condition_type

        return
+
+ @staticmethod
+ def create_projector(config):
+ if config.projector_type == "mlp":
+ return MultiModalProjector(config.llm_dim, config.hidden_size)
+ elif config.projector_type == "minicpm":
+ return MiniCPMMLP(config)
+ elif config.projector_type == "default":
+ return nn.Linear(config.llm_dim, config.hidden_size, bias=False)
+ else:
+ raise ValueError(f"Unsupported projector type: {config.projector_type}")
+
    # non-streaming
    @torch.inference_mode()
    def generate(
        self,
        inputs_embeds: torch.Tensor,
        eos_token: Union[int, torch.Tensor],
        force_no_stop=False,
        min_new_token=50,
        max_new_token=2048,
        show_tqdm=True,
        streaming=False,
        text_lengths=None,
        sampling_params: TTSSamplingParams = TTSSamplingParams(),
    ):
        """Autoregressively sample audio codes from a conditioning prefix.

        The condition embeddings (``inputs_embeds``, batch size must be 1) are
        prefilled on the first step; each following step feeds back the sum of
        the per-codebook embeddings of the previously sampled token. Sampling
        uses temperature scaling plus the processors/warpers from gen_logits,
        and stops when any codebook emits ``eos_token`` (EOS is suppressed for
        the first ``min_new_token`` steps, or always when ``force_no_stop``).

        Note: ``streaming=True`` raises NotImplementedError; ``text_lengths``
        is accepted but unused here (kept for signature compatibility with the
        streaming variants).

        Returns:
            MiniCPMTTSGenerationOutput with ``new_ids`` of shape
            (1, num_generated, num_vq) — the final EOS-bearing step is
            excluded — and ``finished`` indicating whether EOS was reached.
        """
        # One temperature per codebook, flattened to align with the reshaped logits.
        temperature = torch.tensor(
            [sampling_params.temperature] * self.config.num_vq,
            dtype=torch.float,
            device=self.device,
        )
        temperature = (temperature.unsqueeze(0).expand(inputs_embeds.shape[0], -1).contiguous().view(-1, 1)).to(
            inputs_embeds.device
        )

        logits_warpers, logits_processors = gen_logits(
            num_code=self.config.num_audio_tokens,
            repetition_penalty=sampling_params.repetition_penalty,
            top_p=sampling_params.top_p,
            top_k=sampling_params.top_k,
        )

        # We only support batch size `1` for now
        assert inputs_embeds.shape[0] == 1
        eos_token = eos_token.to(inputs_embeds.device)
        finish = torch.zeros(inputs_embeds.shape[0], device=inputs_embeds.device).bool()

        condition_length = inputs_embeds.shape[1]
        pbar: Optional[tqdm] = None
        if show_tqdm:
            pbar = tqdm(
                total=max_new_token,
                desc="code",
                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}(max) [{elapsed}, {rate_fmt}{postfix}]",
            )

        if streaming:
            raise NotImplementedError("this kind of streaming is not supported yet")

        new_tokens = torch.zeros(
            inputs_embeds.shape[0],
            max_new_token,
            self.num_vq,
            device=inputs_embeds.device,
            dtype=torch.long,
        )

        past_key_values = None

        for t in range(max_new_token):
            audio_bos = False
            # If this is the first audio token, the case is special
            if t == 0:
                audio_bos = True
                inputs_embeds = inputs_embeds
                position_ids = torch.tensor(
                    list(range(0, condition_length)),
                    dtype=torch.long,
                    device=self.device,
                ).unsqueeze(0)

                if streaming:
                    raise NotImplementedError("this kind of streaming is not supported yet")
                else:
                    causal_mask_4d = None

            else:
                # Embed the previously sampled token: sum the per-codebook embeddings.
                code_emb = []
                for q in range(self.num_vq):
                    x = self.emb_code[q](new_tokens[:, t - 1 : t, q])
                    code_emb.append(x)

                inputs_embeds = torch.stack(code_emb, 3).sum(3)

                position_ids = torch.tensor([condition_length + t - 1], dtype=torch.long, device=self.device).unsqueeze(
                    0
                )

                if streaming:
                    raise NotImplementedError("this kind of streaming is not supported yet")
                else:
                    causal_mask_4d = None

            if self.config.backbone_model == "llama":
                outputs: BaseModelOutputWithPast = self.model(
                    position_ids=position_ids,
                    cache_position=position_ids,
                    past_key_values=past_key_values,
                    inputs_embeds=inputs_embeds,
                    attention_mask=causal_mask_4d,
                    use_cache=True,
                    output_attentions=False,
                    # return_dict=True, # Add this to ensure returns dict with past_key_values
                )
            else:
                raise ValueError(f"Unsupported backbone model: {self.config.backbone_model}")

            # Aggressive `del`s keep peak GPU memory low during long generations.
            del position_ids
            del inputs_embeds

            hidden_states = outputs.last_hidden_state
            past_key_values = outputs.past_key_values

            # Decode per-codebook logits from the weight-normed heads.
            with P.cached():
                logits = torch.empty(
                    hidden_states.size(0),
                    hidden_states.size(1),
                    self.num_audio_tokens,
                    self.num_vq,
                    dtype=torch.float,
                    device=self.device,
                )
                for num_vq_iter in range(self.num_vq):
                    x: torch.Tensor = self.head_code[num_vq_iter](hidden_states)
                    logits[..., num_vq_iter] = x
                    del x

            del hidden_states

            # Only the last position predicts the next token.
            logits = logits[:, -1].float()

            logits = logits.permute(0, 2, 1)
            logits = logits.reshape(-1, logits.size(2))

            logits /= temperature

            # Repetition penalty / top-p / top-k need the history of sampled
            # tokens; skipped on the very first (BOS) step.
            if not audio_bos:
                input_ids_sliced = new_tokens[:, 0:t].permute(0, 2, 1) # get previous t new tokens
                logits_token = input_ids_sliced.reshape(
                    input_ids_sliced.size(0) * input_ids_sliced.size(1),
                    -1,
                ).to(self.device)

                del input_ids_sliced

                for logitsProcessors in logits_processors:
                    logits = logitsProcessors(logits_token, logits)

                for logitsWarpers in logits_warpers:
                    logits = logitsWarpers(logits_token, logits)

                del logits_token

            # Suppress EOS until the minimum length is reached (or always,
            # when the caller forces continuation).
            if t < min_new_token:
                logits[:, eos_token] = -torch.inf

            if force_no_stop:
                logits[:, eos_token] = -torch.inf

            scores = F.softmax(logits, dim=-1)

            del logits

            idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)

            del scores

            idx_next = idx_next.view(-1, self.num_vq)

            # A sample is finished once ANY codebook produced EOS.
            finish_or = idx_next.eq(eos_token).any(1)
            finish.logical_or_(finish_or)

            del finish_or
            new_tokens[:, t] = idx_next

            if t == 0 and finish.any():
                break

            del idx_next

            if finish.all():
                break

            if pbar is not None:
                pbar.update(1)

        if pbar is not None:
            pbar.close()

        if not finish.all():
            logger.warning(f"incomplete result. hit max_new_token: {max_new_token}")

        # Return tokens up to (excluding) the step that produced EOS.
        genrated_input_ids = new_tokens[:, 0:t, :]

        return MiniCPMTTSGenerationOutput(
            new_ids=genrated_input_ids,
            audio_input_ids=None, # for update purpose
            past_key_values=None, # for update purpose
            past_input_ids=None, # for update purpose
            finished=finish.all(),
        )
+
    # fake streaming
    @torch.inference_mode()
    def generate_mock_legacy_streaming(
        self,
        inputs_embeds: torch.Tensor,
        eos_token: Union[int, torch.Tensor],
        force_no_stop=False,
        min_new_token=50,
        max_new_token=2048,
        show_tqdm=True,
        streaming=False,
        text_lengths=None,
        sampling_params: TTSSamplingParams = TTSSamplingParams(),
        valid_text_length=None,
    ):
        """Non-streaming generation that *simulates* streaming by precomputing
        a chunked attention mask: audio chunks progressively reveal text
        tokens, as in true streaming, but all tokens are sampled in one call.

        Same sampling loop as ``generate``, except a 4D chunk mask (from
        make_streaming_chunk_mask_inference) is sliced per step, and the
        per-codebook temperatures are hard-coded to [0.1, 0.3, 0.1, 0.3]
        (NOTE(review): this assumes num_vq == 4 — confirm against config).
        ``valid_text_length`` is the number of real (non-padding) text tokens
        at the front of ``inputs_embeds`` and is required.

        Returns:
            MiniCPMTTSGenerationOutput with ``new_ids`` (1, num_generated,
            num_vq) and ``finished``.
        """
        assert valid_text_length is not None, "valid_text_length should be not None"

        # Text occupies the whole condition prefix; mark valid tokens plus the
        # trailing [Ptts] position, the rest of the span is padding.
        tts_text_scope = [0, inputs_embeds.shape[1]]
        tts_text_mask = torch.zeros(inputs_embeds.shape[1], dtype=torch.int8, device=inputs_embeds.device)
        tts_text_mask[0:valid_text_length] = 1
        tts_text_mask[-1] = 1 # [Ptts]

        # Full-size chunked visibility mask; sliced per decoding step below.
        streaming_mask_4d_full = make_streaming_chunk_mask_inference(
            tts_text_scope=tts_text_scope,
            tts_text_mask=tts_text_mask,
            dtype=torch.bfloat16,
            device=self.device,
            streaming_audio_chunk_size=50,
            max_sequence_length=4096,
        )

        temperature = torch.tensor([0.1, 0.3, 0.1, 0.3], dtype=torch.float, device=self.device)
        temperature = (temperature.unsqueeze(0).expand(inputs_embeds.shape[0], -1).contiguous().view(-1, 1)).to(
            inputs_embeds.device
        )

        logits_warpers, logits_processors = gen_logits(
            num_code=self.config.num_audio_tokens,
            repetition_penalty=sampling_params.repetition_penalty,
            top_p=sampling_params.top_p,
            top_k=sampling_params.top_k,
        )

        # We only support batch size `1` for now
        assert inputs_embeds.shape[0] == 1
        eos_token = eos_token.to(inputs_embeds.device)
        finish = torch.zeros(inputs_embeds.shape[0], device=inputs_embeds.device).bool()

        condition_length = inputs_embeds.shape[1]
        pbar: Optional[tqdm] = None
        if show_tqdm:
            pbar = tqdm(
                total=max_new_token,
                desc="code",
                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}(max) [{elapsed}, {rate_fmt}{postfix}]",
            )

        new_tokens = torch.zeros(
            inputs_embeds.shape[0],
            max_new_token,
            self.num_vq,
            device=inputs_embeds.device,
            dtype=torch.long,
        )

        past_key_values = None

        for t in range(max_new_token):
            audio_bos = False
            if t == 0:
                # Prefill the whole condition; its mask is the top-left block.
                audio_bos = True
                inputs_embeds = inputs_embeds
                position_ids = torch.tensor(
                    list(range(0, condition_length)),
                    dtype=torch.long,
                    device=self.device,
                ).unsqueeze(0)

                causal_mask_4d = streaming_mask_4d_full[:, :, :condition_length, :condition_length]
            else:
                # Feed back the previous token as the sum of codebook embeddings.
                code_emb = []
                for q in range(self.num_vq):
                    x = self.emb_code[q](new_tokens[:, t - 1 : t, q])
                    code_emb.append(x)

                inputs_embeds = torch.stack(code_emb, 3).sum(3)

                position_ids = torch.tensor([condition_length + t - 1], dtype=torch.long, device=self.device).unsqueeze(
                    0
                )

                # One query row of the precomputed mask, over all cached keys.
                causal_mask_4d = streaming_mask_4d_full[
                    :,
                    :,
                    condition_length + t : condition_length + t + 1,
                    : condition_length + t,
                ]

                # get length of past_key_values
                past_key_values_length = past_key_values[0][0].shape[2]

                assert causal_mask_4d.shape[-1] == (past_key_values_length + 1)

            if self.config.backbone_model == "llama":
                outputs: BaseModelOutputWithPast = self.model(
                    position_ids=position_ids,
                    cache_position=position_ids,
                    past_key_values=past_key_values,
                    inputs_embeds=inputs_embeds,
                    attention_mask=causal_mask_4d,
                    use_cache=True,
                    output_attentions=False,
                    # return_dict=True, # Add this to ensure returns dict with past_key_values
                )
            else:
                raise ValueError(f"Unsupported backbone model: {self.config.backbone_model}")

            del position_ids
            del inputs_embeds

            hidden_states = outputs.last_hidden_state
            past_key_values = outputs.past_key_values

            # Decode per-codebook logits from the weight-normed heads.
            with P.cached():
                logits = torch.empty(
                    hidden_states.size(0),
                    hidden_states.size(1),
                    self.num_audio_tokens,
                    self.num_vq,
                    dtype=torch.float,
                    device=self.device,
                )
                for num_vq_iter in range(self.num_vq):
                    x: torch.Tensor = self.head_code[num_vq_iter](hidden_states)
                    logits[..., num_vq_iter] = x
                    del x

            del hidden_states

            logits = logits[:, -1].float()
            logits = logits.permute(0, 2, 1)
            logits = logits.reshape(-1, logits.size(2))
            logits /= temperature

            # Apply repetition penalty and warpers against the sampled history.
            if not audio_bos:
                input_ids_sliced = new_tokens[:, 0:t].permute(0, 2, 1) # get previous t new tokens

                logits_token = input_ids_sliced.reshape(
                    input_ids_sliced.size(0) * input_ids_sliced.size(1),
                    -1,
                ).to(self.device)

                del input_ids_sliced

                for logitsProcessors in logits_processors:
                    logits = logitsProcessors(logits_token, logits)

                for logitsWarpers in logits_warpers:
                    logits = logitsWarpers(logits_token, logits)

                del logits_token

            if t < min_new_token:
                logits[:, eos_token] = -torch.inf

            if force_no_stop:
                logits[:, eos_token] = -torch.inf

            scores = F.softmax(logits, dim=-1)

            del logits
            idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)

            del scores

            idx_next = idx_next.view(-1, self.num_vq)
            # Finished once any codebook produced EOS.
            finish_or = idx_next.eq(eos_token).any(1)
            finish.logical_or_(finish_or)

            del finish_or
            new_tokens[:, t] = idx_next

            if t == 0 and finish.any():
                break

            del idx_next

            if finish.all():
                break

            if pbar is not None:
                pbar.update(1)

        if pbar is not None:
            pbar.close()

        if not finish.all():
            logger.warning(f"incomplete result. hit max_new_token: {max_new_token}")

        # Exclude the step that carried EOS.
        genrated_input_ids = new_tokens[:, 0:t, :]

        return MiniCPMTTSGenerationOutput(
            new_ids=genrated_input_ids,
            audio_input_ids=None, # for update purpose
            past_key_values=None, # for update purpose
            past_input_ids=None, # for update purpose
            finished=finish.all(),
        )
+
    # non-streaming, interleave
    @torch.inference_mode()
    def generate_chunk(
        self,
        inputs_embeds: torch.Tensor,
        temperature: torch.Tensor,
        repetition_penalty: float,
        eos_token: Union[int, torch.Tensor],
        force_no_stop=False,
        max_new_token=500,
        min_new_tokens=0,
        past_key_values=None,
        logits_processors=None,
        text_start_pos=None,
    ):
        """For inputs_embeds, it should be like [bs=1, seq_len, hidden_dim], its content is like:
        |Text BOS|Spk embeds|Text-Hidden states Interleave (if applicable)|Audio BOS|
        where the last position is the audio BOS token.
        So, the first iteration in generation directly forward the model with inputs_embeds, and
        the last hidden states of the last position (Audio BOS) will be decoded to get the first audio token.

        Generates one chunk of audio codes for interleaved text/audio decoding,
        continuing from ``past_key_values``; positions are offset by
        ``text_start_pos`` so the cache stays consistent across chunks.
        Only codebook 0's embedding is fed back each step (single-VQ path).

        Returns:
            (generated_ids, past_key_values): ids of shape (1, num_generated,
            num_vq) excluding the final EOS-bearing step, plus the updated cache.
        """
        # NOTE(review): the `logits_processors` parameter is immediately
        # shadowed by gen_logits' result, and `logits_warpers` is never
        # applied below — presumably intentional for this chunked path, but
        # worth confirming.
        logits_warpers, logits_processors = gen_logits(
            num_code=self.config.num_audio_tokens, repetition_penalty=repetition_penalty
        )

        # We only support batch size `1` for now
        assert inputs_embeds.shape[0] == 1
        eos_token = eos_token.to(inputs_embeds.device)
        finish = torch.zeros(inputs_embeds.shape[0], device=inputs_embeds.device).bool()

        # One temperature per codebook, flattened to align with reshaped logits.
        temperature = (temperature.unsqueeze(0).expand(inputs_embeds.shape[0], -1).contiguous().view(-1, 1)).to(
            inputs_embeds.device
        )

        condition_length = inputs_embeds.shape[1]

        new_tokens = torch.zeros(
            inputs_embeds.shape[0],
            max_new_token,
            self.num_vq,
            device=inputs_embeds.device,
            dtype=torch.long,
        )

        for t in range(max_new_token):
            audio_bos = False

            # If this is the first audio token, the case is special
            if t == 0:
                audio_bos = True
                inputs_embeds_ = inputs_embeds
                position_ids = torch.tensor(
                    list(range(text_start_pos, text_start_pos + condition_length)),
                    dtype=torch.long,
                    device=self.device,
                ).unsqueeze(0)
            else:
                # Generate the following audio tokens, it is applicable to all other cases, including second and the following calling of `generate`
                inputs_embeds_ = self.emb_code[0](new_tokens[:, t - 1 : t, 0])

                position_ids = torch.tensor(
                    [text_start_pos + condition_length + t - 1], # prefill the previous token
                    dtype=torch.long,
                    device=self.device,
                ).unsqueeze(0)

            outputs: BaseModelOutputWithPast = self.model(
                position_ids=position_ids,
                # cache_position=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds_,
                use_cache=True,
                output_attentions=False,
                # return_dict=True, # Add this to ensure returns dict with past_key_values
            )

            del position_ids
            del inputs_embeds_

            hidden_states = outputs.last_hidden_state
            past_key_values = outputs.past_key_values

            # Decode per-codebook logits from the weight-normed heads.
            with P.cached():
                logits = torch.empty(
                    hidden_states.size(0),
                    hidden_states.size(1),
                    self.num_audio_tokens,
                    self.num_vq,
                    dtype=torch.float,
                    device=self.device,
                )
                for num_vq_iter in range(self.num_vq):
                    x: torch.Tensor = self.head_code[num_vq_iter](hidden_states)
                    logits[..., num_vq_iter] = x
                    del x

            del hidden_states

            logits = logits[:, -1].float()
            logits = logits.permute(0, 2, 1)
            logits = logits.reshape(-1, logits.size(2))

            logits /= temperature

            # Repetition penalty against the tokens sampled so far in this chunk.
            if not audio_bos:
                input_ids_sliced = new_tokens[:, 0:t].permute(0, 2, 1) # get previous t new tokens

                logits_token = input_ids_sliced.reshape(
                    input_ids_sliced.size(0) * input_ids_sliced.size(1),
                    -1,
                ).to(self.device)

                del input_ids_sliced

                for logitsProcessors in logits_processors:
                    logits = logitsProcessors(logits_token, logits)

                del logits_token

            # Suppress EOS while forced or below the minimum length.
            if force_no_stop or t < min_new_tokens:
                logits[:, eos_token] = -torch.inf

            scores = F.softmax(logits, dim=-1)
            del logits

            idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)
            del scores

            idx_next = idx_next.view(-1, self.num_vq)

            # Finished once any codebook produced EOS.
            finish_or = idx_next.eq(eos_token).any(1)
            finish.logical_or_(finish_or)

            del finish_or
            new_tokens[:, t] = idx_next

            if t == 0 and finish.any():
                break

            del idx_next

            if finish.all():
                break

        # The latest generated token is not in the range returned this time. If it is an eos token, it is not returned. If it is a normal token, it is not returned.
        genrated_input_ids = new_tokens[:, 0:t, :]

        return genrated_input_ids, past_key_values
+
    @torch.inference_mode()
    def interleaved_generate(
        self,
        spk_embeds: torch.Tensor,
        conditions: List[torch.Tensor],
        temperature: torch.Tensor,
        repetition_penalty: float,
        eos_token: Union[int, torch.Tensor],
        **kwargs,
    ):
        """
        Generate audio tokens chunk-by-chunk, conditioning each chunk on the previous ones.

        For inputs_embeds, it should be like [bs=1, seq_len, hidden_dim], its content is like:
        |Text BOS|Spk embeds|Text-Hidden states Interleave (if applicable)|Audio BOS|
        where the last position is the audio BOS token.
        So, the first iteration in generation directly forward the model with inputs_embeds, and the last hidden states of the last position (Audio BOS) will be decoded to get the first audio token.

        Args:
            spk_embeds: speaker embedding tensor. NOTE(review): not referenced in this
                body — presumably already folded into `conditions` by the caller; confirm.
            conditions: per-chunk condition embeddings, each shaped like the
                inputs_embeds layout described above.
            temperature: sampling temperature (scalar; converted to a tensor below).
            repetition_penalty: penalty forwarded to `gen_logits`.
            eos_token: audio end-of-sequence token id (tensor; moved to the condition device).

        Returns:
            MiniCPMTTSGenerationOutput with all generated audio tokens concatenated
            along the sequence dimension and `finished=True`.
        """
        temperature = torch.tensor([temperature], dtype=torch.float, device=self.device)

        # Only `logits_processors` (repetition penalty) is consumed below; the
        # warpers returned by gen_logits are unused here.
        logits_warpers, logits_processors = gen_logits(
            num_code=self.config.num_audio_tokens,
            repetition_penalty=repetition_penalty,
        )

        eos_token = eos_token.to(conditions[0].device)

        num_chunks = len(conditions)
        # Running position offset of the current chunk's text inside the KV cache.
        text_start_pos = 0
        # Length (condition + generated tokens) of the previous chunk's window,
        # used to drop the oldest window from the cache in sliding_window mode.
        last_window_size = 0
        past_key_values = None

        for idx in range(num_chunks):
            condition = conditions[idx].to(conditions[0].device)
            if self.attention_type == "sliding_recompute":
                recomputed_conditions = []

                # Periodically rebuild the context from scratch: every
                # (window_size - recomputed_chunks) chunks once past the first
                # window, re-embed the last `recomputed_chunks` chunks
                # (their conditions interleaved with their generated tokens)
                # and prepend them to the current condition.
                if (
                    idx >= self.window_size
                    and (idx - self.recomputed_chunks) % (self.window_size - self.recomputed_chunks) == 0
                ):
                    for i in range(self.recomputed_chunks):
                        recomputed_conditions.append(conditions[idx - self.recomputed_chunks + i])
                        # Re-embed previously generated tokens of that chunk
                        # (first VQ codebook only) via the code embedding table.
                        recomputed_conditions.append(
                            self.emb_code[0](generated_tokens[-self.recomputed_chunks + i][:, :, 0])
                        )
                    recomputed_conditions.append(condition)
                    condition = torch.cat(recomputed_conditions, dim=1)

                    # Cache is discarded (past_key_values=None), so positions restart at 0.
                    text_start_pos = 0
                    new_tokens, old_kv = self.generate_chunk(
                        inputs_embeds=condition,
                        temperature=temperature,
                        repetition_penalty=repetition_penalty,
                        eos_token=eos_token,
                        force_no_stop=False,
                        max_new_token=500,
                        past_key_values=None,
                        logits_processors=logits_processors,
                        text_start_pos=text_start_pos,
                    )

                else:
                    # Between recompute points, keep extending the existing cache.
                    new_tokens, old_kv = self.generate_chunk(
                        inputs_embeds=condition,
                        temperature=temperature,
                        repetition_penalty=repetition_penalty,
                        eos_token=eos_token,
                        force_no_stop=False,
                        max_new_token=500,
                        past_key_values=past_key_values,
                        logits_processors=logits_processors,
                        text_start_pos=text_start_pos,
                    )
            else:
                # "sliding_window" and full-attention modes: always reuse the cache;
                # sliding_window trims it after generation (below).
                new_tokens, old_kv = self.generate_chunk(
                    inputs_embeds=condition,
                    temperature=temperature,
                    repetition_penalty=repetition_penalty,
                    eos_token=eos_token,
                    force_no_stop=False,
                    max_new_token=500,
                    past_key_values=past_key_values,
                    logits_processors=logits_processors,
                    text_start_pos=text_start_pos,
                )

            past_key_values = []
            if self.attention_type == "sliding_window" and idx >= 1:
                # Drop the oldest window from the legacy-tuple KV cache: keep only
                # positions after the previous chunk's window for every layer.
                for layer_idx in range(len(old_kv)):
                    past_key_values.append(
                        (
                            old_kv[layer_idx][0][:, :, last_window_size:, :],
                            old_kv[layer_idx][1][:, :, last_window_size:, :],
                        )
                    )
            else:
                past_key_values = old_kv

            # Window just consumed = this chunk's condition plus what it generated.
            last_window_size = condition.shape[1] + new_tokens.shape[1]
            text_start_pos += last_window_size

            if idx == 0:
                generated_tokens = [new_tokens]
            else:
                generated_tokens.append(new_tokens)

        return MiniCPMTTSGenerationOutput(new_ids=torch.cat(generated_tokens, dim=1), finished=True)
+
+
class CustomRepetitionPenaltyLogitsProcessorRepeat:
    """Frequency-compounded repetition penalty over a sliding token window.

    Unlike the stock HF repetition penalty (flat factor per seen token), a
    token that occurred ``k`` times within the last ``past_window`` generated
    positions is scaled by ``penalty ** k``, pushing its score toward zero.
    """

    def __init__(self, penalty: float, max_input_ids: int, past_window: int):
        # Reject non-float or non-positive penalties up front.
        if not (isinstance(penalty, float) and penalty > 0):
            raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")

        self.penalty = penalty
        self.max_input_ids = max_input_ids
        self.past_window = past_window

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Only the most recent `past_window` tokens contribute to the penalty.
        window = input_ids
        if window.size(1) > self.past_window:
            window = window.narrow(1, -self.past_window, self.past_window)

        # Per-row occurrence count of each vocabulary id inside the window.
        counts = F.one_hot(window, scores.size(1)).sum(1)

        # NOTE(review): this zeroes *rows* (dim 0, the batch/vq dimension) beyond
        # `max_input_ids`, not vocabulary columns — kept verbatim from the
        # original; confirm the intent upstream.
        if counts.size(0) > self.max_input_ids:
            counts.narrow(0, self.max_input_ids, counts.size(0) - self.max_input_ids).zero_()

        factor = torch.pow(self.penalty, counts)
        base = scores.contiguous()
        # Shrink magnitudes toward zero: negatives are multiplied by the factor,
        # non-negatives are divided by it.
        return torch.where(base < 0, base.multiply(factor), base.divide(factor))
+
+
def gen_logits(num_code: int, top_p=0.7, top_k=20, repetition_penalty=1.0):
    """Build the sampling (warpers, processors) pair for audio-token decoding.

    Args:
        num_code: size of the audio-token vocabulary (upper bound passed to the
            repetition-penalty processor).
        top_p: nucleus-sampling threshold; pass None to disable.
        top_k: top-k threshold; pass None to disable.
        repetition_penalty: compounding penalty; None or 1.0 disables it.

    Returns:
        Tuple ``(logits_warpers, logits_processors)``.
    """
    warpers = []
    if top_p is not None:
        warpers.append(TopPLogitsWarper(top_p, min_tokens_to_keep=3))
    if top_k is not None:
        warpers.append(TopKLogitsWarper(top_k, min_tokens_to_keep=3))

    processors = []
    # A penalty of exactly 1 is a no-op, so skip constructing the processor.
    if repetition_penalty is not None and repetition_penalty != 1:
        # 16 = past-window length the penalty looks back over.
        processors.append(CustomRepetitionPenaltyLogitsProcessorRepeat(repetition_penalty, num_code, 16))

    return warpers, processors
+
+
# Copy and modified from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
def prepare_inputs_for_generation(
    self,
    input_ids,
    past_key_values=None,
    attention_mask=None,
    inputs_embeds=None,
    cache_position=None,
    position_ids=None,
    use_cache=True,
    **kwargs,
):
    """Trim inputs to the not-yet-processed suffix and assemble model-call kwargs.

    Patched onto the LM as its `prepare_inputs_for_generation`. Given the
    current cache state, keeps only the unprocessed tokens of `input_ids`,
    derives `position_ids` from `attention_mask` when absent, chooses between
    `inputs_embeds` (first step only) and `input_ids`, and — for a StaticCache
    with a 2D mask — expands the mask to a 4D causal mask.

    Returns:
        dict with keys `input_ids`/`inputs_embeds`, `position_ids`,
        `past_key_values`, `use_cache`, and `attention_mask`.
    """
    if past_key_values is not None:
        if isinstance(past_key_values, Cache):
            cache_length = past_key_values.get_seq_length()
            past_length = past_key_values.seen_tokens
        else:
            # Legacy tuple cache: length is the key tensor's sequence dimension.
            cache_length = past_length = past_key_values[0][0].shape[2]

        # Keep only the unprocessed tokens:
        # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
        # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
        # input)
        if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
        # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
        # input_ids based on the past_length.
        elif past_length < input_ids.shape[1]:
            input_ids = input_ids[:, past_length:]
        # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1] :]

        # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
        # `mode="reduce-overhead"`, as otherwise the input `position_ids` would have various
        # strides during the decoding. Here, simply using `.contiguous()` is not sufficient as
        # in the batch size = 1 case, `position_ids` is already contiguous but with varying
        # stride which retriggers a capture.
        position_ids = position_ids.clone(memory_format=torch.contiguous_format)

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and cache_position[0] == 0:
        model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
    else:
        # The clone here is for the same reason as for `position_ids`.
        model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

    if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
        if model_inputs["inputs_embeds"] is not None:
            batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
            device = model_inputs["inputs_embeds"].device
        else:
            batch_size, sequence_length = model_inputs["input_ids"].shape
            device = model_inputs["input_ids"].device

        dtype = self.lm_head.weight.dtype
        min_dtype = torch.finfo(dtype).min

        # Borrow the 4D-causal-mask helper; a StaticCache needs the mask expanded
        # up to its fixed maximum length.
        from transformers.models.paligemma.modeling_paligemma import (
            _prepare_4d_causal_attention_mask_with_cache_position,
        )

        attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=past_key_values.get_max_length(),
            dtype=dtype,
            device=device,
            min_dtype=min_dtype,
            cache_position=cache_position,
            batch_size=batch_size,
        )

    model_inputs.update(
        {
            "position_ids": position_ids,
            # "cache_position": cache_position,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
            "attention_mask": attention_mask,
        }
    )

    return model_inputs