prince-canuma committed on Mar 5

Commit

592c09d

verified ·

1 Parent(s): 9bb169d

Upload folder using huggingface_hub

Browse files

Files changed (21) hide show

.gitattributes +1 -0
README.md +24 -0
added_tokens.json +107 -0
chat_template.jinja +88 -0
config.json +297 -0
configuration_minicpmo.py +260 -0
generation_config.json +12 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling_minicpmo.py +0 -0
modeling_navit_siglip.py +981 -0
preprocessor_config.json +35 -0
processing_minicpmo.py +1665 -0
processor_config.json +89 -0
special_tokens_map.json +580 -0
tokenization_minicpmo_fast.py +120 -0
tokenizer.json +3 -0
tokenizer_config.json +6989 -0
utils.py +2417 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,24 @@

+---
+license: apache-2.0
+pipeline_tag: any-to-any
+library_name: transformers
+tags:
+- minicpm-o
+- minicpm-v
+- multimodal
+- full-duplex
+- mlx
+---
+# mlx-community/MiniCPM-o-4_5-4bit
+This model was converted to MLX format from [`openbmb/MiniCPM-o-4_5`](https://huggingface.co/openbmb/MiniCPM-o-4_5) using mlx-vlm version **0.3.13**.
+Refer to the [original model card](https://huggingface.co/openbmb/MiniCPM-o-4_5) for more details on the model.
+## Use with mlx
+```bash
+pip install -U mlx-vlm
+```
+```bash
+python -m mlx_vlm.generate --model mlx-community/MiniCPM-o-4_5-4bit --max-tokens 100 --temperature 0.0 --prompt "Describe this image." --image <path_to_image>
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "</answer>": 151686,
+  "</box>": 151674,
+  "</focus>": 151688,
+  "</image>": 151670,
+  "</image_id>": 151682,
+  "</image_save_to>": 151696,
+  "</line>": 151690,
+  "</perception>": 151692,
+  "</point>": 151678,
+  "</quad>": 151676,
+  "</ref>": 151672,
+  "</slice>": 151680,
+  "</source_image>": 151694,
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "</unit>": 151684,
+  "<answer>": 151685,
+  "<box>": 151673,
+  "<focus>": 151687,
+  "<image>": 151669,
+  "<image_id>": 151681,
+  "<image_save_to>": 151695,
+  "<line>": 151689,
+  "<perception>": 151691,
+  "<point>": 151677,
+  "<quad>": 151675,
+  "<ref>": 151671,
+  "<slice>": 151679,
+  "<source_image>": 151693,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<unit>": 151683,
+  "<|audio_end|>": 151699,
+  "<|audio_start|>": 151697,
+  "<|audio|>": 151698,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|emotion_end|>": 151711,
+  "<|emotion_start|>": 151710,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|interrupt|>": 151707,
+  "<|listen|>": 151705,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|pitch_end|>": 151715,
+  "<|pitch_start|>": 151714,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|speak|>": 151706,
+  "<|speed_end|>": 151713,
+  "<|speed_start|>": 151712,
+  "<|spk_bos|>": 151700,
+  "<|spk_eos|>": 151702,
+  "<|spk|>": 151701,
+  "<|turn_bos|>": 151716,
+  "<|timbre_10|>": 151726,
+  "<|timbre_11|>": 151727,
+  "<|timbre_12|>": 151728,
+  "<|timbre_13|>": 151729,
+  "<|timbre_14|>": 151730,
+  "<|timbre_15|>": 151731,
+  "<|timbre_16|>": 151732,
+  "<|timbre_17|>": 151733,
+  "<|timbre_18|>": 151734,
+  "<|timbre_19|>": 151735,
+  "<|turn_eos|>": 151717,
+  "<|timbre_20|>": 151736,
+  "<|timbre_21|>": 151737,
+  "<|timbre_22|>": 151738,
+  "<|timbre_23|>": 151739,
+  "<|timbre_24|>": 151740,
+  "<|timbre_25|>": 151741,
+  "<|timbre_26|>": 151742,
+  "<|timbre_27|>": 151743,
+  "<|timbre_28|>": 151744,
+  "<|timbre_29|>": 151745,
+  "<|chunk_eos|>": 151718,
+  "<|timbre_30|>": 151746,
+  "<|timbre_31|>": 151747,
+  "<|chunk_bos|>": 151719,
+  "<|chunk_tts_bos|>": 151720,
+  "<|chunk_tts_eos|>": 151721,
+  "<|tts_pad|>": 151722,
+  "<|timbre_7|>": 151723,
+  "<|timbre_8|>": 151724,
+  "<|timbre_9|>": 151725,
+  "<|tts_bos|>": 151703,
+  "<|tts_eos|>": 151704,
+  "<|vad_end|>": 151709,
+  "<|vad_start|>": 151708,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,88 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+    {%- if use_tts_template is defined and use_tts_template is true %}
+        {{- '<|tts_bos|>' }}
+    {%- endif %}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,297 @@

+{
+    "architectures": [
+        "MiniCPMO"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "audio_chunk_length": 1.0,
+    "audio_config": {
+        "_attn_implementation_autoset": true,
+        "_name_or_path": "openai/whisper-medium",
+        "activation_dropout": 0.0,
+        "activation_function": "gelu",
+        "apply_spec_augment": false,
+        "architectures": [
+            "MiniCPMWhisperEncoder"
+        ],
+        "attention_dropout": 0.0,
+        "begin_suppress_tokens": [
+            220,
+            50257
+        ],
+        "bos_token_id": 50257,
+        "classifier_proj_size": 256,
+        "d_model": 1024,
+        "decoder_attention_heads": 16,
+        "decoder_ffn_dim": 4096,
+        "decoder_layerdrop": 0.0,
+        "decoder_layers": 24,
+        "decoder_start_token_id": 50258,
+        "dropout": 0.0,
+        "encoder_attention_heads": 16,
+        "encoder_ffn_dim": 4096,
+        "encoder_layerdrop": 0.0,
+        "encoder_layers": 24,
+        "eos_token_id": 50257,
+        "forced_decoder_ids": [
+            [
+                1,
+                50259
+            ],
+            [
+                2,
+                50359
+            ],
+            [
+                3,
+                50363
+            ]
+        ],
+        "init_std": 0.02,
+        "mask_feature_length": 10,
+        "mask_feature_min_masks": 0,
+        "mask_feature_prob": 0.0,
+        "mask_time_length": 10,
+        "mask_time_min_masks": 2,
+        "mask_time_prob": 0.05,
+        "max_length": 448,
+        "max_source_positions": 1500,
+        "max_target_positions": 448,
+        "median_filter_width": 7,
+        "model_type": "whisper",
+        "num_hidden_layers": 24,
+        "num_mel_bins": 80,
+        "pad_token_id": 50257,
+        "scale_embedding": false,
+        "suppress_tokens": [
+            1,
+            2,
+            7,
+            8,
+            9,
+            10,
+            14,
+            25,
+            26,
+            27,
+            28,
+            29,
+            31,
+            58,
+            59,
+            60,
+            61,
+            62,
+            63,
+            90,
+            91,
+            92,
+            93,
+            359,
+            503,
+            522,
+            542,
+            873,
+            893,
+            902,
+            918,
+            922,
+            931,
+            1350,
+            1853,
+            1982,
+            2460,
+            2627,
+            3246,
+            3253,
+            3268,
+            3536,
+            3846,
+            3961,
+            4183,
+            4667,
+            6585,
+            6647,
+            7273,
+            9061,
+            9383,
+            10428,
+            10929,
+            11938,
+            12033,
+            12331,
+            12562,
+            13793,
+            14157,
+            14635,
+            15265,
+            15618,
+            16553,
+            16604,
+            18362,
+            18956,
+            20075,
+            21675,
+            22520,
+            26130,
+            26161,
+            26435,
+            28279,
+            29464,
+            31650,
+            32302,
+            32470,
+            36865,
+            42863,
+            47425,
+            49870,
+            50254,
+            50258,
+            50358,
+            50359,
+            50360,
+            50361,
+            50362
+        ],
+        "torch_dtype": "float32",
+        "use_cache": true,
+        "use_weighted_layer_sum": false,
+        "vocab_size": 51865
+    },
+    "audio_pool_step": 5,
+    "auto_map": {
+        "AutoConfig": "configuration_minicpmo.MiniCPMOConfig",
+        "AutoModel": "modeling_minicpmo.MiniCPMO",
+        "AutoModelForCausalLM": "modeling_minicpmo.MiniCPMO"
+    },
+    "batch_vision_input": true,
+    "bos_token_id": 151643,
+    "drop_vision_last_layer": false,
+    "eos_token_id": [
+        151645,
+        151643
+    ],
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "image_size": 448,
+    "init_audio": true,
+    "init_tts": true,
+    "init_vision": true,
+    "initializer_range": 0.02,
+    "intermediate_size": 12288,
+    "listen_speak_type": "asr",
+    "max_position_embeddings": 40960,
+    "max_window_layers": 36,
+    "model_type": "minicpmo",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "patch_size": 14,
+    "quantization": {
+        "group_size": 64,
+        "bits": 4,
+        "mode": "affine"
+    },
+    "quantization_config": {
+        "group_size": 64,
+        "bits": 4,
+        "mode": "affine"
+    },
+    "query_num": 64,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "slice_config": {
+        "max_slice_nums": 1,
+        "model_type": "minicpmv",
+        "patch_size": 14,
+        "scale_resolution": 448
+    },
+    "slice_mode": true,
+    "sliding_window": null,
+    "stream_input": true,
+    "tie_word_embeddings": false,
+    "transformers_version": "4.51.0",
+    "tts_config": {
+        "_attn_implementation_autoset": true,
+        "attention_type": "full_attention",
+        "attn_implementation": "sdpa",
+        "audio_bos_token_id": 151687,
+        "audio_tokenizer_sample_rate": 16000,
+        "audio_tokenizer_type": "s3tokenizer",
+        "aug_layer_loss_weight": false,
+        "aug_loss_weight": false,
+        "backbone_model": "llama",
+        "condition_type": "hidden_text_merge",
+        "cosyvoice_config_path": null,
+        "cosyvoice_model_dir": null,
+        "filter_tts_loss": false,
+        "hidden_act": "silu",
+        "hidden_size": 768,
+        "interleaved": false,
+        "intermediate_size": 3072,
+        "llm_dim": 4096,
+        "llm_dim_model_base": 256,
+        "llm_down_scale": false,
+        "llm_hidden_size": 4096,
+        "llm_intermediate_size": 768,
+        "long_weight": 0.1,
+        "max_position_embeddings": 4096,
+        "model_type": "minicpmtts",
+        "normalize_projected_hidden": true,
+        "num_attention_heads": 12,
+        "num_audio_tokens": 6562,
+        "num_hidden_layers": 20,
+        "num_key_value_heads": 12,
+        "num_mel_bins": 100,
+        "num_text_tokens": 152064,
+        "num_vq": 1,
+        "projector_type": "mlp",
+        "recomputed_chunks": 1,
+        "s3_stream_chunk_size": 25,
+        "s3_stream_generate": false,
+        "s3_stream_n_timesteps": 10,
+        "s3_stream_prelook_size": 3,
+        "short_weight": 0.1,
+        "streaming": false,
+        "streaming_audio_chunk_size": 50,
+        "streaming_sliding_window": false,
+        "streaming_sliding_window_audio_frame_rate": 50,
+        "streaming_sliding_window_audio_init_text_length": 10,
+        "streaming_sliding_window_audio_window_size": 300,
+        "streaming_sliding_window_average_speed": 5,
+        "streaming_sliding_window_fast_speed": 7,
+        "streaming_sliding_window_max_text_len": 500,
+        "streaming_sliding_window_slow_speed": 3,
+        "streaming_sliding_window_text_window_size": 50,
+        "streaming_text_chunk_max": 7,
+        "streaming_text_chunk_min": 3,
+        "streaming_text_reserved_len": 300,
+        "text_eos_token_id": 151692,
+        "tts_filter_loss_fix": false,
+        "use_llm_hidden_state": false,
+        "use_text": true,
+        "window_size": 2
+    },
+    "use_cache": true,
+    "use_image_id": true,
+    "use_sliding_window": false,
+    "version": "4.5",
+    "vision_batch_size": 16,
+    "vision_config": {
+        "_attn_implementation_autoset": true,
+        "attention_dropout": 0.0,
+        "hidden_act": "gelu_pytorch_tanh",
+        "hidden_size": 1152,
+        "image_size": 980,
+        "intermediate_size": 4304,
+        "layer_norm_eps": 1e-06,
+        "model_type": "siglip_vision_model",
+        "num_attention_heads": 16,
+        "num_channels": 3,
+        "num_hidden_layers": 27,
+        "patch_size": 14
+    },
+    "vocab_size": 151748
+}

configuration_minicpmo.py ADDED Viewed

	@@ -0,0 +1,260 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2026 The OpenBMB Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Union
+from transformers import PretrainedConfig
+from transformers import Qwen3Config
+from transformers import WhisperConfig
+from transformers.utils import logging
+from .modeling_navit_siglip import SiglipVisionConfig
+logger = logging.get_logger(__name__)
+class MiniCPMVSliceConfig(PretrainedConfig):
+    model_type = "minicpmv"
+    def __init__(
+        self,
+        patch_size=14,
+        max_slice_nums=9,
+        scale_resolution=448,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        self.max_slice_nums = max_slice_nums
+        self.scale_resolution = scale_resolution
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        if config_dict.get("model_type") == "minicpmv":
+            config_dict = config_dict["slice_config"]
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+        return cls.from_dict(config_dict, **kwargs)
+class MiniCPMTTSConfig(PretrainedConfig):
+    """Configuration holder for the ``tts_config`` section of a MiniCPM-o config.
+    Every named constructor argument is stored unchanged as an attribute of the same
+    name. The one derived attribute is ``llm_hidden_size``, which is always set to the
+    value of ``llm_dim``. Arguments not named in the signature are forwarded to
+    ``PretrainedConfig.__init__`` through ``**kwargs``.
+    NOTE(review): the base class presumably stores unknown kwargs as attributes; if so,
+    an ``llm_hidden_size`` value supplied via kwargs is overwritten by ``llm_dim`` --
+    confirm against the installed transformers version.
+    """
+    model_type = "minicpmtts"
+    def __init__(
+        self,
+        llm_dim: int = 2560,
+        llm_intermediate_size: int = 768,
+        llm_down_scale: bool = False,
+        llm_dim_model_base: int = 256,
+        projector_type: str = "mlp",
+        hidden_act: str = "silu",
+        aug_loss_weight: bool = False,
+        aug_layer_loss_weight: bool = False,
+        filter_tts_loss: bool = False,
+        tts_filter_loss_fix: bool = False,
+        long_weight: float = 0.1,
+        short_weight: float = 0.1,
+        hidden_size: int = 768,
+        intermediate_size: int = 3072,
+        num_attention_heads: int = 12,
+        num_hidden_layers: int = 20,
+        num_key_value_heads: int = 12,
+        max_position_embeddings: int = 4096,
+        num_audio_tokens: int = 4097,
+        num_text_tokens: int = 21178,
+        num_mel_bins: int = 100,
+        num_vq: int = 1,
+        use_llm_hidden_state: bool = False,
+        audio_bos_token_id: int = 21132,
+        text_eos_token_id: int = 21133,
+        use_text: bool = True,
+        streaming: bool = False,
+        streaming_text_chunk_min: int = 3,
+        streaming_text_chunk_max: int = 7,
+        streaming_text_reserved_len: int = 300,
+        streaming_audio_chunk_size: int = 50,
+        attn_implementation: str = "sdpa",
+        condition_type: str = "llm_hidden",
+        backbone_model: str = "llama",
+        audio_tokenizer_type: str = "wavtokenizer",
+        audio_tokenizer_sample_rate: int = 24000,
+        streaming_sliding_window: bool = False,
+        streaming_sliding_window_max_text_len: int = 500,
+        streaming_sliding_window_average_speed: int = 5,
+        streaming_sliding_window_fast_speed: int = 7,
+        streaming_sliding_window_slow_speed: int = 3,
+        streaming_sliding_window_audio_frame_rate: int = 50,
+        streaming_sliding_window_audio_init_text_length: int = 10,
+        streaming_sliding_window_audio_window_size: int = 300,
+        normalize_projected_hidden: bool = False,
+        interleaved: bool = False,
+        attention_type: str = "sliding_recompute",
+        recomputed_chunks: int = 1,
+        window_size: int = 2,
+        **kwargs,
+    ):
+        # Base-class init runs first, so the explicit assignments below take precedence
+        # over anything the base class may have set from **kwargs.
+        super().__init__(**kwargs)
+        # Settings describing the upstream LLM side and the projector.
+        self.llm_dim = llm_dim
+        # Alias of llm_dim; not an independent constructor parameter.
+        self.llm_hidden_size = llm_dim
+        self.llm_intermediate_size = llm_intermediate_size
+        self.llm_down_scale = llm_down_scale
+        self.llm_dim_model_base = llm_dim_model_base
+        self.projector_type = projector_type
+        # Loss-related flags and weights.
+        self.aug_loss_weight = aug_loss_weight
+        self.aug_layer_loss_weight = aug_layer_loss_weight
+        self.tts_filter_loss_fix = tts_filter_loss_fix
+        self.filter_tts_loss = filter_tts_loss
+        self.long_weight = long_weight
+        self.short_weight = short_weight
+        # Transformer dimensions of the TTS model itself.
+        self.hidden_act = hidden_act
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.num_key_value_heads = num_key_value_heads
+        self.max_position_embeddings = max_position_embeddings
+        # Vocabulary sizes and special token ids.
+        self.num_audio_tokens = num_audio_tokens
+        self.num_text_tokens = num_text_tokens
+        self.num_mel_bins = num_mel_bins
+        self.num_vq = num_vq
+        self.use_llm_hidden_state = use_llm_hidden_state
+        self.audio_bos_token_id = audio_bos_token_id
+        self.text_eos_token_id = text_eos_token_id
+        self.use_text = use_text
+        # Streaming chunk settings.
+        self.streaming = streaming
+        self.streaming_text_chunk_min = streaming_text_chunk_min
+        self.streaming_text_chunk_max = streaming_text_chunk_max
+        self.streaming_text_reserved_len = streaming_text_reserved_len
+        self.streaming_audio_chunk_size = streaming_audio_chunk_size
+        # Backbone, attention implementation and audio tokenizer selection.
+        self.attn_implementation = attn_implementation
+        self.condition_type = condition_type
+        self.backbone_model = backbone_model
+        self.audio_tokenizer_type = audio_tokenizer_type
+        self.audio_tokenizer_sample_rate = audio_tokenizer_sample_rate
+        # Streaming sliding-window settings.
+        self.streaming_sliding_window = streaming_sliding_window
+        self.streaming_sliding_window_max_text_len = streaming_sliding_window_max_text_len
+        self.streaming_sliding_window_average_speed = streaming_sliding_window_average_speed
+        self.streaming_sliding_window_fast_speed = streaming_sliding_window_fast_speed
+        self.streaming_sliding_window_slow_speed = streaming_sliding_window_slow_speed
+        self.streaming_sliding_window_audio_frame_rate = streaming_sliding_window_audio_frame_rate
+        self.streaming_sliding_window_audio_init_text_length = streaming_sliding_window_audio_init_text_length
+        self.streaming_sliding_window_audio_window_size = streaming_sliding_window_audio_window_size
+        # Remaining attention / layout options.
+        self.normalize_projected_hidden = normalize_projected_hidden
+        self.interleaved = interleaved
+        self.attention_type = attention_type
+        self.recomputed_chunks = recomputed_chunks
+        self.window_size = window_size
+class MiniCPMOConfig(Qwen3Config):
+    model_type = "minicpmo"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    default_vision_config = {
+        "hidden_size": 1152,
+        "image_size": 980,
+        "intermediate_size": 4304,
+        "model_type": "siglip",
+        "num_attention_heads": 16,
+        "num_hidden_layers": 27,
+        "patch_size": 14,
+    }
+    def __init__(
+        self,
+        use_cache=True,
+        query_num=64,
+        image_size=448,
+        drop_vision_last_layer=True,
+        batch_vision_input=True,
+        slice_config=None,
+        vision_config=None,
+        audio_config=None,
+        tts_config=None,
+        use_image_id=True,
+        vision_batch_size=16,
+        audio_pool_step=5,
+        audio_chunk_length=1.0,
+        stream_input=False,
+        listen_speak_type="asr",
+        init_vision=True,
+        init_audio=True,
+        init_tts=True,
+        **kwargs,
+    ):
+        self.use_cache = use_cache
+        self.query_num = query_num
+        self.image_size = image_size
+        self.drop_vision_last_layer = drop_vision_last_layer
+        self.batch_vision_input = batch_vision_input
+        self.use_image_id = use_image_id
+        self.vision_batch_size = vision_batch_size
+        self.audio_pool_step = audio_pool_step
+        self.audio_chunk_length = audio_chunk_length
+        self.stream_input = stream_input
+        self.listen_speak_type = listen_speak_type
+        self.init_vision = init_vision
+        self.init_audio = init_audio
+        self.init_tts = init_tts
+        if slice_config is None:
+            self.slice_config = MiniCPMVSliceConfig(max_slice_nums=1)
+        else:
+            self.slice_config = MiniCPMVSliceConfig(**slice_config)
+        self.slice_mode = True
+        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
+        if vision_config is None:
+            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
+            logger.info("vision_config is None, using default vision config")
+        elif isinstance(vision_config, dict):
+            self.vision_config = SiglipVisionConfig(**vision_config)
+        elif isinstance(vision_config, SiglipVisionConfig):
+            self.vision_config = vision_config
+        if audio_config is None:
+            self.audio_config = WhisperConfig()
+        elif isinstance(audio_config, dict):
+            self.audio_config = WhisperConfig(**audio_config)
+        elif isinstance(audio_config, WhisperConfig):
+            self.audio_config = audio_config
+        if tts_config is None:
+            self.tts_config = MiniCPMTTSConfig()
+        elif isinstance(tts_config, dict):
+            self.tts_config = MiniCPMTTSConfig(**tts_config)
+        elif isinstance(tts_config, MiniCPMTTSConfig):
+            self.tts_config = tts_config
+        self.patch_size = self.vision_config.patch_size
+        super().__init__(**kwargs)

generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95
+}

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f90098983cc0aa9ebbe229df10c3de352f33a07ef57c321e76437bab5fa5a2c7
+size 5361810143

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:94eec5685bbbdd2611070d789aa5e5beba9a8fcbd9ba6849762160a9813a0a1d
+size 781665788

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_minicpmo.py ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_navit_siglip.py ADDED Viewed

	@@ -0,0 +1,981 @@

+# coding=utf-8
+# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Siglip model."""
+# Copied from  HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional
+from typing import Tuple
+from typing import Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn.init import _calculate_fan_in_and_fan_out
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+from transformers.modeling_outputs import BaseModelOutput
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import add_start_docstrings
+from transformers.utils import add_start_docstrings_to_model_forward
+from transformers.utils import is_flash_attn_2_available
+from transformers.utils import logging
+from transformers.utils import ModelOutput
+from transformers.utils import replace_return_docstrings
+logger = logging.get_logger(__name__)
+class SiglipVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
+    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
+    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input images.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    Example:
+    ```python
+    >>> from transformers import SiglipVisionConfig, SiglipVisionModel
+    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
+    >>> configuration = SiglipVisionConfig()
+    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
+    >>> model = SiglipVisionModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "siglip_vision_model"
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=224,
+        patch_size=16,
+        hidden_act="gelu_pytorch_tanh",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        # get the vision config dict if we are loading from SiglipConfig
+        if config_dict.get("model_type") == "siglip":
+            config_dict = config_dict["vision_config"]
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+        return cls.from_dict(config_dict, **kwargs)
+# Name of the reference checkpoint (presumably meant for generated documentation; not used in the visible code).
+_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
+# Hub identifiers of the SigLIP checkpoints listed by this module.
+SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "google/siglip-base-patch16-224",
+    # See all SigLIP models at https://huggingface.co/models?filter=siglip
+]
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func
+    from flash_attn import flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis  # noqa
+    from flash_attn.bert_padding import pad_input
+    from flash_attn.bert_padding import unpad_input
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+def _trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    if tensor.dtype in [torch.float16, torch.bfloat16]:
+        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
+        og_dtype = tensor.dtype
+        tensor = tensor.to(torch.float32)
+        tensor.erfinv_()
+        tensor = tensor.to(og_dtype)
+    else:
+        tensor.erfinv_()
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+    # Clamp to ensure it's in the proper range
+    if tensor.dtype == torch.float16:
+        # The `clamp_` op is not (yet?) defined in float16+cpu
+        tensor = tensor.to(torch.float32)
+        tensor.clamp_(min=a, max=b)
+        tensor = tensor.to(torch.float16)
+    else:
+        tensor.clamp_(min=a, max=b)
+def trunc_normal_tf_(
+    tensor: torch.Tensor,
+    mean: float = 0.0,
+    std: float = 1.0,
+    a: float = -2.0,
+    b: float = 2.0,
+) -> torch.Tensor:
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \\leq \text{mean} \\leq b`.
+    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsquently scaled and shifted by the mean and std args.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    """
+    with torch.no_grad():
+        _trunc_normal_(tensor, 0, 1.0, a, b)
+        tensor.mul_(std).add_(mean)
+def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+    if mode == "fan_in":
+        denom = fan_in
+    elif mode == "fan_out":
+        denom = fan_out
+    elif mode == "fan_avg":
+        denom = (fan_in + fan_out) / 2
+    variance = scale / denom
+    if distribution == "truncated_normal":
+        # constant is stddev of standard normal truncated to (-2, 2)
+        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
+    elif distribution == "normal":
+        with torch.no_grad():
+            tensor.normal_(std=math.sqrt(variance))
+    elif distribution == "uniform":
+        bound = math.sqrt(3 * variance)
+        with torch.no_grad():
+            tensor.uniform_(-bound, bound)
+    else:
+        raise ValueError(f"invalid distribution {distribution}")
+def lecun_normal_(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+def default_flax_embed_init(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="normal")
+@dataclass
+# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
+class SiglipVisionModelOutput(ModelOutput):
+    """
+    Output container for the vision model; besides the usual encoder outputs it can carry image embeddings
+    obtained by pooling the last hidden states.
+    Args:
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
+            The image embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+    # Every field defaults to `None`; the declaration order below is the field order of the dataclass.
+    image_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+class SiglipVisionEmbeddings(nn.Module):
+    """
+    Turns pixel values into patch embeddings and adds learned position embeddings.
+    Position ids are computed per sample from the number of valid patches along each axis, so images with
+    different patch grids inside one padded batch are all mapped onto the same
+    `num_patches_per_side x num_patches_per_side` table of position embeddings.
+    """
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        # kernel_size == stride == patch_size with no padding: one output position per non-overlapping patch.
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+        # Size of the position table: a square grid of (image_size // patch_size) patches per side.
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            pixel_values: batch of images fed to the patch convolution; dims 2 and 3 are read as the padded
+                height and width.
+            patch_attention_mask: one mask per sample; indexed as `mask[:, 0]` / `mask[0]` and flattened, so
+                presumably of shape `(batch, max_patches_h, max_patches_w)` with `True` on valid patches.
+            tgt_sizes: optional per-sample `(patches_h, patches_w)`; when omitted, the counts are derived from
+                the first column and first row of the mask.
+        Returns:
+            Patch embeddings with position embeddings added, one row per patch of the padded grid.
+        """
+        batch_size = pixel_values.size(0)
+        patch_embeds = self.patch_embedding(pixel_values)
+        # Collapse the spatial grid into a sequence axis and move channels last.
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+        max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
+        max_nb_patches_h, max_nb_patches_w = (
+            max_im_h // self.patch_size,
+            max_im_w // self.patch_size,
+        )
+        # Bucket edges at 1/side, 2/side, ...: they split [0, 1) into `num_patches_per_side` cells.
+        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
+        # Positions that are masked out keep id 0.
+        position_ids = torch.full(
+            size=(
+                batch_size,
+                max_nb_patches_h * max_nb_patches_w,
+            ),
+            fill_value=0,
+        )
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            if tgt_sizes is not None:
+                nb_patches_h = tgt_sizes[batch_idx][0]
+                nb_patches_w = tgt_sizes[batch_idx][1]
+            else:
+                # Count valid patches along the first column / first row of the mask.
+                nb_patches_h = p_attn_mask[:, 0].sum()
+                nb_patches_w = p_attn_mask[0].sum()
+            # Fractional coordinate in [0, 1) of every valid row / column of this sample.
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+            # Map each fractional coordinate to a row / column index of the position table.
+            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+            # Row-major flat index into the (side x side) position table.
+            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
+            # Write ids only at the valid positions; the mask is moved to CPU to index `position_ids`.
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+        position_ids = position_ids.to(self.position_embedding.weight.device)
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+class SiglipAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        batch_size, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        k_v_seq_len = key_states.shape[-2]
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
+        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights
+class SiglipFlashAttention2(SiglipAttention):
+    """
+    Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.is_causal = False  # Hack to make sure we don't use a causal mask
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        output_attentions = False
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # if past_key_value is not None:
+        #     cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+        #     key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        dropout_rate = self.dropout if self.training else 0.0
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (LlamaRMSNorm handles it correctly)
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+            logger.warning_once(
+                "The input hidden states seems to be silently casted in float32, this might be related to the fact"
+                " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        attn_output = self._flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+        )
+        attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights
+    def _flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_length,
+        dropout=0.0,
+        softmax_scale=None,
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`int`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+        causal = self.is_causal and query_length != 1
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            (
+                query_states,
+                key_states,
+                value_states,
+                indices_q,
+                cu_seq_lens,
+                max_seq_lens,
+            ) = self._upad_input(query_states, key_states, value_states, attention_mask, query_length)
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states,
+                key_states,
+                value_states,
+                dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+        return attn_output
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
+            indices_k,
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
+            indices_k,
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
+                indices_k,
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
+class SiglipMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
+class SiglipEncoderLayer(nn.Module):
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self.self_attn = SiglipAttention(config) if not self._use_flash_attention_2 else SiglipFlashAttention2(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`):
+                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+class SiglipPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = SiglipVisionConfig
+    base_model_prefix = "siglip"
+    supports_gradient_checkpointing = True
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, SiglipVisionEmbeddings):
+            width = self.config.hidden_size
+            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
+        elif isinstance(module, nn.Embedding):
+            default_flax_embed_init(module.weight)
+        elif isinstance(module, SiglipAttention):
+            nn.init.normal_(module.q_proj.weight)
+            nn.init.normal_(module.k_proj.weight)
+            nn.init.normal_(module.v_proj.weight)
+            nn.init.normal_(module.out_proj.weight)
+            nn.init.zeros_(module.q_proj.bias)
+            nn.init.zeros_(module.k_proj.bias)
+            nn.init.zeros_(module.v_proj.bias)
+            nn.init.zeros_(module.out_proj.bias)
+        elif isinstance(module, SiglipMLP):
+            nn.init.normal_(module.fc1.weight)
+            nn.init.normal_(module.fc2.weight)
+            nn.init.normal_(module.fc1.bias, std=1e-6)
+            nn.init.normal_(module.fc2.bias, std=1e-6)
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
+            lecun_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+# Shared introductory docstring text for the SigLIP model classes (presumably attached through the
+# `add_start_docstrings` helper imported above; the consumer is not visible in this part of the file).
+SIGLIP_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+# Shared docstring text describing the inputs of the vision model's forward pass.
+SIGLIP_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
+class SiglipEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`SiglipEncoderLayer`].
+    Args:
+        config: SiglipConfig
+    """
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+    # Ignore copy
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    encoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions=output_attentions,
+                )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+        )
+@add_start_docstrings(
+    """The vision model from SigLIP without any head or projection on top.""",
+    SIGLIP_START_DOCSTRING,
+)
+class SiglipVisionTransformer(SiglipPreTrainedModel):
+    config_class = SiglipVisionConfig
+    main_input_name = "pixel_values"
+    _supports_flash_attn_2 = True
+    _no_split_modules = []
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = SiglipVisionEmbeddings(config)
+        self.encoder = SiglipEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.embeddings.patch_embedding
+    @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
+    def forward(
+        self,
+        pixel_values,
+        patch_attention_mask: Optional[torch.BoolTensor] = None,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        r"""
+        Returns:
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        batch_size = pixel_values.size(0)
+        if patch_attention_mask is None:
+            patch_attention_mask = torch.ones(
+                size=(
+                    batch_size,
+                    pixel_values.size(2) // self.config.patch_size,
+                    pixel_values.size(3) // self.config.patch_size,
+                ),
+                dtype=torch.bool,
+                device=pixel_values.device,
+            )
+        hidden_states = self.embeddings(
+            pixel_values=pixel_values,
+            patch_attention_mask=patch_attention_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
+        # The call to `_upad_input` in `_flash_attention_forward` is expensive
+        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
+        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
+        if not torch.any(~patch_attention_mask):
+            attention_mask = None
+        else:
+            attention_mask = (
+                _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
+                if not self._use_flash_attention_2
+                else patch_attention_mask
+            )
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+        if not return_dict:
+            return (last_hidden_state, None) + encoder_outputs[1:]
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=None,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "image_processor_type": "MiniCPMVImageProcessor",
+  "feature_extractor_type": "MiniCPMAAudioProcessor",
+  "auto_map": {
+    "AutoProcessor": "processing_minicpmo.MiniCPMOProcessor",
+    "AutoImageProcessor": "processing_minicpmo.MiniCPMVImageProcessor",
+    "AutoFeatureExtractor": "processing_minicpmo.MiniCPMAAudioProcessor"
+  },
+  "processor_class": "MiniCPMOProcessor",
+  "max_slice_nums": 9,
+  "scale_resolution": 448,
+  "patch_size": 14,
+  "use_image_id": true,
+  "image_feature_size": 64,
+  "im_start": "<image>",
+  "im_end": "</image>",
+  "slice_start": "<slice>",
+  "slice_end": "</slice>",
+  "unk": "<unk>",
+  "im_id_start": "<image_id>",
+  "im_id_end": "</image_id>",
+  "slice_mode": true,
+  "audio_pool_step": 5,
+  "norm_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "norm_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "version": 4.5
+}

processing_minicpmo.py ADDED Viewed

	@@ -0,0 +1,1665 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2026 The OpenBMB Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import math
+import re
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor
+from transformers.audio_utils import spectrogram
+from transformers.audio_utils import window_function
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_processing_utils import BatchFeature
+from transformers.image_transforms import to_channel_dimension_format
+from transformers.image_utils import ChannelDimension
+from transformers.image_utils import ImageInput
+from transformers.image_utils import infer_channel_dimension_format
+from transformers.image_utils import is_torch_tensor
+from transformers.image_utils import to_numpy_array
+from transformers.image_utils import valid_images
+from transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PreTokenizedInput
+from transformers.tokenization_utils_base import TextInput
+from transformers.utils import is_torch_device
+from transformers.utils import is_torch_dtype
+from transformers.utils import requires_backends
+from transformers.utils import TensorType
+def recursive_converter(converter, value):
+    if isinstance(value, list):
+        new_value = []
+        for v in value:
+            new_value += [recursive_converter(converter, v)]
+        return new_value
+    else:
+        return converter(value)
+class MiniCPMOBatchFeature(BatchFeature):
+    """Extend from BatchFeature for supporting various image size"""
+    def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
+        super().__init__(data)
+        self.convert_to_tensors(tensor_type=tensor_type)
+    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
+        if tensor_type is None:
+            return self
+        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)
+        def converter(value):
+            try:
+                if not is_tensor(value):
+                    tensor = as_tensor(value)
+                    return tensor
+            except:  # noqa E722
+                if key == "overflowing_values":
+                    raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
+                raise ValueError(
+                    "Unable to create tensor, you should probably activate padding "
+                    "with 'padding=True' to have batched tensors with the same length."
+                )
+        for key, value in self.items():
+            self[key] = recursive_converter(converter, value)
+        return self
+    def to(self, *args, **kwargs) -> "MiniCPMOBatchFeature":
+        requires_backends(self, ["torch"])
+        import torch
+        def cast_tensor(v):
+            if not torch.is_tensor(v):
+                return v
+            if torch.is_floating_point(v):
+                return v.to(*args, **kwargs)
+            elif device is not None:
+                return v.to(device=device)
+            else:
+                return v
+        new_data = {}
+        device = kwargs.get("device")
+        if device is None and len(args) > 0:
+            arg = args[0]
+            if is_torch_dtype(arg):
+                pass
+            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
+                device = arg
+            else:
+                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
+        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
+        for k, v in self.items():
+            new_data[k] = recursive_converter(cast_tensor, v)
+        self.data = new_data
+        return self
+class MiniCPMVImageProcessor(BaseImageProcessor):
+    model_input_names = ["pixel_values"]
+    def __init__(self, max_slice_nums=9, scale_resolution=448, patch_size=14, **kwargs):
+        super().__init__(**kwargs)
+        self.max_slice_nums = max_slice_nums
+        self.scale_resolution = scale_resolution
+        self.patch_size = patch_size
+        self.use_image_id = kwargs.pop("use_image_id", False)
+        self.image_feature_size = kwargs.pop("image_feature_size", 64)
+        self.im_start_token = kwargs.pop("im_start", "<image>")
+        self.im_end_token = kwargs.pop("im_end", "</image>")
+        self.slice_start_token = kwargs.pop("slice_start", "<slice>")
+        self.slice_end_token = kwargs.pop("slice_end", "</slice>")
+        self.unk_token = kwargs.pop("unk", "<unk>")
+        self.im_id_start = kwargs.pop("im_id_start", "<image_id>")
+        self.im_id_end = kwargs.pop("im_id_end", "</image_id>")
+        self.slice_mode = kwargs.pop("slice_mode", True)
+        self.mean = np.array(kwargs.pop("norm_mean", [0.5, 0.5, 0.5]))
+        self.std = np.array(kwargs.pop("norm_std", [0.5, 0.5, 0.5]))
+        self.version = kwargs.pop("version", 2.0)
+    @staticmethod
+    def ensure_divide(length, patch_size):
+        return max(round(length / patch_size) * patch_size, patch_size)
+    def find_best_resize(self, original_size, scale_resolution, patch_size, allow_upscale=False):
+        width, height = original_size
+        if (width * height > scale_resolution * scale_resolution) or allow_upscale:
+            r = width / height
+            height = int(scale_resolution / math.sqrt(r))
+            width = int(height * r)
+        best_width = self.ensure_divide(width, patch_size)
+        best_height = self.ensure_divide(height, patch_size)
+        return best_width, best_height
+    def get_refine_size(self, original_size, grid, scale_resolution, patch_size, allow_upscale=False):
+        width, height = original_size
+        grid_x, grid_y = grid
+        refine_width = self.ensure_divide(width, grid_x)
+        refine_height = self.ensure_divide(height, grid_y)
+        grid_width = refine_width / grid_x
+        grid_height = refine_height / grid_y
+        best_grid_size = self.find_best_resize(
+            (grid_width, grid_height), scale_resolution, patch_size, allow_upscale=allow_upscale
+        )
+        refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)
+        return refine_size
+    @staticmethod
+    def split_to_patches(image, grid):
+        patches = []
+        width, height = image.size
+        grid_x = int(width / grid[0])
+        grid_y = int(height / grid[1])
+        for i in range(0, height, grid_y):
+            images = []
+            for j in range(0, width, grid_x):
+                box = (j, i, j + grid_x, i + grid_y)
+                patch = image.crop(box)
+                images.append(patch)
+            patches.append(images)
+        return patches
+    def slice_image(self, image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
+        original_size = image.size
+        source_image = None
+        best_grid = self.get_sliced_grid(original_size, max_slice_nums, never_split)
+        patches = []
+        if best_grid is None:
+            # dont need to slice, upsample
+            best_size = self.find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=True)
+            source_image = image.resize(best_size, resample=Image.Resampling.BICUBIC)
+        else:
+            # source image, down-sampling and ensure divided by patch_size
+            best_resize = self.find_best_resize(original_size, scale_resolution, patch_size)
+            source_image = image.copy().resize(best_resize, resample=Image.Resampling.BICUBIC)
+            refine_size = self.get_refine_size(
+                original_size, best_grid, scale_resolution, patch_size, allow_upscale=True
+            )
+            refine_image = image.resize(refine_size, resample=Image.Resampling.BICUBIC)
+            patches = self.split_to_patches(refine_image, best_grid)
+        return source_image, patches, best_grid
+    def get_grid_placeholder(self, grid):
+        if grid is None:
+            return ""
+        slice_image_placeholder = (
+            self.slice_start_token + self.unk_token * self.image_feature_size + self.slice_end_token
+        )
+        cols = grid[0]
+        rows = grid[1]
+        slices = []
+        for i in range(rows):
+            lines = []
+            for j in range(cols):
+                lines.append(slice_image_placeholder)
+            slices.append("".join(lines))
+        slice_placeholder = "\n".join(slices)
+        return slice_placeholder
+    def get_image_id_placeholder(self, idx=0):
+        return f"{self.im_id_start}{idx}{self.im_id_end}"
+    def get_sliced_images(self, image, max_slice_nums=None):
+        slice_images = []
+        if not self.slice_mode:
+            return [image]
+        max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
+        assert max_slice_nums > 0
+        source_image, patches, sliced_grid = self.slice_image(
+            image, max_slice_nums, self.scale_resolution, self.patch_size  # default: 9  # default: 448  # default: 14
+        )
+        slice_images.append(source_image)
+        if len(patches) > 0:
+            for i in range(len(patches)):
+                for j in range(len(patches[0])):
+                    slice_images.append(patches[i][j])
+        return slice_images
+    def get_sliced_grid(self, image_size, max_slice_nums, nerver_split=False):
+        original_width, original_height = image_size
+        log_ratio = math.log(original_width / original_height)
+        ratio = original_width * original_height / (self.scale_resolution * self.scale_resolution)
+        multiple = min(math.ceil(ratio), max_slice_nums)
+        if multiple <= 1 or nerver_split:
+            return None
+        candidate_split_grids_nums = []
+        for i in [multiple - 1, multiple, multiple + 1]:
+            if i == 1 or i > max_slice_nums:
+                continue
+            candidate_split_grids_nums.append(i)
+        candidate_grids = []
+        for split_grids_nums in candidate_split_grids_nums:
+            m = 1
+            while m <= split_grids_nums:
+                if split_grids_nums % m == 0:
+                    candidate_grids.append([m, split_grids_nums // m])
+                m += 1
+        best_grid = [1, 1]
+        min_error = float("inf")
+        for grid in candidate_grids:
+            error = abs(log_ratio - math.log(grid[0] / grid[1]))
+            if error < min_error:
+                best_grid = grid
+                min_error = error
+        return best_grid
+    def get_slice_image_placeholder(self, image_size, image_idx=0, max_slice_nums=None, use_image_id=None):
+        max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
+        assert max_slice_nums > 0
+        grid = self.get_sliced_grid(image_size=image_size, max_slice_nums=max_slice_nums)
+        image_placeholder = self.im_start_token + self.unk_token * self.image_feature_size + self.im_end_token
+        use_image_id = self.use_image_id if use_image_id is None else bool(use_image_id)
+        if use_image_id:
+            final_placeholder = self.get_image_id_placeholder(image_idx) + image_placeholder
+        else:
+            final_placeholder = image_placeholder
+        if self.slice_mode:
+            final_placeholder = final_placeholder + self.get_grid_placeholder(grid=grid)
+        return final_placeholder
+    @staticmethod
+    def to_pil_image(image, rescale=None) -> Image.Image:
+        """Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back
+        as the last axis if needed.
+        Args:
+            image (`Image.Image` or `numpy.ndarray` or `torch.Tensor`):
+                The image to convert to the PIL Image format.
+            rescale (`bool`, *optional*):
+                whether to apply the scaling factor (to make pixel values integers between 0 and 255). Will
+                default to `True` if the image type is a floating type, `False` otherwise.
+        """
+        if isinstance(image, Image.Image):
+            return image
+        if is_torch_tensor(image):
+            image = image.numpy()
+        if isinstance(image, np.ndarray):
+            if rescale is None:
+                # rescale default to the array being of floating type.
+                rescale = isinstance(image.flat[0], np.floating)
+            # If the channel as been moved to first dim, we put it back at the end.
+            if image.ndim == 3 and image.shape[0] in [1, 3]:
+                image = image.transpose(1, 2, 0)
+            if rescale:
+                image = image * 255
+            image = image.astype(np.uint8)
+            return Image.fromarray(image)
+        return image
+    def reshape_by_patch(self, image):
+        image = torch.from_numpy(image)
+        patch_size = self.patch_size
+        patches = torch.nn.functional.unfold(image, (patch_size, patch_size), stride=(patch_size, patch_size))
+        patches = patches.reshape(image.size(0), patch_size, patch_size, -1)
+        patches = patches.permute(0, 1, 3, 2).reshape(image.size(0), patch_size, -1)
+        return patches.numpy()
+    def preprocess(
+        self,
+        images: Union[Image.Image, List[Image.Image], List[List[Image.Image]]],
+        do_pad: Optional[bool] = True,
+        max_slice_nums: int = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> MiniCPMOBatchFeature:
+        if isinstance(images, Image.Image):
+            images_list = [[images]]
+        elif isinstance(images[0], Image.Image):
+            images_list = [images]
+        else:
+            images_list = images
+        new_images_list = []
+        image_sizes_list = []
+        tgt_sizes_list = []
+        for _images in images_list:
+            if _images is None or len(_images) == 0:
+                new_images_list.append([])
+                image_sizes_list.append([])
+                tgt_sizes_list.append([])
+                continue
+            if not valid_images(_images):
+                raise ValueError(
+                    "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                    "torch.Tensor, tf.Tensor or jax.ndarray."
+                )
+            _images = [self.to_pil_image(image).convert("RGB") for image in _images]
+            input_data_format = infer_channel_dimension_format(np.array(_images[0]))
+            new_images = []
+            image_sizes = [image.size for image in _images]
+            tgt_sizes = []
+            for image in _images:
+                image_patches = self.get_sliced_images(image, max_slice_nums)
+                image_patches = [to_numpy_array(image).astype(np.float32) / 255 for image in image_patches]
+                image_patches = [
+                    self.normalize(image=image, mean=self.mean, std=self.std, input_data_format=input_data_format)
+                    for image in image_patches
+                ]
+                image_patches = [
+                    to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
+                    for image in image_patches
+                ]
+                for slice_image in image_patches:
+                    new_images.append(self.reshape_by_patch(slice_image))
+                    tgt_sizes.append(
+                        np.array((slice_image.shape[1] // self.patch_size, slice_image.shape[2] // self.patch_size))
+                    )
+            if tgt_sizes:
+                tgt_sizes = np.vstack(tgt_sizes)
+            new_images_list.append(new_images)
+            image_sizes_list.append(image_sizes)
+            tgt_sizes_list.append(tgt_sizes)
+        return MiniCPMOBatchFeature(
+            data={"pixel_values": new_images_list, "image_sizes": image_sizes_list, "tgt_sizes": tgt_sizes_list},
+            tensor_type=return_tensors,
+        )
+AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
+def chunk_audio(audio: np.ndarray, max_duration_seconds: int = 30, sample_rate: int = 16000) -> List[np.ndarray]:
+    """split long audio into chunks
+    Args:
+        audio:
+        max_duration_seconds:
+        sample_rate:
+    Returns:
+        chunks
+    """
+    max_len = int(max_duration_seconds * sample_rate)
+    if len(audio) <= max_len:
+        return [audio]
+    chunks = []
+    for i in range(0, len(audio), max_len):
+        chunk = audio[i : i + max_len]
+        chunks.append(chunk)
+    return chunks
+def process_audio_batch(
+    audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
+    feature_extractor,
+    sampling_rate: int = 16000,
+    max_duration_seconds: int = 30,
+    return_attention_mask: bool = True,
+) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+    """extract audio mel features
+    Args:
+        audios:
+        feature_extractor: WhisperFeatureExtractor
+        sampling_rate:
+        max_duration_seconds:
+        return_attention_mask:
+    Returns:
+        (audio_features, audio_feature_lens)
+        audio_features: [batch_size, n_mels, max_frames]
+        audio_feature_lens:
+    """
+    if isinstance(audios, np.ndarray):
+        audios_list = [[audios]]
+    elif len(audios) > 0 and isinstance(audios[0], np.ndarray):
+        audios_list = [audios]
+    else:
+        audios_list = audios
+    audio_features_all = []
+    audio_feature_lens_list = []
+    for batch_audios in audios_list:
+        batch_lens = []
+        for audio in batch_audios:
+            chunks = chunk_audio(audio, max_duration_seconds, sampling_rate)
+            for chunk in chunks:
+                audio_input = feature_extractor(
+                    chunk,
+                    sampling_rate=sampling_rate,
+                    return_tensors="pt",
+                    padding="max_length",
+                    return_attention_mask=return_attention_mask,
+                )
+                audio_feature = audio_input["input_features"]  # [1, 80, frames]
+                if return_attention_mask:
+                    actual_len = audio_input["attention_mask"].sum(dim=1)  # Tensor([frames])
+                    audio_feature = audio_feature[:, :, : actual_len[0]]
+                    batch_lens.append(actual_len[0])
+                else:
+                    batch_lens.append(torch.tensor(audio_feature.shape[2]))
+                audio_features_all.append(audio_feature.squeeze(0))  # [80, frames]
+        if len(batch_lens) > 0:
+            audio_feature_lens_list.append(torch.hstack(batch_lens))
+        else:
+            audio_feature_lens_list.append(torch.tensor([]))
+    # pad to same length
+    if audio_features_all:
+        audio_features = torch.nn.utils.rnn.pad_sequence(
+            [feat.transpose(0, 1) for feat in audio_features_all], batch_first=True, padding_value=0.0
+        ).transpose(
+            1, 2
+        )  # [batch, 80, max_frames]
+    else:
+        audio_features = torch.tensor([])
+    return audio_features, audio_feature_lens_list
+def regroup_audio_features(
+    audio_features: torch.Tensor, audio_feature_lens: List[torch.Tensor], regroup_seconds: int, fps: int = 100
+) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+    """regroup audio features to fixed duration
+    Args:
+        audio_features: [batch, n_mels, frames]
+        audio_feature_lens: each batch's actual length
+        regroup_seconds: regroup duration (seconds)
+        fps: frames per second
+    Returns:
+        (regrouped_features, regrouped_lens)
+    """
+    # flatten to continuous frames sequence
+    all_lens = []
+    for lens in audio_feature_lens:
+        if isinstance(lens, torch.Tensor):
+            all_lens.extend(lens.tolist())
+        elif isinstance(lens, list):
+            all_lens.extend([int(x) for x in lens])
+    if len(all_lens) == 0:
+        return torch.tensor([]), []
+    # concatenate all valid features
+    flat_slices = [audio_features[i, :, :L] for i, L in enumerate(all_lens)]  # [n_mels, L]
+    if len(flat_slices) == 1:
+        full_feat = flat_slices[0]
+    else:
+        full_feat = torch.cat(flat_slices, dim=1)  # [n_mels, total_frames]
+    # split to fixed frames
+    frames_per_seg = int(regroup_seconds * fps)
+    segments = []
+    for start in range(0, full_feat.size(1), frames_per_seg):
+        seg = full_feat[:, start : start + frames_per_seg]
+        if seg.size(1) > 0:
+            segments.append(seg)
+    if len(segments) == 0:
+        return torch.tensor([]), []
+    # pad and convert to batch
+    seg_lens = [s.size(1) for s in segments]
+    segs_transposed = [s.transpose(0, 1) for s in segments]
+    padded = torch.nn.utils.rnn.pad_sequence(segs_transposed, batch_first=True, padding_value=0.0)  # [N, max_T, n_mels]
+    padded = padded.transpose(1, 2)  # [N, n_mels, max_T]
+    lens_tensor = torch.tensor(seg_lens, dtype=torch.int32, device=padded.device)
+    return padded, [lens_tensor]
+class MiniCPMAAudioProcessor(WhisperFeatureExtractor):
+    """
+    Whisper log-mel feature extractor with a configurable lower-limit clipping.
+    On top of WhisperFeatureExtractor:
+    - support dynamic_log_norm (original max-8dB, adjustable dynamic_range_db)
+    - or fixed log_floor_db (e.g. -10dB)
+        - the fixed floor exists for the streaming scheme, in which a data-dependent (dynamic) setting can't be used
+        - the mode can be switched in the middle, through set_spac_log_norm
+    Two paths (torch / numpy) keep consistent clipping and scaling order:
+        log10 -> (dynamic/fixed lower limit clipping) -> (+4)/4
+    """
+    def __init__(
+        self,
+        *args,
+        dynamic_log_norm: bool = True,
+        dynamic_range_db: float = 8.0,
+        log_floor_db: float = -10.0,
+        **kwargs,
+    ):
+        """Build the extractor; *args / **kwargs are forwarded unchanged to WhisperFeatureExtractor.
+        Args:
+            dynamic_log_norm: True clips the log10 spectrogram at (max - dynamic_range_db);
+                False clips it at the fixed value log_floor_db.
+            dynamic_range_db: distance below the spectrogram maximum used as threshold in dynamic mode.
+            log_floor_db: fixed lower limit used when dynamic_log_norm is False.
+        """
+        super().__init__(*args, **kwargs)
+        # coerce to plain Python types so later comparisons/arithmetic behave predictably
+        self.dynamic_log_norm = bool(dynamic_log_norm)
+        self.dynamic_range_db = float(dynamic_range_db)
+        self.log_floor_db = float(log_floor_db)
+    def set_spac_log_norm(
+        self,
+        dynamic_range_db: Optional[float] = None,
+        log_floor_db: Optional[float] = None,
+        *,
+        inplace: bool = True,
+    ) -> "MiniCPMAAudioProcessor":
+        """Hot update dynamic/fixed lower limit strategy.
+        The mode is selected implicitly by which argument is given; there is no separate on/off flag.
+        Args:
+            dynamic_range_db: dynamic range; when given, switches to dynamic mode (threshold = max - dynamic_range_db)
+                    and stores the value. None means keep unchanged.
+            log_floor_db: fixed log floor (usually <= 0); when given, switches to fixed lower limit mode
+                    and stores the value. None means keep unchanged.
+            inplace: True directly modify current instance; False return a shallow copy and modify on it.
+        Returns:
+            self or new instance (when inplace=False).
+        Raises:
+            ValueError: if dynamic_range_db < 0 or log_floor_db > 0.
+        Note:
+            If both values are passed, both are stored but log_floor_db is handled last,
+            so the instance ends up in fixed lower limit mode.
+        """
+        # copy.copy is shallow: the copy shares e.g. the mel filter array with the original
+        target = self if inplace else copy.copy(self)
+        if dynamic_range_db is not None:
+            val = float(dynamic_range_db)
+            if val < 0:
+                raise ValueError("dynamic_range_db must be >= 0.")
+            target.dynamic_log_norm = True  # explicitly set the value to dynamic mode
+            target.dynamic_range_db = val
+        if log_floor_db is not None:
+            val = float(log_floor_db)
+            # usually log10(mel) maximum is not more than ~0dB, floor should be <= 0; here do loose validation
+            if val > 0:
+                raise ValueError("log_floor_db should be <= 0 (log10 scale).")
+            target.dynamic_log_norm = False  # explicitly set the value to fixed lower limit mode
+            target.log_floor_db = val
+        return target
+    def _np_extract_fbank_features(self, waveform_batch: np.ndarray, device: str) -> np.ndarray:
+        """NumPy version consistent with upstream, but replace max-8dB with configurable dynamic/fixed lower limit clipping.
+        Args:
+            waveform_batch: iterable of waveforms; each one is processed independently.
+            device: must be "cpu" for this NumPy path.
+        Returns:
+            Array stacking one scaled log-mel spectrogram per input waveform.
+        Raises:
+            ValueError: if device is not "cpu".
+        """
+        if device != "cpu":
+            raise ValueError(
+                f"Got device `{device}` for feature extraction, but feature extraction on CUDA accelerator "
+                "devices requires torch. Set device='cpu' or install torch."
+            )
+        log_spec_batch: List[np.ndarray] = []
+        for waveform in waveform_batch:
+            # generate log10 Mel
+            log_spec = spectrogram(
+                waveform,
+                window_function(self.n_fft, "hann"),
+                frame_length=self.n_fft,
+                hop_length=self.hop_length,
+                power=2.0,
+                dither=self.dither,
+                mel_filters=self.mel_filters,
+                log_mel="log10",
+            )
+            # consistent with upstream: remove the last frame
+            log_spec = log_spec[:, :-1]
+            # dynamic/fixed clipping (the dynamic threshold is computed per waveform, not per batch)
+            if self.dynamic_log_norm:
+                threshold = log_spec.max() - self.dynamic_range_db
+                log_spec = np.maximum(log_spec, threshold)
+            else:
+                log_spec = np.maximum(log_spec, self.log_floor_db)
+            # consistent with Whisper linear scaling
+            log_spec = (log_spec + 4.0) / 4.0
+            log_spec_batch.append(log_spec)
+        return np.array(log_spec_batch)
+    def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
+        """Torch version of the log-mel extraction with the same clipping and scaling order as the NumPy path.
+        Args:
+            waveform: NumPy waveform(s); a 2-D input is treated as a batch for the dynamic threshold.
+            device: torch device used for the STFT and mel projection.
+        Returns:
+            Scaled log-mel spectrogram as a NumPy array (moved back to CPU when needed).
+        Raises:
+            RuntimeError: if the module-level name `torch` is None.
+        """
+        if torch is None:
+            raise RuntimeError("PyTorch is not installed, cannot compute STFT on GPU.")
+        waveform = torch.from_numpy(waveform).to(device, torch.float32)
+        window = torch.hann_window(self.n_fft, device=device)
+        # optional dithering: add Gaussian noise scaled by self.dither before the STFT
+        if self.dither != 0.0:
+            waveform = waveform + self.dither * torch.randn_like(waveform)
+        stft = torch.stft(waveform, n_fft=self.n_fft, hop_length=self.hop_length, window=window, return_complex=True)
+        # drop the last frame and take the power spectrum
+        magnitudes = stft[..., :-1].abs() ** 2
+        mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)  # presumably [1+n_fft//2, n_mels], transposed below — TODO confirm
+        mel_spec = mel_filters.T @ magnitudes  # [..., n_mels, T]
+        log_spec = torch.clamp(mel_spec, min=1e-10).log10()  # >= -10 because of the 1e-10 clamp
+        if self.dynamic_log_norm:
+            if waveform.dim() == 2:
+                # batched input: one threshold per sample (max over time, then over mel bins)
+                max_val_t = log_spec.max(dim=2, keepdim=True)[0]  # over T
+                max_val_bt = max_val_t.max(dim=1, keepdim=True)[0]  # over mel
+                threshold = max_val_bt - self.dynamic_range_db
+                log_spec = torch.maximum(log_spec, threshold)
+            else:
+                # unbatched input: a single global threshold
+                threshold = log_spec.max() - self.dynamic_range_db
+                log_spec = torch.maximum(log_spec, threshold)
+        else:
+            # fixed lower limit; torch.maximum needs a tensor operand
+            floor_tensor = torch.tensor(self.log_floor_db, dtype=log_spec.dtype, device=log_spec.device)
+            log_spec = torch.maximum(log_spec, floor_tensor)
+        log_spec = (log_spec + 4.0) / 4.0
+        if device != "cpu":
+            log_spec = log_spec.detach().cpu()
+        return log_spec.numpy()
+    def process(self, *args, **kwargs):
+        """Alias of __call__ for convenience."""
+        return self.__call__(*args, **kwargs)
+class StreamingMelProcessorExact:
+    """Strictly offline equivalent streaming Mel processor.
+    - accumulate all historical audio into buffer; use the same feature_extractor to calculate the entire mel after each addition.
+    - only output "stable" frames: the frame center does not depend on future (right) context, i.e. center + n_fft//2 <= current buffer length.
+    - output the last batch of frames at the end (flush), ensuring complete consistency with offline full-calculation.
+    Cost: Each call performs feature extraction on the accumulated buffer (can be optimized to incremental if needed).
+    """
+    def __init__(
+        self,
+        feature_extractor: MiniCPMAAudioProcessor,
+        chunk_ms: int = 100,
+        first_chunk_ms: Optional[int] = None,
+        sample_rate: int = 16000,
+        n_fft: int = 400,
+        hop_length: int = 160,
+        n_mels: int = 80,
+        cnn_redundancy_ms: int = 10,  # (given in ms, usually 10ms=1 frame)
+        # sliding window parameters
+        enable_sliding_window: bool = False,  # whether to enable sliding window
+        slide_trigger_seconds: float = 30.0,  # trigger threshold for sliding window in seconds
+        slide_stride_seconds: float = 10.0,  # stride for sliding window in seconds
+    ):
+        self.feature_extractor = feature_extractor
+        self.chunk_ms = chunk_ms
+        self.first_chunk_ms = first_chunk_ms if first_chunk_ms is not None else chunk_ms
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.n_mels = n_mels
+        self.chunk_samples = int(round(chunk_ms * sample_rate / 1000))
+        self.chunk_frames = self.chunk_samples // hop_length
+        # align to hop_length to avoid frame boundary issues
+        hop = self.hop_length
+        raw_first_samples = int(round(self.first_chunk_ms * sample_rate / 1000))
+        aligned_first = max(hop, (raw_first_samples // hop) * hop)
+        self.first_chunk_samples = aligned_first
+        self.half_window = n_fft // 2  # required right context
+        # redundancy frames (in frames), <=1 frame: 10ms → 1 frame
+        self.cnn_redundancy_ms = cnn_redundancy_ms
+        self.cnn_redundancy_samples = int(cnn_redundancy_ms * sample_rate / 1000)
+        self.cnn_redundancy_frames = max(0, self.cnn_redundancy_samples // hop_length)
+        # sliding window configuration (Trigger mode)
+        self.enable_sliding_window = enable_sliding_window
+        self.trigger_seconds = slide_trigger_seconds
+        self.slide_seconds = slide_stride_seconds
+        # shift/base (global frame coordinates)
+        self.left_samples_dropped = 0  # samples dropped from the left
+        self.base_T = 0  # index of the "global frame" corresponding to mel_full[:, :, 0]
+        self.reset()
+    def reset(self):
+        self.buffer = np.zeros(0, dtype=np.float32)
+        self.last_emitted_T = 0
+        self.total_samples_processed = 0
+        self.chunk_count = 0
+        self.is_first = True
+        self.left_samples_dropped = 0
+        self.base_T = 0
+    def get_chunk_size(self) -> int:
+        return self.first_chunk_samples if self.is_first else self.chunk_samples
+    def get_expected_output_frames(self) -> int:
+        raise NotImplementedError("get_expected_output_frames is not implemented")
+    def _extract_full(self) -> torch.Tensor:
+        # when buffer length is less than n_fft, Whisper's internal STFT will raise an error in center=True and pad mode
+        # (pad is greater than input length). At this time, there is no stable frame to output, so return empty features directly.
+        if len(self.buffer) < self.n_fft:
+            raise ValueError(f"buffer length is shorter than n_fft {len(self.buffer)} < {self.n_fft}")
+        # if buffer length is less than 5s, use set_spac_log_norm(log_floor_db=-10) or the last cached result
+        if len(self.buffer) < 5 * self.sample_rate:
+            # TODO: here the best is to do some experiments to choose the best one, now this is selected through experience, can see MiniCPMAAudioProcessor's main implementation
+            self.feature_extractor.set_spac_log_norm(log_floor_db=-10)
+        # if buffer length is greater than 5s, use set_spac_log_norm(dynamic_range_db=8)
+        else:
+            self.feature_extractor.set_spac_log_norm(dynamic_range_db=8)
+        feats = self.feature_extractor(
+            self.buffer,
+            sampling_rate=self.sample_rate,
+            return_tensors="pt",
+            padding=False,
+        )
+        return feats.input_features  # [1, 80, T]
+    def _stable_frames_count(self) -> int:
+        # number of stable frames = floor((len(buffer) - half_window) / hop) + 1, minimum is 0
+        L = int(self.buffer.shape[0])
+        if L <= 0:
+            return 0
+        if L < self.half_window:
+            return 0
+        return max(0, (L - self.half_window) // self.hop_length + 1)
+    def _maybe_slide_buffer(self):
+        """Trigger mode sliding window: when the buffer reaches the trigger threshold, slide a fixed length window."""
+        if not self.enable_sliding_window:
+            return
+        sr = self.sample_rate
+        hop = self.hop_length
+        L = len(self.buffer)
+        # convert seconds to samples
+        trigger_samples = int(self.trigger_seconds * sr)
+        stride_samples = int(self.slide_seconds * sr)
+        # check if the trigger threshold is reached
+        if L < trigger_samples:
+            return
+        # calculate the number of samples to drop (fixed sliding stride_samples)
+        drop = stride_samples
+        # cannot drop the left context that is still needed for subsequent emission
+        # in trigger mode, we only need to protect the minimum necessary data
+        # i.e. ensure that we do not discard frames that may be needed in the future
+        last_emitted_local = self.last_emitted_T - self.base_T
+        # only protect necessary context (e.g. the most recent 1 second data)
+        min_keep_seconds = 1.0  # keep at least 1 second of data to ensure continuity
+        min_keep_samples = int(min_keep_seconds * sr)
+        # guard_samples are the minimum samples we must keep
+        guard_samples = min(min_keep_samples, L - drop)
+        # limit: do not exceed the safe boundary; and align hop
+        max_allowed_drop = max(0, L - guard_samples)
+        drop = min(drop, max_allowed_drop)
+        drop = (drop // hop) * hop
+        if drop <= 0:
+            return
+        # truly drop & update base
+        self.buffer = self.buffer[drop:]
+        self.left_samples_dropped += drop
+        self.base_T += drop // hop
+    def process(self, audio_chunk: np.ndarray, is_last_chunk: bool = False) -> Tuple[torch.Tensor, Dict]:
+        self.chunk_count += 1
+        # append to buffer
+        if len(self.buffer) == 0:
+            self.buffer = audio_chunk.astype(np.float32, copy=True)
+        else:
+            self.buffer = np.concatenate([self.buffer, audio_chunk.astype(np.float32, copy=True)])
+        # sliding window processing
+        self._maybe_slide_buffer()
+        # full extraction (for the current window)
+        mel_full = self._extract_full()
+        T_full = mel_full.shape[-1]  # local frames in the current window
+        stable_T = min(T_full, self._stable_frames_count())  # local stable frames
+        stable_T_global = self.base_T + stable_T  # map to global frame coordinates
+        # plan the core frames for the current emission (global coordinates)
+        core_start_g = self.last_emitted_T
+        core_end_g = core_start_g + self.chunk_frames
+        required_stable_g = core_end_g + self.cnn_redundancy_frames
+        if stable_T_global >= required_stable_g or is_last_chunk:
+            emit_start_g = max(0, core_start_g - self.cnn_redundancy_frames)
+            emit_end_g = core_end_g + self.cnn_redundancy_frames
+            # global -> local index
+            emit_start = max(0, emit_start_g - self.base_T)
+            emit_end = emit_end_g - self.base_T
+            emit_start = max(0, min(emit_start, T_full))
+            emit_end = max(emit_start, min(emit_end, T_full))
+            mel_output = mel_full[:, :, emit_start:emit_end]
+            self.last_emitted_T = core_end_g  # only advance the core frame pointer (global)
+        else:
+            mel_output = mel_full[:, :, 0:0]
+        self.total_samples_processed += len(audio_chunk)
+        self.is_first = False
+        info = {
+            "type": "exact_chunk",
+            "chunk_number": self.chunk_count,
+            "emitted_frames": mel_output.shape[-1],
+            "stable_T": stable_T,
+            "T_full": T_full,
+            "base_T": self.base_T,
+            "stable_T_global": stable_T_global,
+            "buffer_len_samples": int(self.buffer.shape[0]),
+            "left_samples_dropped": self.left_samples_dropped,
+            "core_start": core_start_g,  # if keep the original field name, use the global value here
+            "core_end": core_end_g,  # same as above
+        }
+        return mel_output, info
+    def flush(self) -> torch.Tensor:
+        """Called when the stream ends, output the remaining unemitted frames, ensuring consistency with offline (calculated by global coordinates)."""
+        if len(self.buffer) == 0:
+            return torch.zeros(1, 80, 0)
+        mel_full = self._extract_full()
+        T_local = mel_full.shape[-1]
+        T_global = self.base_T + T_local
+        if self.last_emitted_T < T_global:
+            start_l = max(0, self.last_emitted_T - self.base_T)
+            tail = mel_full[:, :, start_l:]
+            self.last_emitted_T = T_global
+            return tail
+        return mel_full[:, :, 0:0]
+    def get_config(self) -> Dict:
+        return {
+            "chunk_ms": self.chunk_ms,
+            "first_chunk_ms": self.first_chunk_ms,
+            "effective_first_chunk_ms": self.first_chunk_samples / self.sample_rate * 1000.0,
+            "sample_rate": self.sample_rate,
+            "n_fft": self.n_fft,
+            "hop_length": self.hop_length,
+            "cnn_redundancy_ms": self.cnn_redundancy_ms,
+            "cnn_redundancy_frames": self.cnn_redundancy_frames,
+            "enable_sliding_window": self.enable_sliding_window,
+            "trigger_seconds": self.trigger_seconds,
+            "slide_seconds": self.slide_seconds,
+        }
+    def get_state(self) -> Dict:
+        return {
+            "chunk_count": self.chunk_count,
+            "last_emitted_T": self.last_emitted_T,
+            "total_samples_processed": self.total_samples_processed,
+            "buffer_len": int(self.buffer.shape[0]),
+            "base_T": self.base_T,
+            "left_samples_dropped": self.left_samples_dropped,
+        }
+    def get_snapshot(self) -> Dict:
+        """Get a complete state snapshot (including buffer), used for recovery from a fast start.
+        Returns:
+            A dictionary containing the complete state, which can be used to restore the snapshot
+        """
+        buffer_copy = self.buffer.copy()
+        snapshot = {
+            "chunk_count": self.chunk_count,
+            "last_emitted_T": self.last_emitted_T,
+            "total_samples_processed": self.total_samples_processed,
+            "buffer": buffer_copy,
+            "base_T": self.base_T,
+            "left_samples_dropped": self.left_samples_dropped,
+            "is_first": self.is_first,
+            # save the state of the feature_extractor (key: ensure determinism of mel feature extraction)
+            "fe_dynamic_log_norm": getattr(self.feature_extractor, "dynamic_log_norm", None),
+            "fe_dynamic_range_db": getattr(self.feature_extractor, "dynamic_range_db", None),
+            "fe_log_floor_db": getattr(self.feature_extractor, "log_floor_db", None),
+        }
+        return snapshot
+    def restore_snapshot(self, snapshot: Dict) -> None:
+        """Restore state from a snapshot
+        Args:
+            snapshot: the snapshot dictionary returned by get_snapshot
+        """
+        # record the state before restoration
+        prev_state = {
+            "chunk_count": self.chunk_count,
+            "last_emitted_T": self.last_emitted_T,
+            "buffer_len": len(self.buffer),
+        }
+        # restore state
+        self.chunk_count = snapshot["chunk_count"]
+        self.last_emitted_T = snapshot["last_emitted_T"]
+        self.total_samples_processed = snapshot["total_samples_processed"]
+        self.buffer = snapshot["buffer"].copy()  # copy buffer
+        self.base_T = snapshot["base_T"]
+        self.left_samples_dropped = snapshot["left_samples_dropped"]
+        self.is_first = snapshot["is_first"]
+        # restore the state of the feature_extractor (key: ensure determinism of mel feature extraction)
+        if snapshot.get("fe_dynamic_log_norm") is not None:
+            self.feature_extractor.dynamic_log_norm = snapshot["fe_dynamic_log_norm"]
+        if snapshot.get("fe_dynamic_range_db") is not None:
+            self.feature_extractor.dynamic_range_db = snapshot["fe_dynamic_range_db"]
+        if snapshot.get("fe_log_floor_db") is not None:
+            self.feature_extractor.log_floor_db = snapshot["fe_log_floor_db"]
+class MiniCPMOProcessor(ProcessorMixin):
+    """Processor bundling an image processor, an audio processor and a tokenizer."""
+    # names of the sub-processors handled by this class
+    # (presumably consumed by ProcessorMixin for loading/saving — confirm against transformers)
+    attributes = ["image_processor", "audio_processor", "tokenizer"]
+    # class names used to resolve each sub-processor
+    audio_processor_class = "AutoFeatureExtractor"
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(self, image_processor=None, audio_processor=None, tokenizer=None, **kwargs):
+        super().__init__(image_processor, audio_processor, tokenizer)
+        self.version = image_processor.version if image_processor else None
+        # audio feature pooling step, needs to be consistent with config.audio_pool_step
+        self.pool_step = kwargs.get("audio_pool_step", 5)
+        # initialize the streaming audio processor
+        self._streaming_mel_processor = None
+        if audio_processor is not None:
+            self._init_streaming_processor()
+    def get_audio_placeholder(
+        self,
+        audio_lens: int,
+        chunk_input: bool = True,
+        chunk_length: int = 1,
+    ) -> str:
+        """
+        Public method to get audio placeholder string for vLLM integration.
+        Args:
+            audio_lens: Length of audio in samples
+            chunk_input: Whether to use chunked processing
+            chunk_length: Chunk length in seconds
+        Returns:
+            Audio placeholder string
+        """
+        pool_step = self.pool_step
+        feature_lens = math.ceil(audio_lens / self.audio_processor.hop_length)
+        feature_lens = (feature_lens - 1) // 2 + 1
+        output_lens = (feature_lens - pool_step) // pool_step + 1
+        if chunk_input:
+            fbank_feat_in_chunk = int(chunk_length * 100)
+            cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
+            audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
+            num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
+            place_holders = ""
+            total_unk_len = 0
+            for _ in range(num_audio_chunks):
+                unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
+                place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
+                total_unk_len += unk_len
+            audio_placeholder = place_holders
+        else:
+            audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
+        return audio_placeholder
+    def _init_streaming_processor(
+        self,
+        chunk_ms: int = 100,
+        cnn_redundancy_ms: int = 0,
+        *,
+        mode: str = "exact",
+        first_chunk_ms: Optional[int] = None,
+        enable_sliding_window: bool = False,
+        slide_trigger_seconds: float = 30.0,
+        slide_stride_seconds: float = 10.0,
+    ):
+        """Initialize the streaming processor
+        Args:
+            chunk_ms: Chunk size in milliseconds, also the sliding step.
+            cnn_redundancy_ms: CNN boundary redundancy in milliseconds (before and after), 0 means standard mode.
+            mode: streaming processing mode, currently only supports "exact"
+            first_chunk_ms: the size of the first chunk (milliseconds), if not specified, it is the same as chunk_ms
+            enable_sliding_window: whether to enable sliding window (trigger mode)
+            slide_trigger_seconds: trigger threshold for sliding window in seconds
+            slide_stride_seconds: stride for sliding window in seconds
+        """
+        if mode == "exact":
+            self._streaming_mel_processor = StreamingMelProcessorExact(
+                feature_extractor=self.audio_processor,
+                chunk_ms=chunk_ms,
+                first_chunk_ms=first_chunk_ms,
+                sample_rate=16000,
+                cnn_redundancy_ms=cnn_redundancy_ms,
+                enable_sliding_window=enable_sliding_window,
+                slide_trigger_seconds=slide_trigger_seconds,
+                slide_stride_seconds=slide_stride_seconds,
+            )
+        else:
+            raise ValueError(f"Unsupported mode: {mode}, only 'exact' is supported")
+        self._streaming_mode = mode if mode in ["exact"] else ("exact")
+    def set_streaming_mode(
+        self,
+        mode: str = "exact",
+        chunk_ms: int = 100,
+        cnn_redundancy_ms: int = 0,
+        *,
+        first_chunk_ms: Optional[int] = None,
+        enable_sliding_window: bool = False,
+        slide_trigger_seconds: float = 30.0,
+        slide_stride_seconds: float = 10.0,
+    ):
+        """Set streaming processing mode
+        Args:
+            mode: streaming processing mode, currently only supports "exact"
+            chunk_ms: chunk size in milliseconds, also the sliding step.
+            cnn_redundancy_ms: CNN boundary redundancy in milliseconds (before and after), 0 means standard mode.
+            first_chunk_ms: the size of the first chunk (milliseconds), if not specified, it is the same as chunk_ms
+            enable_sliding_window: whether to enable sliding window (trigger mode)
+            slide_trigger_seconds: trigger threshold for sliding window in seconds
+            slide_stride_seconds: stride for sliding window in seconds
+        """
+        if self.audio_processor is None:
+            raise ValueError("audio_processor is not set, cannot initialize the streaming processor")
+        self._init_streaming_processor(
+            chunk_ms=chunk_ms,
+            cnn_redundancy_ms=cnn_redundancy_ms,
+            mode=mode,
+            first_chunk_ms=first_chunk_ms,
+            enable_sliding_window=enable_sliding_window,
+            slide_trigger_seconds=slide_trigger_seconds,
+            slide_stride_seconds=slide_stride_seconds,
+        )
+    def process_image(
+        self,
+        images: Optional[ImageInput] = None,
+        do_pad: bool = True,
+        max_slice_nums: int = 1,
+        return_tensors: str = "pt",
+    ) -> MiniCPMOBatchFeature:
+        """Process image data
+        Args:
+            images: input images
+            do_pad: whether to pad
+            max_slice_nums: maximum number of slices
+            return_tensors: return tensor type
+        Returns:
+            MiniCPMOBatchFeature object
+        """
+        if images is None:
+            return MiniCPMOBatchFeature(data={"pixel_values": [[]], "image_sizes": [[]], "tgt_sizes": [[]]})
+        result = self.image_processor(
+            images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
+        )
+        model_inputs = {
+            "pixel_values": result.get("pixel_values", [[]]),
+            "image_sizes": result.get("image_sizes", [[]]),
+            "tgt_sizes": result.get("tgt_sizes", [[]]),
+        }
+        return MiniCPMOBatchFeature(data=model_inputs)
+    def process_audio(
+        self,
+        audios: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
+        sampling_rate: int = 16000,
+        regroup_to_seconds: Optional[int] = None,
+        fps: int = 100,
+    ) -> MiniCPMOBatchFeature:
+        """Process audio data in batch
+        Args:
+            audios: audio data
+            sampling_rate: sampling rate
+            regroup_to_seconds: regroup duration in seconds
+            fps: frames per second
+        Returns:
+            MiniCPMOBatchFeature object
+        """
+        if audios is None:
+            return MiniCPMOBatchFeature(data={"audio_features": [], "audio_feature_lens": []})
+        audio_features, audio_feature_lens = process_audio_batch(
+            audios=audios,
+            feature_extractor=self.audio_processor,
+            sampling_rate=sampling_rate,
+            max_duration_seconds=30,
+            return_attention_mask=True,
+        )
+        if regroup_to_seconds is not None and len(audio_features) > 0:
+            audio_features, audio_feature_lens = regroup_audio_features(
+                audio_features=audio_features,
+                audio_feature_lens=audio_feature_lens,
+                regroup_seconds=regroup_to_seconds,
+                fps=fps,
+            )
+        model_inputs = {"audio_features": audio_features, "audio_feature_lens": audio_feature_lens}
+        return MiniCPMOBatchFeature(data=model_inputs)
+    def process_audio_streaming(
+        self,
+        audio_chunk: np.ndarray,
+        reset: bool = False,
+        return_batch_feature: bool = False,
+        is_last_chunk: bool = False,
+    ) -> Union[Tuple[torch.Tensor, dict], MiniCPMOBatchFeature]:
+        """Process audio chunk in streaming
+        Args:
+            audio_chunk: audio data chunk (any audio, e.g. first process 125ms, then process 100ms)
+            reset: whether to reset the processor state
+            return_batch_feature: whether to return MiniCPMOBatchFeature format (consistent with process_audio)
+        Returns:
+            If return_batch_feature=False:
+                (audio_features, info)
+                - audio_features: [1, 80, n_frames] mel features
+                - info: processing information dictionary
+            If return_batch_feature=True:
+                MiniCPMOBatchFeature object, containing:
+                - audio_features: [1, 80, n_frames] mel features
+                - audio_feature_lens: [tensor([n_frames])]
+                - info: processing information (as an extra attribute)
+        """
+        if self._streaming_mel_processor is None:
+            raise ValueError("Streaming processor not initialized, please ensure audio_processor is set")
+        if reset:
+            self._streaming_mel_processor.reset()
+        # process chunk
+        mel_features, info = self._streaming_mel_processor.process(audio_chunk, is_last_chunk=is_last_chunk)
+        # determine the return format based on the parameters
+        if return_batch_feature:
+            # return the format consistent with process_audio
+            # note: info returns emitted_frames, which represents the actual output frames
+            n_frames = info.get("emitted_frames", mel_features.shape[-1])
+            model_inputs = {
+                "audio_features": mel_features,
+                "audio_feature_lens": [torch.tensor([n_frames])],
+                "streaming_info": info,  # add streaming processing information
+            }
+            return MiniCPMOBatchFeature(data=model_inputs)
+        else:
+            return mel_features, info
+    def reset_streaming(self):
+        if self._streaming_mel_processor is not None:
+            self._streaming_mel_processor.reset()
+    def get_streaming_chunk_size(self) -> int:
+        if self._streaming_mel_processor is None:
+            raise ValueError("Streaming processor not initialized")
+        return self._streaming_mel_processor.get_chunk_size()
+    def configure_streaming(
+        self,
+        chunk_ms: int = 100,
+        enable_sliding_window: bool = False,
+        slide_trigger_seconds: float = 30.0,
+        slide_stride_seconds: float = 10.0,
+    ):
+        """Configure streaming processor parameters
+        Args:
+            chunk_ms: chunk size in milliseconds
+            enable_sliding_window: whether to enable sliding window (trigger mode)
+            slide_trigger_seconds: trigger threshold for sliding window in seconds
+            slide_stride_seconds: stride for sliding window in seconds
+        """
+        if self.audio_processor is None:
+            raise ValueError("audio_processor is not set")
+        self._init_streaming_processor(
+            chunk_ms=chunk_ms,
+            enable_sliding_window=enable_sliding_window,
+            slide_trigger_seconds=slide_trigger_seconds,
+            slide_stride_seconds=slide_stride_seconds,
+        )
+    def get_streaming_config(self) -> dict:
+        if self._streaming_mel_processor is None:
+            return {}
+        return self._streaming_mel_processor.get_config()
+    def get_streaming_state(self) -> dict:
+        if self._streaming_mel_processor is None:
+            return {}
+        return self._streaming_mel_processor.get_state()
+    def get_streaming_snapshot(self) -> dict:
+        if self._streaming_mel_processor is None:
+            return {}
+        return self._streaming_mel_processor.get_snapshot()
+    def restore_streaming_snapshot(self, snapshot: dict) -> None:
+        if self._streaming_mel_processor is None:
+            return
+        if not snapshot:
+            return
+        self._streaming_mel_processor.restore_snapshot(snapshot)
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
+        images: ImageInput = None,
+        audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]] = None,
+        audio_parts: Optional[list] = None,
+        max_length: Optional[int] = None,
+        do_pad: Optional[bool] = True,
+        max_slice_nums: int = None,
+        use_image_id: bool = True,
+        stream_input: bool = False,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        sampling_rate: Optional[int] = 16000,
+        online_streaming: bool = False,
+        audio_chunk_idx: int = 0,
+        is_last_chunk: bool = False,
+        **kwargs,
+    ) -> MiniCPMOBatchFeature:
+        if images is not None:
+            image_inputs = self.process_image(
+                images=images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
+            )
+        else:
+            image_inputs = None
+        audio_features, audio_feature_lens, audio_phs = self.audio_feature_extract(
+            audios,
+            audio_parts,
+            stream_input,
+            sampling_rate,
+            online_streaming=online_streaming,
+            is_last_chunk=is_last_chunk,
+        )
+        model_inputs = self._convert_omni_to_inputs(
+            image_inputs,
+            audio_phs,
+            text,
+            max_slice_nums=max_slice_nums,
+            use_image_id=use_image_id,
+            max_length=max_length,
+            **kwargs,
+        )
+        model_inputs["audio_features"] = audio_features
+        model_inputs["audio_feature_lens"] = audio_feature_lens
+        result = MiniCPMOBatchFeature(data={**model_inputs})
+        if online_streaming:
+            result.use_extra_context = True
+            result.prefix_extra_frames = 0 if audio_chunk_idx == 0 else 2
+            result.suffix_extra_frames = 2
+            result.chunk_idx = audio_chunk_idx
+        return result
+    def audio_feature_extract(
+        self,
+        audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]], None] = None,
+        audio_parts: Optional[list] = None,
+        stream_input: Optional[bool] = False,
+        sampling_rate: Optional[int] = None,
+        chunk_length: Optional[int] = 1,
+        online_streaming: bool = False,
+        is_last_chunk: bool = False,
+        **kwargs,
+    ):
+        if audios is None:
+            return [], [], []
+        if isinstance(audios, np.ndarray):
+            audios_list = [[audios]]
+        elif isinstance(audios[0], np.ndarray):
+            audios_list = [audios]
+        else:
+            audios_list = audios
+        if audio_parts is not None:
+            assert len(audio_parts) == len(audios_list)
+            for parts, audios in zip(audio_parts, audios_list):
+                assert len(parts) == len(audios)
+        audio_feature_lens_list = []
+        audio_ph_list = []
+        audio_features_all = []
+        # audio placeholder not dependent on audio_parts
+        for audios in audios_list:
+            if audios:
+                audio_ph_list.append(
+                    [
+                        self.get_audio_placeholder(len(a), chunk_input=stream_input, chunk_length=chunk_length)
+                        for a in audios
+                    ]
+                )
+            else:
+                audio_ph_list.append([])
+        for idx, audios in enumerate(audios_list):
+            if audio_parts is not None:
+                # same audio part merge
+                audio_part = audio_parts[idx]
+                merge_audio = []
+                cur_audio = []
+                for aid, (part, audio) in enumerate(zip(audio_part, audios)):
+                    if aid == 0 or audio_part[aid] == audio_part[aid - 1]:
+                        cur_audio.append(audio)
+                    else:
+                        merge_audio.append(np.hstack(cur_audio))
+                        cur_audio = [audio]
+                if cur_audio:
+                    merge_audio.append(np.hstack(cur_audio))
+            else:
+                merge_audio = audios
+            # If the audio exceeds 30 seconds, split it into chunks every 30 seconds.
+            final_merge_audio = []
+            max_audio_inp_len = 30 * sampling_rate
+            for audio in merge_audio:
+                if len(audio) <= max_audio_inp_len:
+                    final_merge_audio.append(audio)
+                else:
+                    for i in range(math.ceil(len(audio) / max_audio_inp_len)):
+                        final_merge_audio.append(audio[i * max_audio_inp_len : (i + 1) * max_audio_inp_len])
+            audio_feature_lens = []
+            if audios:
+                if online_streaming:
+                    # online streaming: only support single audio, directly use process_audio_streaming return format
+                    assert (
+                        len(final_merge_audio) == 1
+                    ), f"online streaming mode only supports single audio, currently there are {len(final_merge_audio)}"
+                    audio = final_merge_audio[0]
+                    result = self.process_audio_streaming(
+                        audio, reset=False, return_batch_feature=True, is_last_chunk=is_last_chunk
+                    )
+                    audio_features_all.append(
+                        result["audio_features"].squeeze(0)
+                    )  # [1, 80, T] -> [80, T], keep consistent with batch processing
+                    audio_feature_lens_list.append(result["audio_feature_lens"][0])
+                else:
+                    # batch processing
+                    audio_inputs = self.audio_processor(
+                        final_merge_audio,
+                        sampling_rate=sampling_rate,
+                        return_attention_mask=True,
+                        padding="max_length",
+                        return_tensors="pt",
+                        **kwargs,
+                    )
+                    audio_feature = audio_inputs["input_features"]
+                    actual_lens = audio_inputs["attention_mask"].sum(dim=1)
+                    for feat, lens in zip(audio_feature, actual_lens):
+                        audio_features_all.append(feat[:, :lens])
+                        audio_feature_lens.append(lens)
+                    audio_feature_lens = torch.hstack(audio_feature_lens)
+                    audio_feature_lens_list.append(audio_feature_lens)
+            else:
+                audio_feature_lens_list.append([])
+        if audio_features_all:
+            audio_features = [i.permute(1, 0) for i in audio_features_all]
+            audio_features = torch.nn.utils.rnn.pad_sequence(
+                audio_features, batch_first=True, padding_value=0.0
+            ).permute(0, 2, 1)
+        else:
+            audio_features = []
+        return audio_features, audio_feature_lens_list, audio_ph_list
+    def _convert(self, input_str, max_inp_length: Optional[int] = None):
+        old_input_ids = self.tokenizer.encode(input_str)
+        listen_token_id = self.tokenizer.convert_tokens_to_ids("<|listen|>")
+        input_ids = []
+        for token in old_input_ids:
+            if token != listen_token_id:
+                input_ids.append(token)
+        if max_inp_length is not None:
+            input_ids = input_ids[:max_inp_length]
+        input_ids = torch.tensor(input_ids, dtype=torch.int32)
+        ## image bound
+        start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id)
+        end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id)
+        image_start_idx = torch.where(start_cond)[0]
+        image_start_idx += 1
+        image_end_idx = torch.where(end_cond)[0]
+        valid_image_nums = max(len(image_start_idx), len(image_end_idx))
+        image_bounds = torch.hstack(
+            [
+                image_start_idx[:valid_image_nums].unsqueeze(-1),
+                image_end_idx[:valid_image_nums].unsqueeze(-1),
+            ]
+        )
+        ##  audio bound
+        audio_start_idx = torch.where(input_ids == self.tokenizer.audio_start_id)[0]
+        audio_end_idx = torch.where(input_ids == self.tokenizer.audio_end_id)[0]
+        assert len(audio_start_idx) == len(audio_end_idx)
+        audio_bounds = torch.hstack([(audio_start_idx + 1).unsqueeze(-1), audio_end_idx.unsqueeze(-1)])
+        spk_start_idx = torch.where(input_ids == self.tokenizer.spk_start_id)[0]
+        spk_end_idx = torch.where(input_ids == self.tokenizer.spk_end_id)[0]
+        assert len(spk_start_idx) == len(spk_end_idx)
+        spk_bounds = torch.hstack([(spk_start_idx + 1).unsqueeze(-1), spk_end_idx.unsqueeze(-1)])
+        return input_ids, image_bounds, audio_bounds, spk_bounds
+    def _convert_omni_to_inputs(
+        self,
+        images,
+        audio_phs,
+        texts: Union[str, List[str]],
+        truncation=None,
+        max_length=None,
+        max_slice_nums=None,
+        use_image_id=None,
+        return_tensors=None,
+        **kwargs,
+    ):
+        if images is None and audio_phs is None:
+            model_inputs = self.tokenizer(
+                texts, return_tensors=return_tensors, truncation=truncation, max_length=max_length, **kwargs
+            )
+            return MiniCPMOBatchFeature(data={**model_inputs})
+        image_pattern = "<image>./</image>"
+        audio_pattern = "<audio>./</audio>"
+        split_pattern = f"({image_pattern}|{audio_pattern})"
+        if isinstance(texts, str):
+            texts = [texts]
+        bs = len(texts)
+        if images is not None:
+            images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
+        else:
+            images, image_sizes, tgt_sizes = [[]] * bs, [[]] * bs, [[]] * bs
+        input_ids_list = []
+        image_bounds_list = []
+        audio_bounds_list = []
+        spk_bounds_list = []
+        for index, text in enumerate(texts):
+            text_chunks = re.split(split_pattern, text)
+            image_tags = re.findall(image_pattern, text)
+            audio_tags = re.findall(audio_pattern, text)
+            if image_tags:
+                assert images is not None
+                assert len(image_tags) == len(image_sizes[index])
+            if audio_tags:
+                assert audio_phs is not None
+                assert len(audio_tags) == len(audio_phs[index])
+            image_id = 0
+            audio_id = 0
+            for i, chunk in enumerate(text_chunks):
+                if chunk == image_pattern:
+                    image_placeholder = self.image_processor.get_slice_image_placeholder(
+                        image_sizes[index][image_id], image_id, max_slice_nums, use_image_id
+                    )
+                    image_id += 1
+                    text_chunks[i] = image_placeholder
+                elif chunk == audio_pattern:
+                    audio_placeholder = audio_phs[index][audio_id]
+                    audio_id += 1
+                    text_chunks[i] = audio_placeholder
+            final_text = "".join(text_chunks)
+            input_ids, image_bounds, audio_bounds, spk_bounds = self._convert(final_text, max_length)
+            input_ids_list.append(input_ids)
+            image_bounds_list.append(image_bounds)
+            audio_bounds_list.append(audio_bounds)
+            spk_bounds_list.append(spk_bounds)
+        padded_input_ids, padding_lengths = self.pad(input_ids_list, padding_side="left")
+        attention_mask = torch.ones_like(padded_input_ids, dtype=torch.bool)
+        for i, length in enumerate(padding_lengths):
+            image_bounds_list[i] = image_bounds_list[i] + length
+            audio_bounds_list[i] = audio_bounds_list[i] + length
+            spk_bounds_list[i] = spk_bounds_list[i] + length
+            attention_mask[i, :length] = False
+        data = {
+            "input_ids": padded_input_ids,
+            "attention_mask": attention_mask,
+            "pixel_values": images,
+            "image_sizes": image_sizes,
+            "image_bound": image_bounds_list,
+            "tgt_sizes": tgt_sizes,
+            "audio_bounds": audio_bounds_list,
+            "spk_bounds": spk_bounds_list,
+        }
+        return data
+    def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
+        items = []
+        if isinstance(inputs[0], list):
+            assert isinstance(inputs[0][0], torch.Tensor)
+            for it in inputs:
+                for tr in it:
+                    items.append(tr)
+        else:
+            assert isinstance(inputs[0], torch.Tensor)
+            items = inputs
+        batch_size = len(items)
+        shape = items[0].shape
+        dim = len(shape)
+        assert dim <= 2
+        if max_length is None:
+            max_length = 0
+        max_length = max(max_length, max(item.shape[-1] for item in items))
+        min_length = min(item.shape[-1] for item in items)
+        dtype = items[0].dtype
+        if dim == 0:
+            return torch.stack([item for item in items], dim=0), [0]
+        elif dim == 1:
+            if max_length == min_length:
+                return torch.stack([item for item in items], dim=0), [0] * batch_size
+            tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
+        else:
+            tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
+        padding_length = []
+        for i, item in enumerate(items):
+            if dim == 1:
+                if padding_side == "left":
+                    tensor[i, -len(item) :] = item.clone()
+                else:
+                    tensor[i, : len(item)] = item.clone()
+            elif dim == 2:
+                if padding_side == "left":
+                    tensor[i, -len(item) :, :] = item.clone()
+                else:
+                    tensor[i, : len(item), :] = item.clone()
+            padding_length.append(tensor.shape[-1] - len(item))
+        return tensor, padding_length

processor_config.json ADDED Viewed

	@@ -0,0 +1,89 @@

+{
+  "audio_processor": {
+    "audio_pool_step": 5,
+    "auto_map": {
+      "AutoFeatureExtractor": "processing_minicpmo.MiniCPMAAudioProcessor",
+      "AutoImageProcessor": "processing_minicpmo.MiniCPMVImageProcessor",
+      "AutoProcessor": "processing_minicpmo.MiniCPMOProcessor"
+    },
+    "chunk_length": 30,
+    "dither": 0.0,
+    "feature_extractor_type": "WhisperFeatureExtractor",
+    "feature_size": 80,
+    "hop_length": 160,
+    "im_end": "</image>",
+    "im_id_end": "</image_id>",
+    "im_id_start": "<image_id>",
+    "im_start": "<image>",
+    "image_feature_size": 64,
+    "image_processor_type": "MiniCPMVImageProcessor",
+    "max_slice_nums": 9,
+    "n_fft": 400,
+    "n_samples": 480000,
+    "nb_max_frames": 3000,
+    "norm_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "norm_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "padding_side": "right",
+    "padding_value": 0.0,
+    "patch_size": 14,
+    "return_attention_mask": false,
+    "sampling_rate": 16000,
+    "scale_resolution": 448,
+    "slice_end": "</slice>",
+    "slice_mode": true,
+    "slice_start": "<slice>",
+    "unk": "<unk>",
+    "use_image_id": true,
+    "version": 4.5
+  },
+  "image_processor": {
+    "audio_pool_step": 5,
+    "auto_map": {
+      "AutoFeatureExtractor": "processing_minicpmo.MiniCPMAAudioProcessor",
+      "AutoImageProcessor": "processing_minicpmo.MiniCPMVImageProcessor",
+      "AutoProcessor": "processing_minicpmo.MiniCPMOProcessor"
+    },
+    "im_end": "</image>",
+    "im_id_end": "</image_id>",
+    "im_id_start": "<image_id>",
+    "im_start": "<image>",
+    "image_feature_size": 64,
+    "image_processor_type": "MiniCPMOImageProcessor",
+    "max_slice_nums": 9,
+    "mean": [
+      [
+        [
+          0.5,
+          0.5,
+          0.5
+        ]
+      ]
+    ],
+    "patch_size": 14,
+    "scale_resolution": 448,
+    "slice_end": "</slice>",
+    "slice_mode": true,
+    "slice_start": "<slice>",
+    "std": [
+      [
+        [
+          0.5,
+          0.5,
+          0.5
+        ]
+      ]
+    ],
+    "unk": "<unk>",
+    "use_image_id": true,
+    "version": 4.5
+  },
+  "processor_class": "MiniCPMOProcessor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,580 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<point>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</point>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<slice>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</slice>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<image_id>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</image_id>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<unit>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</unit>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<answer>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</answer>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<focus>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</focus>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<line>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</line>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<perception>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</perception>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<source_image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</source_image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<image_save_to>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "</image_save_to>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|audio_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|audio|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|audio_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|spk_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|spk|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|spk_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|tts_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|tts_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|listen|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|speak|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|interrupt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|vad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|vad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|emotion_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|emotion_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|speed_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|speed_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|pitch_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|pitch_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|turn_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|turn_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|chunk_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|chunk_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|chunk_tts_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|chunk_tts_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|tts_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|timbre_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": "<|im_start|>",
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": "<unk>"
+}

tokenization_minicpmo_fast.py ADDED Viewed

	@@ -0,0 +1,120 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2026 The OpenBMB Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+from transformers import Qwen2TokenizerFast
+class MiniCPMOTokenizerFast(Qwen2TokenizerFast):
+    def __init__(self, **kwargs):
+        self._bad_token_ids = kwargs.pop("bad_token_ids", [])
+        super().__init__(**kwargs)
+        # image
+        self.im_start = "<image>"
+        self.im_end = "</image>"
+        self.ref_start = "<ref>"
+        self.ref_end = "</ref>"
+        self.box_start = "<box>"
+        self.box_end = "</box>"
+        self.quad_start = "<quad>"
+        self.quad_end = "</quad>"
+        self.slice_start = "<slice>"
+        self.slice_end = "</slice>"
+        self.im_id_start = "<image_id>"
+        self.im_id_end = "</image_id>"
+        # audio
+        self.audio_start = "<|audio_start|>"
+        self.audio_end = "<|audio_end|>"
+        self.spk_start = "<|spk_bos|>"
+        self.spk_end = "<|spk_eos|>"
+        self.tts_start = "<|tts_bos|>"
+        self.tts_end = "<|tts_eos|>"
+    @property
+    def eos_id(self):
+        return self.eos_token_id
+    @property
+    def bos_id(self):
+        return self.bos_token_id
+    @property
+    def unk_id(self):
+        return self.unk_token_id
+    @property
+    def im_start_id(self):
+        return self.convert_tokens_to_ids(self.im_start)
+    @property
+    def im_end_id(self):
+        return self.convert_tokens_to_ids(self.im_end)
+    @property
+    def slice_start_id(self):
+        return self.convert_tokens_to_ids(self.slice_start)
+    @property
+    def slice_end_id(self):
+        return self.convert_tokens_to_ids(self.slice_end)
+    @property
+    def im_id_start_id(self):
+        return self.convert_tokens_to_ids(self.im_id_start)
+    @property
+    def im_id_end_id(self):
+        return self.convert_tokens_to_ids(self.im_id_end)
+    @property
+    def audio_start_id(self):
+        return self.convert_tokens_to_ids(self.audio_start)
+    @property
+    def audio_end_id(self):
+        return self.convert_tokens_to_ids(self.audio_end)
+    @property
+    def spk_start_id(self):
+        return self.convert_tokens_to_ids(self.spk_start)
+    @property
+    def spk_end_id(self):
+        return self.convert_tokens_to_ids(self.spk_end)
+    @property
+    def tts_start_id(self):
+        return self.convert_tokens_to_ids(self.tts_start)
+    @property
+    def tts_end_id(self):
+        return self.convert_tokens_to_ids(self.tts_end)
+    @staticmethod
+    def escape(text: str) -> str:
+        return text
+    @staticmethod
+    def unescape(text: str) -> str:
+        return text
+    @property
+    def bad_token_ids(self) -> List[int]:
+        return self._bad_token_ids

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66664f87759d9e829e7ef0ded96976727374dcd7ca6f3ae9bfe89bbda541e5af
+size 11437708

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,6989 @@

+{
+  "add_prefix_space": false,
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_minicpmo_fast.MiniCPMOTokenizerFast",
+      null
+    ]
+  },
+  "backend": "tokenizers",
+  "bad_token_ids": [
+    7,
+    8,
+    94,
+    95,
+    96,
+    97,
+    98,
+    99,
+    100,
+    101,
+    102,
+    103,
+    104,
+    105,
+    106,
+    107,
+    108,
+    109,
+    110,
+    111,
+    112,
+    113,
+    114,
+    115,
+    116,
+    117,
+    118,
+    119,
+    120,
+    121,
+    122,
+    123,
+    124,
+    125,
+    126,
+    127,
+    128,
+    129,
+    130,
+    131,
+    132,
+    133,
+    134,
+    135,
+    136,
+    137,
+    138,
+    139,
+    140,
+    141,
+    142,
+    143,
+    144,
+    145,
+    146,
+    147,
+    148,
+    149,
+    150,
+    151,
+    152,
+    153,
+    154,
+    155,
+    156,
+    157,
+    158,
+    159,
+    160,
+    161,
+    162,
+    163,
+    164,
+    165,
+    166,
+    167,
+    168,
+    169,
+    170,
+    171,
+    172,
+    173,
+    174,
+    175,
+    176,
+    177,
+    178,
+    179,
+    180,
+    181,
+    182,
+    183,
+    184,
+    185,
+    186,
+    187,
+    198,
+    201,
+    222,
+    223,
+    224,
+    225,
+    226,
+    227,
+    228,
+    229,
+    230,
+    231,
+    232,
+    233,
+    234,
+    235,
+    236,
+    237,
+    238,
+    239,
+    240,
+    241,
+    242,
+    243,
+    244,
+    245,
+    246,
+    247,
+    248,
+    249,
+    250,
+    251,
+    252,
+    253,
+    254,
+    255,
+    271,
+    280,
+    317,
+    319,
+    320,
+    340,
+    341,
+    345,
+    368,
+    378,
+    382,
+    397,
+    401,
+    445,
+    456,
+    463,
+    492,
+    510,
+    515,
+    532,
+    543,
+    555,
+    568,
+    593,
+    624,
+    626,
+    630,
+    636,
+    692,
+    698,
+    699,
+    701,
+    715,
+    735,
+    736,
+    741,
+    751,
+    756,
+    797,
+    863,
+    871,
+    873,
+    876,
+    881,
+    899,
+    921,
+    935,
+    955,
+    972,
+    982,
+    1005,
+    1006,
+    1010,
+    1019,
+    1022,
+    1066,
+    1106,
+    1138,
+    1141,
+    1153,
+    1155,
+    1157,
+    1171,
+    1188,
+    1193,
+    1213,
+    1215,
+    1218,
+    1248,
+    1255,
+    1259,
+    1264,
+    1277,
+    1278,
+    1295,
+    1305,
+    1326,
+    1337,
+    1339,
+    1394,
+    1406,
+    1423,
+    1428,
+    1433,
+    1439,
+    1445,
+    1447,
+    1476,
+    1500,
+    1505,
+    1507,
+    1540,
+    1548,
+    1554,
+    1572,
+    1583,
+    1625,
+    1647,
+    1648,
+    1666,
+    1683,
+    1686,
+    1711,
+    1719,
+    1727,
+    1747,
+    1755,
+    1771,
+    1781,
+    1789,
+    1823,
+    1827,
+    1837,
+    1871,
+    1883,
+    1926,
+    1939,
+    1956,
+    2012,
+    2026,
+    2075,
+    2099,
+    2129,
+    2139,
+    2140,
+    2146,
+    2194,
+    2217,
+    2219,
+    2226,
+    2235,
+    2242,
+    2262,
+    2278,
+    2289,
+    2303,
+    2306,
+    2315,
+    2333,
+    2357,
+    2376,
+    2394,
+    2398,
+    2399,
+    2440,
+    2457,
+    2467,
+    2490,
+    2529,
+    2533,
+    2546,
+    2553,
+    2558,
+    2561,
+    2570,
+    2586,
+    2592,
+    2597,
+    2601,
+    2605,
+    2741,
+    2751,
+    2759,
+    2761,
+    2775,
+    2785,
+    2791,
+    2815,
+    2822,
+    2834,
+    2837,
+    2858,
+    2877,
+    2879,
+    2920,
+    2960,
+    3011,
+    3023,
+    3025,
+    3032,
+    3044,
+    3071,
+    3079,
+    3148,
+    3153,
+    3189,
+    3190,
+    3203,
+    3211,
+    3219,
+    3224,
+    3237,
+    3268,
+    3269,
+    3276,
+    3284,
+    3299,
+    3302,
+    3315,
+    3325,
+    3374,
+    3396,
+    3402,
+    3407,
+    3456,
+    3471,
+    3475,
+    3485,
+    3489,
+    3490,
+    3507,
+    3515,
+    3518,
+    3534,
+    3554,
+    3568,
+    3576,
+    3584,
+    3593,
+    3609,
+    3610,
+    3622,
+    3623,
+    3634,
+    3646,
+    3648,
+    3659,
+    3679,
+    3715,
+    3724,
+    3733,
+    3747,
+    3755,
+    3759,
+    3764,
+    3789,
+    3804,
+    3860,
+    3870,
+    3876,
+    3879,
+    3913,
+    3932,
+    3964,
+    3975,
+    3989,
+    3997,
+    4010,
+    4015,
+    4035,
+    4071,
+    4080,
+    4140,
+    4167,
+    4175,
+    4192,
+    4199,
+    4257,
+    4277,
+    4283,
+    4291,
+    4292,
+    4319,
+    4333,
+    4346,
+    4380,
+    4386,
+    4390,
+    4397,
+    4424,
+    4430,
+    4432,
+    4455,
+    4456,
+    4461,
+    4474,
+    4539,
+    4546,
+    4555,
+    4557,
+    4567,
+    4579,
+    4590,
+    4609,
+    4610,
+    4639,
+    4660,
+    4667,
+    4710,
+    4723,
+    4806,
+    4811,
+    4821,
+    4857,
+    4860,
+    4866,
+    4875,
+    4877,
+    4891,
+    4894,
+    4919,
+    4921,
+    4930,
+    4945,
+    4956,
+    4957,
+    4967,
+    4973,
+    5031,
+    5038,
+    5065,
+    5078,
+    5125,
+    5130,
+    5133,
+    5134,
+    5140,
+    5194,
+    5210,
+    5231,
+    5267,
+    5304,
+    5349,
+    5378,
+    5384,
+    5393,
+    5410,
+    5422,
+    5432,
+    5433,
+    5434,
+    5438,
+    5453,
+    5455,
+    5463,
+    5468,
+    5484,
+    5502,
+    5513,
+    5550,
+    5563,
+    5564,
+    5581,
+    5613,
+    5618,
+    5620,
+    5661,
+    5680,
+    5684,
+    5691,
+    5731,
+    5758,
+    5783,
+    5784,
+    5801,
+    5808,
+    5809,
+    5860,
+    5872,
+    5894,
+    5895,
+    5900,
+    5902,
+    5905,
+    5907,
+    5928,
+    5929,
+    5945,
+    5953,
+    5957,
+    5959,
+    5960,
+    5966,
+    5969,
+    5993,
+    6011,
+    6060,
+    6098,
+    6107,
+    6110,
+    6114,
+    6138,
+    6142,
+    6163,
+    6201,
+    6204,
+    6211,
+    6228,
+    6235,
+    6253,
+    6260,
+    6274,
+    6320,
+    6332,
+    6336,
+    6360,
+    6390,
+    6413,
+    6416,
+    6421,
+    6425,
+    6475,
+    6518,
+    6522,
+    6557,
+    6561,
+    6564,
+    6567,
+    6571,
+    6599,
+    6606,
+    6620,
+    6636,
+    6684,
+    6699,
+    6734,
+    6762,
+    6779,
+    6796,
+    6856,
+    6870,
+    6881,
+    6882,
+    6901,
+    6903,
+    6913,
+    6926,
+    6933,
+    6958,
+    6965,
+    6987,
+    7021,
+    7026,
+    7045,
+    7088,
+    7115,
+    7129,
+    7144,
+    7158,
+    7165,
+    7195,
+    7213,
+    7229,
+    7235,
+    7252,
+    7257,
+    7287,
+    7307,
+    7311,
+    7317,
+    7318,
+    7320,
+    7331,
+    7333,
+    7356,
+    7360,
+    7367,
+    7368,
+    7392,
+    7457,
+    7492,
+    7502,
+    7511,
+    7520,
+    7534,
+    7536,
+    7537,
+    7547,
+    7552,
+    7569,
+    7588,
+    7613,
+    7620,
+    7624,
+    7631,
+    7635,
+    7644,
+    7694,
+    7705,
+    7723,
+    7731,
+    7805,
+    7832,
+    7835,
+    7851,
+    7885,
+    7908,
+    8006,
+    8013,
+    8022,
+    8032,
+    8068,
+    8079,
+    8096,
+    8097,
+    8119,
+    8132,
+    8154,
+    8158,
+    8204,
+    8244,
+    8268,
+    8283,
+    8306,
+    8324,
+    8361,
+    8389,
+    8390,
+    8401,
+    8488,
+    8508,
+    8519,
+    8525,
+    8533,
+    8563,
+    8588,
+    8616,
+    8620,
+    8623,
+    8637,
+    8680,
+    8739,
+    8754,
+    8762,
+    8797,
+    8803,
+    8805,
+    8824,
+    8839,
+    8843,
+    8902,
+    8908,
+    8917,
+    8945,
+    8956,
+    8972,
+    8997,
+    8998,
+    9035,
+    9049,
+    9098,
+    9139,
+    9147,
+    9206,
+    9231,
+    9248,
+    9260,
+    9272,
+    9284,
+    9304,
+    9321,
+    9338,
+    9340,
+    9365,
+    9401,
+    9404,
+    9469,
+    9470,
+    9531,
+    9533,
+    9541,
+    9563,
+    9568,
+    9577,
+    9604,
+    9607,
+    9623,
+    9638,
+    9639,
+    9640,
+    9697,
+    9719,
+    9722,
+    9730,
+    9739,
+    9743,
+    9747,
+    9758,
+    9764,
+    9791,
+    9828,
+    9843,
+    9849,
+    9879,
+    9900,
+    9909,
+    9912,
+    9921,
+    9940,
+    9957,
+    9973,
+    9992,
+    10040,
+    10075,
+    10079,
+    10081,
+    10086,
+    10108,
+    10116,
+    10131,
+    10133,
+    10149,
+    10170,
+    10171,
+    10203,
+    10236,
+    10239,
+    10278,
+    10293,
+    10297,
+    10340,
+    10343,
+    10346,
+    10368,
+    10370,
+    10384,
+    10389,
+    10407,
+    10417,
+    10419,
+    10444,
+    10448,
+    10452,
+    10478,
+    10483,
+    10556,
+    10569,
+    10583,
+    10595,
+    10624,
+    10639,
+    10662,
+    10663,
+    10683,
+    10697,
+    10699,
+    10722,
+    10727,
+    10731,
+    10741,
+    10764,
+    10768,
+    10874,
+    10896,
+    10907,
+    10937,
+    10939,
+    10941,
+    10947,
+    10952,
+    10959,
+    10961,
+    10977,
+    10996,
+    11014,
+    11022,
+    11024,
+    11043,
+    11057,
+    11120,
+    11125,
+    11132,
+    11142,
+    11146,
+    11162,
+    11166,
+    11173,
+    11195,
+    11248,
+    11322,
+    11328,
+    11334,
+    11336,
+    11378,
+    11384,
+    11436,
+    11462,
+    11481,
+    11484,
+    11488,
+    11512,
+    11520,
+    11534,
+    11557,
+    11559,
+    11583,
+    11600,
+    11614,
+    11620,
+    11621,
+    11648,
+    11688,
+    11700,
+    11730,
+    11743,
+    11776,
+    11781,
+    11801,
+    11805,
+    11843,
+    11858,
+    11884,
+    11895,
+    11970,
+    11973,
+    11974,
+    11985,
+    11993,
+    11995,
+    12006,
+    12022,
+    12024,
+    12047,
+    12078,
+    12084,
+    12113,
+    12130,
+    12143,
+    12145,
+    12148,
+    12186,
+    12236,
+    12259,
+    12266,
+    12274,
+    12279,
+    12286,
+    12298,
+    12306,
+    12310,
+    12317,
+    12367,
+    12381,
+    12397,
+    12410,
+    12431,
+    12437,
+    12470,
+    12489,
+    12491,
+    12504,
+    12512,
+    12520,
+    12526,
+    12530,
+    12531,
+    12533,
+    12573,
+    12584,
+    12585,
+    12587,
+    12602,
+    12619,
+    12628,
+    12663,
+    12668,
+    12706,
+    12760,
+    12787,
+    12797,
+    12798,
+    12805,
+    12806,
+    12832,
+    12860,
+    12861,
+    12892,
+    12918,
+    12924,
+    12950,
+    12951,
+    12954,
+    13003,
+    13005,
+    13053,
+    13056,
+    13058,
+    13087,
+    13092,
+    13106,
+    13116,
+    13135,
+    13148,
+    13165,
+    13174,
+    13201,
+    13204,
+    13211,
+    13246,
+    13268,
+    13274,
+    13285,
+    13287,
+    13321,
+    13338,
+    13344,
+    13345,
+    13362,
+    13426,
+    13441,
+    13453,
+    13456,
+    13465,
+    13493,
+    13512,
+    13519,
+    13531,
+    13555,
+    13558,
+    13567,
+    13645,
+    13698,
+    13744,
+    13749,
+    13836,
+    13838,
+    13846,
+    13861,
+    13869,
+    13887,
+    13913,
+    13921,
+    13941,
+    13949,
+    13976,
+    13982,
+    13988,
+    14001,
+    14013,
+    14015,
+    14031,
+    14088,
+    14090,
+    14094,
+    14108,
+    14146,
+    14160,
+    14161,
+    14167,
+    14171,
+    14208,
+    14217,
+    14223,
+    14273,
+    14280,
+    14288,
+    14326,
+    14328,
+    14333,
+    14362,
+    14368,
+    14382,
+    14414,
+    14426,
+    14437,
+    14442,
+    14451,
+    14467,
+    14498,
+    14512,
+    14520,
+    14546,
+    14559,
+    14573,
+    14599,
+    14618,
+    14621,
+    14654,
+    14669,
+    14675,
+    14676,
+    14687,
+    14702,
+    14705,
+    14711,
+    14731,
+    14750,
+    14808,
+    14818,
+    14922,
+    14925,
+    14929,
+    14936,
+    14942,
+    14959,
+    14979,
+    14998,
+    15005,
+    15014,
+    15028,
+    15046,
+    15047,
+    15049,
+    15063,
+    15075,
+    15076,
+    15078,
+    15084,
+    15087,
+    15096,
+    15127,
+    15143,
+    15146,
+    15157,
+    15197,
+    15213,
+    15224,
+    15225,
+    15238,
+    15240,
+    15253,
+    15290,
+    15297,
+    15299,
+    15306,
+    15329,
+    15348,
+    15362,
+    15374,
+    15393,
+    15397,
+    15414,
+    15424,
+    15434,
+    15436,
+    15441,
+    15452,
+    15483,
+    15514,
+    15538,
+    15573,
+    15600,
+    15617,
+    15620,
+    15639,
+    15674,
+    15675,
+    15677,
+    15694,
+    15716,
+    15746,
+    15752,
+    15761,
+    15766,
+    15785,
+    15793,
+    15798,
+    15799,
+    15827,
+    15842,
+    15852,
+    15861,
+    15865,
+    15896,
+    15899,
+    15902,
+    15906,
+    15912,
+    15921,
+    15927,
+    15956,
+    15979,
+    15984,
+    15989,
+    16018,
+    16043,
+    16057,
+    16060,
+    16075,
+    16094,
+    16117,
+    16159,
+    16160,
+    16179,
+    16218,
+    16235,
+    16240,
+    16264,
+    16315,
+    16324,
+    16360,
+    16361,
+    16368,
+    16418,
+    16425,
+    16448,
+    16466,
+    16499,
+    16511,
+    16512,
+    16518,
+    16630,
+    16633,
+    16640,
+    16667,
+    16707,
+    16738,
+    16747,
+    16751,
+    16760,
+    16778,
+    16813,
+    16817,
+    16825,
+    16860,
+    16863,
+    16864,
+    16885,
+    16909,
+    16913,
+    16921,
+    16930,
+    16968,
+    16975,
+    16978,
+    16980,
+    17008,
+    17016,
+    17027,
+    17036,
+    17049,
+    17055,
+    17057,
+    17084,
+    17094,
+    17096,
+    17118,
+    17135,
+    17140,
+    17156,
+    17158,
+    17185,
+    17199,
+    17223,
+    17255,
+    17260,
+    17262,
+    17277,
+    17284,
+    17351,
+    17383,
+    17400,
+    17401,
+    17409,
+    17419,
+    17429,
+    17435,
+    17462,
+    17467,
+    17476,
+    17477,
+    17483,
+    17525,
+    17550,
+    17554,
+    17560,
+    17566,
+    17587,
+    17591,
+    17607,
+    17609,
+    17621,
+    17622,
+    17634,
+    17641,
+    17642,
+    17643,
+    17671,
+    17690,
+    17701,
+    17705,
+    17706,
+    17729,
+    17730,
+    17745,
+    17767,
+    17783,
+    17794,
+    17815,
+    17825,
+    17826,
+    17830,
+    17844,
+    17848,
+    17868,
+    17894,
+    17918,
+    17928,
+    17935,
+    17973,
+    17981,
+    17989,
+    18005,
+    18059,
+    18082,
+    18113,
+    18137,
+    18140,
+    18205,
+    18208,
+    18221,
+    18236,
+    18238,
+    18259,
+    18268,
+    18285,
+    18292,
+    18334,
+    18363,
+    18364,
+    18381,
+    18390,
+    18396,
+    18421,
+    18459,
+    18505,
+    18507,
+    18535,
+    18543,
+    18544,
+    18552,
+    18556,
+    18574,
+    18584,
+    18585,
+    18591,
+    18611,
+    18639,
+    18682,
+    18722,
+    18733,
+    18742,
+    18751,
+    18754,
+    18769,
+    18772,
+    18797,
+    18811,
+    18850,
+    18893,
+    18913,
+    18914,
+    18949,
+    18959,
+    18978,
+    19011,
+    19065,
+    19066,
+    19088,
+    19107,
+    19113,
+    19123,
+    19129,
+    19134,
+    19141,
+    19144,
+    19153,
+    19184,
+    19235,
+    19238,
+    19239,
+    19245,
+    19261,
+    19306,
+    19324,
+    19328,
+    19343,
+    19347,
+    19356,
+    19376,
+    19377,
+    19385,
+    19421,
+    19457,
+    19468,
+    19475,
+    19494,
+    19506,
+    19513,
+    19514,
+    19536,
+    19546,
+    19549,
+    19564,
+    19587,
+    19595,
+    19611,
+    19630,
+    19676,
+    19687,
+    19704,
+    19707,
+    19713,
+    19738,
+    19741,
+    19778,
+    19788,
+    19799,
+    19814,
+    19821,
+    19886,
+    19896,
+    19901,
+    19916,
+    19930,
+    19946,
+    19956,
+    19973,
+    19987,
+    20024,
+    20026,
+    20047,
+    20078,
+    20098,
+    20103,
+    20136,
+    20206,
+    20225,
+    20232,
+    20235,
+    20258,
+    20267,
+    20281,
+    20289,
+    20305,
+    20307,
+    20318,
+    20320,
+    20338,
+    20354,
+    20356,
+    20375,
+    20382,
+    20386,
+    20388,
+    20390,
+    20398,
+    20418,
+    20447,
+    20472,
+    20484,
+    20571,
+    20574,
+    20582,
+    20591,
+    20625,
+    20635,
+    20698,
+    20707,
+    20709,
+    20711,
+    20741,
+    20747,
+    20749,
+    20758,
+    20770,
+    20778,
+    20779,
+    20833,
+    20862,
+    20867,
+    20879,
+    20885,
+    20912,
+    20919,
+    20941,
+    20996,
+    21005,
+    21012,
+    21035,
+    21054,
+    21082,
+    21096,
+    21103,
+    21128,
+    21135,
+    21168,
+    21174,
+    21182,
+    21200,
+    21205,
+    21227,
+    21228,
+    21230,
+    21238,
+    21246,
+    21253,
+    21280,
+    21293,
+    21298,
+    21358,
+    21370,
+    21406,
+    21421,
+    21457,
+    21518,
+    21530,
+    21549,
+    21560,
+    21562,
+    21579,
+    21613,
+    21620,
+    21636,
+    21650,
+    21654,
+    21656,
+    21668,
+    21672,
+    21675,
+    21680,
+    21686,
+    21689,
+    21696,
+    21702,
+    21704,
+    21731,
+    21783,
+    21796,
+    21798,
+    21849,
+    21865,
+    21905,
+    21906,
+    21929,
+    21949,
+    21956,
+    21972,
+    21974,
+    21987,
+    22009,
+    22022,
+    22025,
+    22042,
+    22050,
+    22071,
+    22074,
+    22078,
+    22084,
+    22085,
+    22116,
+    22129,
+    22158,
+    22165,
+    22224,
+    22225,
+    22247,
+    22297,
+    22312,
+    22322,
+    22336,
+    22337,
+    22343,
+    22345,
+    22367,
+    22370,
+    22428,
+    22438,
+    22460,
+    22498,
+    22525,
+    22558,
+    22580,
+    22597,
+    22607,
+    22612,
+    22614,
+    22649,
+    22663,
+    22677,
+    22689,
+    22701,
+    22712,
+    22726,
+    22746,
+    22754,
+    22762,
+    22768,
+    22788,
+    22796,
+    22842,
+    22857,
+    22859,
+    22861,
+    22886,
+    22895,
+    22940,
+    22956,
+    22983,
+    22987,
+    22988,
+    22991,
+    23014,
+    23032,
+    23052,
+    23053,
+    23083,
+    23093,
+    23135,
+    23147,
+    23158,
+    23174,
+    23231,
+    23254,
+    23272,
+    23315,
+    23341,
+    23351,
+    23359,
+    23369,
+    23373,
+    23398,
+    23421,
+    23426,
+    23431,
+    23433,
+    23439,
+    23443,
+    23444,
+    23459,
+    23474,
+    23478,
+    23479,
+    23482,
+    23509,
+    23515,
+    23530,
+    23547,
+    23569,
+    23571,
+    23585,
+    23599,
+    23600,
+    23625,
+    23637,
+    23669,
+    23687,
+    23754,
+    23758,
+    23781,
+    23794,
+    23866,
+    23872,
+    23884,
+    23891,
+    23894,
+    23912,
+    23934,
+    24008,
+    24011,
+    24021,
+    24023,
+    24032,
+    24041,
+    24045,
+    24071,
+    24076,
+    24077,
+    24097,
+    24123,
+    24135,
+    24144,
+    24147,
+    24148,
+    24159,
+    24183,
+    24197,
+    24212,
+    24216,
+    24226,
+    24254,
+    24265,
+    24305,
+    24312,
+    24316,
+    24320,
+    24333,
+    24337,
+    24338,
+    24345,
+    24352,
+    24365,
+    24375,
+    24377,
+    24389,
+    24391,
+    24429,
+    24437,
+    24447,
+    24460,
+    24476,
+    24485,
+    24497,
+    24520,
+    24539,
+    24616,
+    24697,
+    24727,
+    24734,
+    24735,
+    24749,
+    24750,
+    24772,
+    24796,
+    24816,
+    24817,
+    24825,
+    24827,
+    24831,
+    24840,
+    24850,
+    24853,
+    24864,
+    24881,
+    24948,
+    24974,
+    24978,
+    24985,
+    25010,
+    25011,
+    25035,
+    25046,
+    25049,
+    25089,
+    25107,
+    25110,
+    25125,
+    25138,
+    25145,
+    25162,
+    25190,
+    25209,
+    25249,
+    25289,
+    25315,
+    25319,
+    25321,
+    25340,
+    25346,
+    25389,
+    25401,
+    25409,
+    25435,
+    25447,
+    25452,
+    25464,
+    25467,
+    25468,
+    25490,
+    25495,
+    25501,
+    25521,
+    25526,
+    25534,
+    25547,
+    25556,
+    25571,
+    25589,
+    25592,
+    25624,
+    25639,
+    25644,
+    25703,
+    25731,
+    25733,
+    25746,
+    25759,
+    25772,
+    25781,
+    25786,
+    25797,
+    25805,
+    25821,
+    25829,
+    25865,
+    25870,
+    25885,
+    25890,
+    25897,
+    25906,
+    25910,
+    25912,
+    25917,
+    25923,
+    25928,
+    25931,
+    25933,
+    25956,
+    25974,
+    26027,
+    26043,
+    26072,
+    26074,
+    26082,
+    26087,
+    26097,
+    26126,
+    26169,
+    26188,
+    26230,
+    26238,
+    26245,
+    26254,
+    26259,
+    26280,
+    26285,
+    26311,
+    26336,
+    26384,
+    26405,
+    26409,
+    26432,
+    26442,
+    26454,
+    26469,
+    26487,
+    26496,
+    26506,
+    26525,
+    26546,
+    26556,
+    26566,
+    26578,
+    26579,
+    26601,
+    26609,
+    26637,
+    26690,
+    26717,
+    26731,
+    26771,
+    26772,
+    26782,
+    26794,
+    26809,
+    26828,
+    26840,
+    26850,
+    26853,
+    26856,
+    26861,
+    26903,
+    26919,
+    26927,
+    26974,
+    26999,
+    27007,
+    27010,
+    27027,
+    27042,
+    27050,
+    27062,
+    27070,
+    27085,
+    27095,
+    27113,
+    27119,
+    27126,
+    27144,
+    27151,
+    27166,
+    27184,
+    27189,
+    27191,
+    27209,
+    27214,
+    27220,
+    27249,
+    27260,
+    27271,
+    27275,
+    27301,
+    27311,
+    27316,
+    27325,
+    27352,
+    27365,
+    27398,
+    27424,
+    27427,
+    27435,
+    27438,
+    27439,
+    27443,
+    27486,
+    27487,
+    27506,
+    27510,
+    27517,
+    27538,
+    27568,
+    27580,
+    27584,
+    27610,
+    27614,
+    27631,
+    27640,
+    27654,
+    27659,
+    27668,
+    27676,
+    27701,
+    27704,
+    27757,
+    27762,
+    27766,
+    27767,
+    27771,
+    27772,
+    27818,
+    27845,
+    27855,
+    27866,
+    27876,
+    27884,
+    27901,
+    27907,
+    27978,
+    27982,
+    27999,
+    28019,
+    28038,
+    28075,
+    28094,
+    28107,
+    28114,
+    28116,
+    28127,
+    28145,
+    28154,
+    28168,
+    28175,
+    28197,
+    28207,
+    28247,
+    28261,
+    28266,
+    28272,
+    28281,
+    28311,
+    28325,
+    28329,
+    28336,
+    28348,
+    28372,
+    28375,
+    28382,
+    28389,
+    28394,
+    28406,
+    28429,
+    28447,
+    28489,
+    28520,
+    28565,
+    28570,
+    28581,
+    28653,
+    28654,
+    28665,
+    28669,
+    28671,
+    28673,
+    28680,
+    28706,
+    28712,
+    28743,
+    28757,
+    28864,
+    28875,
+    28880,
+    28907,
+    28918,
+    28927,
+    28958,
+    28974,
+    28978,
+    28987,
+    29043,
+    29076,
+    29083,
+    29084,
+    29089,
+    29110,
+    29113,
+    29122,
+    29128,
+    29150,
+    29184,
+    29191,
+    29209,
+    29220,
+    29237,
+    29240,
+    29254,
+    29324,
+    29333,
+    29345,
+    29355,
+    29359,
+    29376,
+    29389,
+    29465,
+    29487,
+    29529,
+    29555,
+    29562,
+    29592,
+    29605,
+    29616,
+    29621,
+    29633,
+    29636,
+    29675,
+    29710,
+    29721,
+    29766,
+    29768,
+    29771,
+    29773,
+    29776,
+    29811,
+    29816,
+    29833,
+    29836,
+    29879,
+    29893,
+    29932,
+    29941,
+    29955,
+    29963,
+    29975,
+    29976,
+    30006,
+    30008,
+    30018,
+    30034,
+    30085,
+    30122,
+    30139,
+    30154,
+    30212,
+    30218,
+    30247,
+    30260,
+    30263,
+    30274,
+    30323,
+    30325,
+    30354,
+    30375,
+    30376,
+    30389,
+    30390,
+    30395,
+    30407,
+    30409,
+    30424,
+    30438,
+    30458,
+    30463,
+    30467,
+    30469,
+    30475,
+    30484,
+    30493,
+    30520,
+    30523,
+    30535,
+    30558,
+    30584,
+    30625,
+    30631,
+    30634,
+    30638,
+    30645,
+    30678,
+    30695,
+    30723,
+    30734,
+    30736,
+    30749,
+    30750,
+    30757,
+    30779,
+    30793,
+    30828,
+    30831,
+    30838,
+    30850,
+    30873,
+    30885,
+    30916,
+    30917,
+    30948,
+    30949,
+    30958,
+    30981,
+    30991,
+    31159,
+    31200,
+    31218,
+    31225,
+    31239,
+    31257,
+    31278,
+    31286,
+    31295,
+    31296,
+    31305,
+    31307,
+    31338,
+    31356,
+    31368,
+    31411,
+    31418,
+    31424,
+    31436,
+    31470,
+    31482,
+    31483,
+    31487,
+    31501,
+    31540,
+    31567,
+    31587,
+    31601,
+    31673,
+    31688,
+    31706,
+    31707,
+    31715,
+    31723,
+    31729,
+    31732,
+    31740,
+    31764,
+    31781,
+    31797,
+    31813,
+    31816,
+    31856,
+    31870,
+    31884,
+    31888,
+    31906,
+    31911,
+    31931,
+    31933,
+    31951,
+    31975,
+    31980,
+    31982,
+    32013,
+    32014,
+    32028,
+    32057,
+    32068,
+    32083,
+    32087,
+    32090,
+    32148,
+    32162,
+    32181,
+    32184,
+    32203,
+    32208,
+    32224,
+    32254,
+    32263,
+    32268,
+    32279,
+    32295,
+    32314,
+    32326,
+    32343,
+    32356,
+    32373,
+    32386,
+    32423,
+    32431,
+    32450,
+    32465,
+    32484,
+    32495,
+    32511,
+    32544,
+    32545,
+    32548,
+    32574,
+    32585,
+    32596,
+    32598,
+    32602,
+    32609,
+    32616,
+    32623,
+    32624,
+    32636,
+    32647,
+    32691,
+    32735,
+    32737,
+    32756,
+    32798,
+    32805,
+    32843,
+    32868,
+    32872,
+    32881,
+    32926,
+    32968,
+    32985,
+    32989,
+    32991,
+    32996,
+    33010,
+    33013,
+    33016,
+    33021,
+    33028,
+    33030,
+    33077,
+    33084,
+    33089,
+    33093,
+    33104,
+    33130,
+    33152,
+    33157,
+    33173,
+    33179,
+    33199,
+    33293,
+    33294,
+    33333,
+    33351,
+    33352,
+    33397,
+    33413,
+    33418,
+    33419,
+    33424,
+    33436,
+    33440,
+    33442,
+    33478,
+    33484,
+    33498,
+    33543,
+    33546,
+    33552,
+    33556,
+    33579,
+    33593,
+    33621,
+    33626,
+    33641,
+    33645,
+    33660,
+    33666,
+    33673,
+    33687,
+    33694,
+    33711,
+    33734,
+    33739,
+    33761,
+    33766,
+    33783,
+    33794,
+    33800,
+    33804,
+    33810,
+    33826,
+    33862,
+    33871,
+    33916,
+    33928,
+    33929,
+    33933,
+    33943,
+    33947,
+    33949,
+    33955,
+    33959,
+    33986,
+    33987,
+    33989,
+    33999,
+    34007,
+    34010,
+    34026,
+    34052,
+    34081,
+    34082,
+    34083,
+    34135,
+    34141,
+    34143,
+    34149,
+    34184,
+    34203,
+    34232,
+    34297,
+    34299,
+    34300,
+    34321,
+    34322,
+    34327,
+    34331,
+    34332,
+    34341,
+    34369,
+    34403,
+    34410,
+    34420,
+    34425,
+    34429,
+    34480,
+    34483,
+    34499,
+    34546,
+    34577,
+    34583,
+    34590,
+    34600,
+    34622,
+    34629,
+    34642,
+    34649,
+    34670,
+    34690,
+    34759,
+    34764,
+    34773,
+    34793,
+    34799,
+    34811,
+    34812,
+    34820,
+    34821,
+    34829,
+    34864,
+    34866,
+    34885,
+    34895,
+    34923,
+    34939,
+    34944,
+    34973,
+    34985,
+    34992,
+    35017,
+    35049,
+    35055,
+    35063,
+    35065,
+    35066,
+    35075,
+    35099,
+    35106,
+    35115,
+    35117,
+    35125,
+    35146,
+    35173,
+    35178,
+    35184,
+    35190,
+    35199,
+    35200,
+    35208,
+    35219,
+    35229,
+    35248,
+    35274,
+    35289,
+    35293,
+    35297,
+    35306,
+    35311,
+    35330,
+    35334,
+    35335,
+    35358,
+    35392,
+    35393,
+    35394,
+    35410,
+    35419,
+    35420,
+    35429,
+    35438,
+    35452,
+    35460,
+    35496,
+    35521,
+    35528,
+    35537,
+    35538,
+    35553,
+    35582,
+    35635,
+    35636,
+    35653,
+    35674,
+    35676,
+    35677,
+    35680,
+    35687,
+    35696,
+    35712,
+    35718,
+    35720,
+    35721,
+    35755,
+    35786,
+    35811,
+    35829,
+    35833,
+    35850,
+    35855,
+    35864,
+    35866,
+    35888,
+    35898,
+    35963,
+    36001,
+    36014,
+    36046,
+    36073,
+    36087,
+    36088,
+    36097,
+    36109,
+    36131,
+    36168,
+    36170,
+    36174,
+    36180,
+    36197,
+    36206,
+    36207,
+    36246,
+    36260,
+    36288,
+    36289,
+    36308,
+    36323,
+    36330,
+    36334,
+    36360,
+    36363,
+    36368,
+    36384,
+    36456,
+    36474,
+    36484,
+    36502,
+    36514,
+    36548,
+    36577,
+    36606,
+    36609,
+    36610,
+    36618,
+    36622,
+    36624,
+    36652,
+    36669,
+    36677,
+    36695,
+    36713,
+    36715,
+    36720,
+    36778,
+    36781,
+    36800,
+    36818,
+    36825,
+    36827,
+    36845,
+    36876,
+    36882,
+    36884,
+    36902,
+    36914,
+    36915,
+    36928,
+    36944,
+    36951,
+    36968,
+    36978,
+    36979,
+    36984,
+    37023,
+    37038,
+    37048,
+    37051,
+    37064,
+    37076,
+    37084,
+    37096,
+    37124,
+    37148,
+    37157,
+    37178,
+    37195,
+    37204,
+    37235,
+    37254,
+    37258,
+    37265,
+    37270,
+    37275,
+    37283,
+    37289,
+    37334,
+    37336,
+    37341,
+    37365,
+    37389,
+    37396,
+    37403,
+    37440,
+    37441,
+    37445,
+    37472,
+    37485,
+    37490,
+    37509,
+    37544,
+    37595,
+    37621,
+    37640,
+    37663,
+    37673,
+    37692,
+    37698,
+    37701,
+    37713,
+    37719,
+    37770,
+    37790,
+    37792,
+    37807,
+    37833,
+    37844,
+    37846,
+    37859,
+    37880,
+    37890,
+    37913,
+    37927,
+    37945,
+    37959,
+    37960,
+    37994,
+    38036,
+    38085,
+    38088,
+    38104,
+    38177,
+    38178,
+    38192,
+    38209,
+    38210,
+    38225,
+    38249,
+    38250,
+    38291,
+    38320,
+    38330,
+    38356,
+    38382,
+    38386,
+    38408,
+    38419,
+    38427,
+    38433,
+    38436,
+    38441,
+    38445,
+    38459,
+    38464,
+    38497,
+    38502,
+    38522,
+    38523,
+    38535,
+    38547,
+    38564,
+    38577,
+    38584,
+    38592,
+    38607,
+    38608,
+    38609,
+    38615,
+    38646,
+    38657,
+    38669,
+    38679,
+    38691,
+    38705,
+    38707,
+    38732,
+    38738,
+    38739,
+    38772,
+    38776,
+    38802,
+    38812,
+    38837,
+    38842,
+    38852,
+    38866,
+    38881,
+    38903,
+    38911,
+    38915,
+    38942,
+    38958,
+    38962,
+    38966,
+    38981,
+    39002,
+    39004,
+    39012,
+    39019,
+    39024,
+    39025,
+    39047,
+    39071,
+    39098,
+    39137,
+    39144,
+    39147,
+    39175,
+    39193,
+    39205,
+    39206,
+    39226,
+    39276,
+    39278,
+    39290,
+    39315,
+    39317,
+    39321,
+    39327,
+    39341,
+    39365,
+    39366,
+    39417,
+    39444,
+    39464,
+    39467,
+    39503,
+    39514,
+    39524,
+    39528,
+    39555,
+    39567,
+    39569,
+    39573,
+    39595,
+    39635,
+    39647,
+    39648,
+    39663,
+    39678,
+    39680,
+    39692,
+    39724,
+    39729,
+    39734,
+    39749,
+    39769,
+    39774,
+    39778,
+    39781,
+    39789,
+    39790,
+    39805,
+    39809,
+    39821,
+    39831,
+    39834,
+    39839,
+    39865,
+    39866,
+    39887,
+    39902,
+    39908,
+    39928,
+    39945,
+    39952,
+    39955,
+    39969,
+    39974,
+    40005,
+    40007,
+    40023,
+    40044,
+    40046,
+    40056,
+    40057,
+    40071,
+    40087,
+    40148,
+    40201,
+    40250,
+    40254,
+    40268,
+    40304,
+    40317,
+    40318,
+    40330,
+    40337,
+    40401,
+    40419,
+    40494,
+    40539,
+    40548,
+    40556,
+    40571,
+    40583,
+    40589,
+    40610,
+    40612,
+    40617,
+    40625,
+    40629,
+    40642,
+    40650,
+    40653,
+    40654,
+    40666,
+    40691,
+    40714,
+    40722,
+    40723,
+    40725,
+    40732,
+    40742,
+    40747,
+    40764,
+    40771,
+    40804,
+    40840,
+    40845,
+    40877,
+    40901,
+    40919,
+    40938,
+    40949,
+    40964,
+    41025,
+    41037,
+    41040,
+    41075,
+    41091,
+    41117,
+    41129,
+    41153,
+    41165,
+    41169,
+    41191,
+    41196,
+    41197,
+    41233,
+    41236,
+    41237,
+    41263,
+    41289,
+    41295,
+    41320,
+    41344,
+    41364,
+    41399,
+    41401,
+    41405,
+    41422,
+    41446,
+    41453,
+    41458,
+    41479,
+    41491,
+    41516,
+    41529,
+    41546,
+    41553,
+    41573,
+    41577,
+    41578,
+    41583,
+    41596,
+    41620,
+    41636,
+    41649,
+    41655,
+    41693,
+    41694,
+    41749,
+    41785,
+    41794,
+    41838,
+    41843,
+    41853,
+    41864,
+    41892,
+    41902,
+    41909,
+    41920,
+    41922,
+    41940,
+    41942,
+    41952,
+    41958,
+    41998,
+    42012,
+    42013,
+    42015,
+    42059,
+    42071,
+    42073,
+    42078,
+    42080,
+    42083,
+    42109,
+    42132,
+    42144,
+    42154,
+    42179,
+    42181,
+    42182,
+    42194,
+    42197,
+    42236,
+    42252,
+    42273,
+    42303,
+    42307,
+    42311,
+    42342,
+    42344,
+    42363,
+    42372,
+    42384,
+    42392,
+    42398,
+    42419,
+    42450,
+    42455,
+    42476,
+    42519,
+    42521,
+    42534,
+    42574,
+    42595,
+    42604,
+    42618,
+    42625,
+    42718,
+    42732,
+    42735,
+    42788,
+    42795,
+    42833,
+    42849,
+    42871,
+    42892,
+    42902,
+    42999,
+    43002,
+    43005,
+    43036,
+    43057,
+    43060,
+    43146,
+    43153,
+    43163,
+    43175,
+    43196,
+    43202,
+    43234,
+    43270,
+    43284,
+    43305,
+    43317,
+    43333,
+    43336,
+    43341,
+    43349,
+    43355,
+    43373,
+    43385,
+    43397,
+    43410,
+    43448,
+    43459,
+    43471,
+    43472,
+    43488,
+    43493,
+    43501,
+    43502,
+    43507,
+    43547,
+    43552,
+    43559,
+    43575,
+    43606,
+    43608,
+    43614,
+    43644,
+    43664,
+    43691,
+    43716,
+    43720,
+    43722,
+    43724,
+    43738,
+    43752,
+    43759,
+    43779,
+    43839,
+    43853,
+    43872,
+    43886,
+    43926,
+    43946,
+    43952,
+    44004,
+    44014,
+    44044,
+    44048,
+    44051,
+    44054,
+    44062,
+    44075,
+    44098,
+    44099,
+    44103,
+    44104,
+    44122,
+    44132,
+    44142,
+    44147,
+    44155,
+    44163,
+    44194,
+    44212,
+    44258,
+    44292,
+    44307,
+    44311,
+    44316,
+    44342,
+    44360,
+    44364,
+    44383,
+    44386,
+    44388,
+    44401,
+    44415,
+    44416,
+    44481,
+    44490,
+    44504,
+    44568,
+    44608,
+    44611,
+    44635,
+    44651,
+    44654,
+    44660,
+    44665,
+    44680,
+    44702,
+    44706,
+    44720,
+    44723,
+    44726,
+    44732,
+    44735,
+    44746,
+    44749,
+    44752,
+    44784,
+    44811,
+    44818,
+    44824,
+    44828,
+    44832,
+    44834,
+    44841,
+    44851,
+    44862,
+    44888,
+    44891,
+    44955,
+    44957,
+    44959,
+    44965,
+    44970,
+    44973,
+    44993,
+    45014,
+    45016,
+    45023,
+    45027,
+    45029,
+    45030,
+    45033,
+    45076,
+    45100,
+    45104,
+    45119,
+    45128,
+    45130,
+    45164,
+    45169,
+    45175,
+    45219,
+    45244,
+    45313,
+    45320,
+    45335,
+    45349,
+    45375,
+    45385,
+    45407,
+    45426,
+    45432,
+    45434,
+    45438,
+    45443,
+    45456,
+    45538,
+    45575,
+    45608,
+    45609,
+    45611,
+    45626,
+    45632,
+    45638,
+    45649,
+    45675,
+    45676,
+    45705,
+    45722,
+    45748,
+    45752,
+    45806,
+    45807,
+    45811,
+    45814,
+    45830,
+    45850,
+    45858,
+    45875,
+    45881,
+    45904,
+    45924,
+    45929,
+    45961,
+    45982,
+    46006,
+    46018,
+    46021,
+    46030,
+    46046,
+    46091,
+    46151,
+    46165,
+    46186,
+    46203,
+    46215,
+    46218,
+    46225,
+    46232,
+    46251,
+    46255,
+    46259,
+    46277,
+    46282,
+    46338,
+    46341,
+    46346,
+    46353,
+    46356,
+    46373,
+    46418,
+    46426,
+    46434,
+    46442,
+    46452,
+    46500,
+    46505,
+    46519,
+    46542,
+    46548,
+    46556,
+    46577,
+    46602,
+    46605,
+    46607,
+    46644,
+    46649,
+    46656,
+    46729,
+    46739,
+    46744,
+    46750,
+    46789,
+    46796,
+    46800,
+    46801,
+    46826,
+    46827,
+    46832,
+    46851,
+    46871,
+    46873,
+    46896,
+    46903,
+    46914,
+    46964,
+    46986,
+    46998,
+    47022,
+    47065,
+    47080,
+    47101,
+    47135,
+    47144,
+    47152,
+    47184,
+    47189,
+    47207,
+    47215,
+    47218,
+    47221,
+    47254,
+    47266,
+    47300,
+    47326,
+    47336,
+    47362,
+    47369,
+    47378,
+    47379,
+    47383,
+    47415,
+    47423,
+    47446,
+    47449,
+    47453,
+    47455,
+    47456,
+    47486,
+    47489,
+    47494,
+    47503,
+    47505,
+    47527,
+    47531,
+    47552,
+    47563,
+    47603,
+    47614,
+    47616,
+    47617,
+    47620,
+    47665,
+    47683,
+    47717,
+    47726,
+    47729,
+    47760,
+    47761,
+    47785,
+    47843,
+    47857,
+    47867,
+    47893,
+    47895,
+    47911,
+    47915,
+    47936,
+    47966,
+    47972,
+    47979,
+    47989,
+    48018,
+    48030,
+    48046,
+    48059,
+    48064,
+    48076,
+    48085,
+    48108,
+    48109,
+    48115,
+    48116,
+    48139,
+    48148,
+    48174,
+    48230,
+    48242,
+    48269,
+    48271,
+    48272,
+    48281,
+    48312,
+    48320,
+    48321,
+    48330,
+    48340,
+    48364,
+    48396,
+    48405,
+    48426,
+    48443,
+    48455,
+    48458,
+    48462,
+    48503,
+    48527,
+    48531,
+    48533,
+    48566,
+    48571,
+    48591,
+    48603,
+    48622,
+    48643,
+    48644,
+    48645,
+    48651,
+    48654,
+    48656,
+    48666,
+    48668,
+    48683,
+    48690,
+    48721,
+    48724,
+    48727,
+    48746,
+    48749,
+    48754,
+    48800,
+    48814,
+    48817,
+    48840,
+    48885,
+    48898,
+    48937,
+    48949,
+    48950,
+    48953,
+    48962,
+    48967,
+    48974,
+    48997,
+    49038,
+    49048,
+    49071,
+    49088,
+    49092,
+    49100,
+    49106,
+    49116,
+    49128,
+    49129,
+    49155,
+    49166,
+    49173,
+    49174,
+    49196,
+    49217,
+    49235,
+    49237,
+    49245,
+    49248,
+    49254,
+    49270,
+    49289,
+    49323,
+    49348,
+    49408,
+    49420,
+    49422,
+    49434,
+    49475,
+    49493,
+    49542,
+    49548,
+    49555,
+    49577,
+    49589,
+    49595,
+    49602,
+    49618,
+    49622,
+    49624,
+    49669,
+    49725,
+    49747,
+    49760,
+    49777,
+    49810,
+    49854,
+    49884,
+    49897,
+    49901,
+    49917,
+    49923,
+    49943,
+    49957,
+    49962,
+    49987,
+    50048,
+    50075,
+    50102,
+    50111,
+    50119,
+    50121,
+    50134,
+    50165,
+    50177,
+    50188,
+    50236,
+    50245,
+    50294,
+    50305,
+    50316,
+    50320,
+    50347,
+    50362,
+    50432,
+    50474,
+    50496,
+    50510,
+    50514,
+    50524,
+    50534,
+    50549,
+    50593,
+    50599,
+    50600,
+    50608,
+    50640,
+    50680,
+    50713,
+    50714,
+    50718,
+    50721,
+    50724,
+    50746,
+    50758,
+    50760,
+    50803,
+    50805,
+    50807,
+    50841,
+    50869,
+    50886,
+    50919,
+    50928,
+    50938,
+    50940,
+    50948,
+    50950,
+    50963,
+    50970,
+    50978,
+    50989,
+    50994,
+    51012,
+    51027,
+    51028,
+    51030,
+    51048,
+    51067,
+    51076,
+    51088,
+    51100,
+    51113,
+    51118,
+    51124,
+    51126,
+    51128,
+    51141,
+    51200,
+    51264,
+    51275,
+    51278,
+    51308,
+    51356,
+    51363,
+    51364,
+    51377,
+    51386,
+    51387,
+    51414,
+    51418,
+    51423,
+    51427,
+    51442,
+    51461,
+    51480,
+    51494,
+    51497,
+    51506,
+    51507,
+    51519,
+    51622,
+    51632,
+    51658,
+    51666,
+    51669,
+    51672,
+    51687,
+    51746,
+    51750,
+    51752,
+    51754,
+    51787,
+    51796,
+    51814,
+    51820,
+    51837,
+    51877,
+    51902,
+    51912,
+    51921,
+    51944,
+    51946,
+    52011,
+    52027,
+    52031,
+    52039,
+    52054,
+    52072,
+    52073,
+    52080,
+    52133,
+    52154,
+    52169,
+    52187,
+    52195,
+    52240,
+    52252,
+    52262,
+    52268,
+    52285,
+    52294,
+    52324,
+    52338,
+    52347,
+    52372,
+    52386,
+    52388,
+    52397,
+    52408,
+    52411,
+    52438,
+    52444,
+    52462,
+    52463,
+    52474,
+    52481,
+    52489,
+    52506,
+    52516,
+    52527,
+    52531,
+    52574,
+    52599,
+    52607,
+    52610,
+    52613,
+    52617,
+    52620,
+    52623,
+    52629,
+    52664,
+    52684,
+    52699,
+    52707,
+    52714,
+    52720,
+    52731,
+    52792,
+    52797,
+    52798,
+    52806,
+    52818,
+    52859,
+    52887,
+    52976,
+    52981,
+    52991,
+    53005,
+    53006,
+    53025,
+    53075,
+    53099,
+    53111,
+    53126,
+    53136,
+    53196,
+    53278,
+    53279,
+    53304,
+    53321,
+    53322,
+    53372,
+    53418,
+    53430,
+    53437,
+    53440,
+    53496,
+    53497,
+    53505,
+    53556,
+    53560,
+    53589,
+    53599,
+    53600,
+    53614,
+    53632,
+    53675,
+    53722,
+    53736,
+    53740,
+    53751,
+    53780,
+    53781,
+    53808,
+    53810,
+    53815,
+    53848,
+    53895,
+    53897,
+    53916,
+    53922,
+    53923,
+    53949,
+    53973,
+    53982,
+    53989,
+    53995,
+    54027,
+    54028,
+    54050,
+    54053,
+    54060,
+    54093,
+    54108,
+    54109,
+    54128,
+    54130,
+    54131,
+    54136,
+    54145,
+    54201,
+    54203,
+    54210,
+    54229,
+    54258,
+    54268,
+    54296,
+    54323,
+    54378,
+    54394,
+    54402,
+    54403,
+    54436,
+    54443,
+    54453,
+    54457,
+    54463,
+    54470,
+    54471,
+    54489,
+    54491,
+    54492,
+    54494,
+    54496,
+    54538,
+    54597,
+    54599,
+    54616,
+    54622,
+    54630,
+    54642,
+    54646,
+    54672,
+    54680,
+    54693,
+    54697,
+    54699,
+    54702,
+    54716,
+    54728,
+    54769,
+    54784,
+    54786,
+    54837,
+    54869,
+    54878,
+    54880,
+    54907,
+    54914,
+    54924,
+    54934,
+    54941,
+    54955,
+    54978,
+    54980,
+    54995,
+    55008,
+    55030,
+    55051,
+    55059,
+    55064,
+    55065,
+    55073,
+    55079,
+    55087,
+    55098,
+    55120,
+    55122,
+    55144,
+    55157,
+    55168,
+    55178,
+    55186,
+    55187,
+    55188,
+    55212,
+    55223,
+    55224,
+    55226,
+    55237,
+    55243,
+    55266,
+    55280,
+    55283,
+    55342,
+    55350,
+    55414,
+    55421,
+    55430,
+    55435,
+    55447,
+    55458,
+    55479,
+    55483,
+    55493,
+    55504,
+    55531,
+    55546,
+    55556,
+    55561,
+    55574,
+    55589,
+    55681,
+    55685,
+    55691,
+    55698,
+    55702,
+    55705,
+    55722,
+    55729,
+    55750,
+    55764,
+    55783,
+    55809,
+    55819,
+    55823,
+    55828,
+    55838,
+    55855,
+    55890,
+    55919,
+    55924,
+    55927,
+    55957,
+    55961,
+    55962,
+    55969,
+    55979,
+    55983,
+    56002,
+    56024,
+    56040,
+    56041,
+    56045,
+    56058,
+    56062,
+    56065,
+    56073,
+    56085,
+    56114,
+    56121,
+    56132,
+    56141,
+    56144,
+    56154,
+    56162,
+    56177,
+    56247,
+    56252,
+    56268,
+    56286,
+    56291,
+    56293,
+    56337,
+    56339,
+    56351,
+    56361,
+    56391,
+    56422,
+    56504,
+    56596,
+    56606,
+    56701,
+    56703,
+    56718,
+    56761,
+    56779,
+    56782,
+    56823,
+    56831,
+    56842,
+    56870,
+    56877,
+    56882,
+    56890,
+    56912,
+    56948,
+    56962,
+    56964,
+    56980,
+    56993,
+    57018,
+    57036,
+    57048,
+    57050,
+    57068,
+    57069,
+    57073,
+    57130,
+    57133,
+    57150,
+    57154,
+    57160,
+    57178,
+    57183,
+    57213,
+    57214,
+    57223,
+    57224,
+    57249,
+    57252,
+    57307,
+    57320,
+    57332,
+    57350,
+    57351,
+    57352,
+    57362,
+    57374,
+    57384,
+    57395,
+    57425,
+    57439,
+    57475,
+    57495,
+    57524,
+    57535,
+    57545,
+    57551,
+    57564,
+    57565,
+    57570,
+    57574,
+    57576,
+    57590,
+    57599,
+    57619,
+    57709,
+    57723,
+    57737,
+    57743,
+    57758,
+    57777,
+    57786,
+    57788,
+    57798,
+    57802,
+    57807,
+    57836,
+    57862,
+    57865,
+    57867,
+    57879,
+    57887,
+    57891,
+    57911,
+    57944,
+    57956,
+    57968,
+    57979,
+    58007,
+    58018,
+    58053,
+    58062,
+    58071,
+    58081,
+    58087,
+    58092,
+    58098,
+    58128,
+    58136,
+    58157,
+    58169,
+    58172,
+    58174,
+    58177,
+    58201,
+    58208,
+    58219,
+    58230,
+    58265,
+    58290,
+    58291,
+    58299,
+    58336,
+    58337,
+    58338,
+    58359,
+    58375,
+    58384,
+    58389,
+    58406,
+    58407,
+    58410,
+    58418,
+    58423,
+    58433,
+    58445,
+    58464,
+    58501,
+    58522,
+    58557,
+    58559,
+    58590,
+    58602,
+    58606,
+    58627,
+    58629,
+    58645,
+    58658,
+    58715,
+    58724,
+    58789,
+    58831,
+    58869,
+    58872,
+    58899,
+    58902,
+    58934,
+    58935,
+    58942,
+    58949,
+    58957,
+    58958,
+    58994,
+    59000,
+    59024,
+    59026,
+    59036,
+    59043,
+    59049,
+    59053,
+    59093,
+    59101,
+    59125,
+    59139,
+    59154,
+    59204,
+    59208,
+    59209,
+    59210,
+    59216,
+    59217,
+    59312,
+    59336,
+    59360,
+    59403,
+    59404,
+    59423,
+    59454,
+    59461,
+    59475,
+    59478,
+    59479,
+    59480,
+    59482,
+    59483,
+    59484,
+    59509,
+    59544,
+    59564,
+    59581,
+    59591,
+    59603,
+    59610,
+    59611,
+    59630,
+    59649,
+    59712,
+    59757,
+    59792,
+    59841,
+    59845,
+    59865,
+    59867,
+    59876,
+    59877,
+    59911,
+    59928,
+    59979,
+    59999,
+    60016,
+    60038,
+    60083,
+    60131,
+    60156,
+    60170,
+    60171,
+    60198,
+    60207,
+    60235,
+    60240,
+    60266,
+    60288,
+    60338,
+    60344,
+    60460,
+    60488,
+    60525,
+    60543,
+    60559,
+    60581,
+    60586,
+    60596,
+    60604,
+    60627,
+    60628,
+    60671,
+    60672,
+    60674,
+    60681,
+    60704,
+    60715,
+    60740,
+    60757,
+    60765,
+    60803,
+    60808,
+    60856,
+    60864,
+    60872,
+    60873,
+    60880,
+    60895,
+    60907,
+    60971,
+    60985,
+    60994,
+    60998,
+    61002,
+    61004,
+    61021,
+    61022,
+    61032,
+    61064,
+    61081,
+    61083,
+    61118,
+    61163,
+    61166,
+    61173,
+    61200,
+    61210,
+    61211,
+    61241,
+    61250,
+    61261,
+    61265,
+    61277,
+    61281,
+    61290,
+    61301,
+    61318,
+    61320,
+    61338,
+    61340,
+    61355,
+    61381,
+    61413,
+    61414,
+    61439,
+    61513,
+    61519,
+    61525,
+    61528,
+    61556,
+    61557,
+    61563,
+    61580,
+    61581,
+    61586,
+    61617,
+    61629,
+    61633,
+    61657,
+    61680,
+    61710,
+    61765,
+    61772,
+    61780,
+    61804,
+    61827,
+    61843,
+    61861,
+    61870,
+    61910,
+    61968,
+    61969,
+    61983,
+    61993,
+    61996,
+    61997,
+    62005,
+    62021,
+    62033,
+    62036,
+    62067,
+    62099,
+    62100,
+    62108,
+    62117,
+    62123,
+    62143,
+    62191,
+    62198,
+    62217,
+    62218,
+    62256,
+    62274,
+    62296,
+    62302,
+    62307,
+    62318,
+    62319,
+    62338,
+    62344,
+    62349,
+    62382,
+    62403,
+    62416,
+    62422,
+    62447,
+    62479,
+    62480,
+    62497,
+    62512,
+    62537,
+    62540,
+    62544,
+    62555,
+    62564,
+    62591,
+    62601,
+    62610,
+    62618,
+    62631,
+    62674,
+    62697,
+    62705,
+    62720,
+    62732,
+    62740,
+    62770,
+    62789,
+    62799,
+    62817,
+    62823,
+    62863,
+    62877,
+    62879,
+    62887,
+    62896,
+    62918,
+    62965,
+    62982,
+    62986,
+    62991,
+    62994,
+    63039,
+    63040,
+    63046,
+    63106,
+    63119,
+    63156,
+    63159,
+    63173,
+    63191,
+    63194,
+    63211,
+    63219,
+    63230,
+    63242,
+    63276,
+    63295,
+    63296,
+    63324,
+    63348,
+    63351,
+    63371,
+    63372,
+    63393,
+    63400,
+    63407,
+    63453,
+    63458,
+    63477,
+    63481,
+    63485,
+    63538,
+    63545,
+    63553,
+    63554,
+    63624,
+    63632,
+    63636,
+    63660,
+    63698,
+    63720,
+    63740,
+    63741,
+    63749,
+    63761,
+    63779,
+    63802,
+    63853,
+    63861,
+    63887,
+    63926,
+    63941,
+    63963,
+    63966,
+    63985,
+    63993,
+    64044,
+    64054,
+    64058,
+    64059,
+    64071,
+    64077,
+    64092,
+    64113,
+    64114,
+    64137,
+    64139,
+    64166,
+    64175,
+    64190,
+    64212,
+    64217,
+    64247,
+    64253,
+    64266,
+    64277,
+    64322,
+    64329,
+    64359,
+    64360,
+    64388,
+    64395,
+    64399,
+    64417,
+    64419,
+    64433,
+    64434,
+    64457,
+    64497,
+    64501,
+    64518,
+    64520,
+    64540,
+    64545,
+    64568,
+    64589,
+    64600,
+    64607,
+    64627,
+    64631,
+    64632,
+    64660,
+    64772,
+    64783,
+    64805,
+    64822,
+    64827,
+    64881,
+    64884,
+    64888,
+    64893,
+    64897,
+    64976,
+    65011,
+    65016,
+    65018,
+    65019,
+    65033,
+    65037,
+    65038,
+    65052,
+    65069,
+    65079,
+    65097,
+    65131,
+    65138,
+    65159,
+    65173,
+    65185,
+    65197,
+    65225,
+    65259,
+    65260,
+    65261,
+    65264,
+    65267,
+    65287,
+    65291,
+    65297,
+    65300,
+    65305,
+    65317,
+    65360,
+    65383,
+    65387,
+    65405,
+    65416,
+    65434,
+    65440,
+    65451,
+    65456,
+    65473,
+    65490,
+    65506,
+    65519,
+    65543,
+    65553,
+    65570,
+    65579,
+    65589,
+    65611,
+    65612,
+    65622,
+    65664,
+    65667,
+    65668,
+    65672,
+    65678,
+    65683,
+    65699,
+    65722,
+    65727,
+    65737,
+    65745,
+    65759,
+    65763,
+    65766,
+    65777,
+    65787,
+    65792,
+    65810,
+    65815,
+    65830,
+    65831,
+    65846,
+    65850,
+    65861,
+    65870,
+    65871,
+    65877,
+    65887,
+    65901,
+    65912,
+    65974,
+    65980,
+    66014,
+    66092,
+    66112,
+    66118,
+    66120,
+    66153,
+    66163,
+    66203,
+    66233,
+    66261,
+    66285,
+    66297,
+    66323,
+    66371,
+    66376,
+    66377,
+    66401,
+    66405,
+    66426,
+    66436,
+    66444,
+    66474,
+    66475,
+    66498,
+    66506,
+    66519,
+    66521,
+    66535,
+    66595,
+    66600,
+    66606,
+    66627,
+    66629,
+    66635,
+    66647,
+    66656,
+    66686,
+    66689,
+    66731,
+    66778,
+    66781,
+    66783,
+    66786,
+    66816,
+    66824,
+    66834,
+    66844,
+    66853,
+    66870,
+    66882,
+    66905,
+    66910,
+    66929,
+    66941,
+    66960,
+    66995,
+    67018,
+    67022,
+    67036,
+    67054,
+    67066,
+    67073,
+    67083,
+    67097,
+    67108,
+    67150,
+    67172,
+    67183,
+    67219,
+    67225,
+    67240,
+    67293,
+    67314,
+    67334,
+    67372,
+    67392,
+    67393,
+    67411,
+    67432,
+    67436,
+    67451,
+    67455,
+    67471,
+    67476,
+    67506,
+    67564,
+    67591,
+    67615,
+    67618,
+    67625,
+    67638,
+    67650,
+    67664,
+    67680,
+    67701,
+    67708,
+    67709,
+    67750,
+    67758,
+    67773,
+    67774,
+    67796,
+    67807,
+    67811,
+    67838,
+    67848,
+    67864,
+    67886,
+    67895,
+    67914,
+    67922,
+    67924,
+    67938,
+    67940,
+    67977,
+    68006,
+    68013,
+    68046,
+    68048,
+    68081,
+    68101,
+    68132,
+    68133,
+    68165,
+    68166,
+    68169,
+    68172,
+    68174,
+    68187,
+    68191,
+    68221,
+    68241,
+    68250,
+    68258,
+    68271,
+    68294,
+    68298,
+    68300,
+    68303,
+    68327,
+    68329,
+    68346,
+    68379,
+    68405,
+    68417,
+    68420,
+    68433,
+    68434,
+    68454,
+    68464,
+    68485,
+    68492,
+    68502,
+    68547,
+    68562,
+    68590,
+    68597,
+    68601,
+    68603,
+    68612,
+    68615,
+    68638,
+    68650,
+    68653,
+    68739,
+    68746,
+    68751,
+    68752,
+    68760,
+    68762,
+    68776,
+    68780,
+    68786,
+    68804,
+    68806,
+    68827,
+    68843,
+    68888,
+    68890,
+    68901,
+    68907,
+    68908,
+    68915,
+    68916,
+    68920,
+    68941,
+    69016,
+    69033,
+    69043,
+    69068,
+    69099,
+    69127,
+    69161,
+    69163,
+    69165,
+    69169,
+    69178,
+    69192,
+    69196,
+    69210,
+    69222,
+    69227,
+    69237,
+    69248,
+    69266,
+    69272,
+    69276,
+    69366,
+    69382,
+    69383,
+    69441,
+    69457,
+    69493,
+    69494,
+    69515,
+    69540,
+    69550,
+    69567,
+    69570,
+    69597,
+    69646,
+    69687,
+    69757,
+    69761,
+    69768,
+    69782,
+    69792,
+    69826,
+    69877,
+    69903,
+    69909,
+    69911,
+    69937,
+    69938,
+    69943,
+    69950,
+    69963,
+    70031,
+    70044,
+    70053,
+    70073,
+    70079,
+    70113,
+    70119,
+    70130,
+    70143,
+    70170,
+    70177,
+    70179,
+    70180,
+    70191,
+    70193,
+    70234,
+    70235,
+    70247,
+    70249,
+    70269,
+    70285,
+    70297,
+    70305,
+    70329,
+    70339,
+    70340,
+    70342,
+    70377,
+    70457,
+    70463,
+    70467,
+    70491,
+    70542,
+    70549,
+    70557,
+    70571,
+    70576,
+    70585,
+    70597,
+    70606,
+    70634,
+    70639,
+    70641,
+    70652,
+    70674,
+    70676,
+    70685,
+    70701,
+    70731,
+    70755,
+    70759,
+    70787,
+    70818,
+    70828,
+    70844,
+    70846,
+    70851,
+    70873,
+    70876,
+    70878,
+    70881,
+    70918,
+    70931,
+    70955,
+    70956,
+    70963,
+    70975,
+    70988,
+    71013,
+    71035,
+    71064,
+    71077,
+    71100,
+    71113,
+    71119,
+    71146,
+    71153,
+    71163,
+    71167,
+    71189,
+    71192,
+    71210,
+    71220,
+    71230,
+    71239,
+    71248,
+    71265,
+    71306,
+    71362,
+    71373,
+    71379,
+    71389,
+    71412,
+    71430,
+    71443,
+    71469,
+    71471,
+    71472,
+    71473,
+    71476,
+    71481,
+    71496,
+    71515,
+    71522,
+    71524,
+    71537,
+    71544,
+    71566,
+    71568,
+    71588,
+    71610,
+    71612,
+    71630,
+    71634,
+    71646,
+    71649,
+    71650,
+    71664,
+    71698,
+    71714,
+    71721,
+    71779,
+    71812,
+    71820,
+    71847,
+    71868,
+    71880,
+    71928,
+    71930,
+    71933,
+    71943,
+    71956,
+    71970,
+    72002,
+    72009,
+    72011,
+    72016,
+    72021,
+    72025,
+    72036,
+    72080,
+    72086,
+    72103,
+    72208,
+    72219,
+    72229,
+    72253,
+    72280,
+    72333,
+    72341,
+    72344,
+    72353,
+    72383,
+    72389,
+    72430,
+    72446,
+    72458,
+    72473,
+    72476,
+    72496,
+    72497,
+    72504,
+    72507,
+    72509,
+    72534,
+    72573,
+    72589,
+    72615,
+    72636,
+    72646,
+    72648,
+    72712,
+    72727,
+    72745,
+    72764,
+    72782,
+    72785,
+    72791,
+    72793,
+    72795,
+    72796,
+    72821,
+    72825,
+    72839,
+    72855,
+    72877,
+    72882,
+    72903,
+    72905,
+    72916,
+    72929,
+    72931,
+    72946,
+    73002,
+    73024,
+    73031,
+    73066,
+    73104,
+    73131,
+    73133,
+    73140,
+    73147,
+    73162,
+    73173,
+    73196,
+    73209,
+    73242,
+    73259,
+    73269,
+    73303,
+    73330,
+    73357,
+    73363,
+    73375,
+    73392,
+    73407,
+    73416,
+    73427,
+    73443,
+    73462,
+    73472,
+    73480,
+    73510,
+    73524,
+    73530,
+    73534,
+    73554,
+    73582,
+    73592,
+    73593,
+    73611,
+    73650,
+    73663,
+    73691,
+    73699,
+    73727,
+    73744,
+    73745,
+    73748,
+    73751,
+    73798,
+    73822,
+    73834,
+    73845,
+    73863,
+    73867,
+    73900,
+    73903,
+    73925,
+    73927,
+    73952,
+    73953,
+    73963,
+    73964,
+    73971,
+    73983,
+    74029,
+    74034,
+    74071,
+    74083,
+    74092,
+    74123,
+    74128,
+    74138,
+    74165,
+    74186,
+    74196,
+    74199,
+    74203,
+    74209,
+    74228,
+    74276,
+    74282,
+    74285,
+    74312,
+    74324,
+    74326,
+    74342,
+    74361,
+    74376,
+    74384,
+    74385,
+    74446,
+    74491,
+    74494,
+    74499,
+    74525,
+    74526,
+    74545,
+    74577,
+    74588,
+    74637,
+    74678,
+    74687,
+    74699,
+    74706,
+    74727,
+    74734,
+    74740,
+    74782,
+    74792,
+    74794,
+    74824,
+    74834,
+    74843,
+    74866,
+    74869,
+    74884,
+    74897,
+    75014,
+    75026,
+    75033,
+    75042,
+    75048,
+    75053,
+    75089,
+    75097,
+    75107,
+    75132,
+    75142,
+    75145,
+    75156,
+    75191,
+    75192,
+    75218,
+    75228,
+    75241,
+    75244,
+    75246,
+    75258,
+    75276,
+    75279,
+    75282,
+    75295,
+    75346,
+    75352,
+    75360,
+    75375,
+    75387,
+    75396,
+    75453,
+    75456,
+    75467,
+    75487,
+    75499,
+    75531,
+    75558,
+    75583,
+    75598,
+    75600,
+    75616,
+    75624,
+    75640,
+    75665,
+    75671,
+    75679,
+    75693,
+    75694,
+    75699,
+    75707,
+    75719,
+    75723,
+    75743,
+    75781,
+    75800,
+    75846,
+    75862,
+    75869,
+    75883,
+    75884,
+    75890,
+    75908,
+    75910,
+    75912,
+    75947,
+    75960,
+    75962,
+    75976,
+    76014,
+    76058,
+    76080,
+    76101,
+    76122,
+    76125,
+    76172,
+    76199,
+    76206,
+    76216,
+    76222,
+    76263,
+    76270,
+    76299,
+    76300,
+    76325,
+    76327,
+    76350,
+    76352,
+    76379,
+    76423,
+    76432,
+    76440,
+    76459,
+    76496,
+    76497,
+    76540,
+    76564,
+    76590,
+    76591,
+    76606,
+    76627,
+    76643,
+    76659,
+    76669,
+    76670,
+    76675,
+    76689,
+    76690,
+    76694,
+    76723,
+    76737,
+    76741,
+    76768,
+    76777,
+    76788,
+    76840,
+    76847,
+    76850,
+    76852,
+    76860,
+    76878,
+    76888,
+    76895,
+    76910,
+    76918,
+    76919,
+    76941,
+    76952,
+    76977,
+    76986,
+    77007,
+    77009,
+    77047,
+    77051,
+    77057,
+    77127,
+    77130,
+    77136,
+    77137,
+    77156,
+    77168,
+    77204,
+    77219,
+    77305,
+    77311,
+    77314,
+    77344,
+    77358,
+    77376,
+    77383,
+    77390,
+    77407,
+    77410,
+    77417,
+    77423,
+    77434,
+    77479,
+    77496,
+    77503,
+    77515,
+    77535,
+    77544,
+    77561,
+    77565,
+    77596,
+    77620,
+    77623,
+    77684,
+    77706,
+    77721,
+    77754,
+    77767,
+    77787,
+    77804,
+    77828,
+    77833,
+    77862,
+    77863,
+    77872,
+    77908,
+    77927,
+    77933,
+    77943,
+    77955,
+    77962,
+    77978,
+    77980,
+    77993,
+    78003,
+    78009,
+    78016,
+    78039,
+    78042,
+    78082,
+    78105,
+    78108,
+    78110,
+    78126,
+    78135,
+    78137,
+    78182,
+    78239,
+    78241,
+    78254,
+    78264,
+    78286,
+    78300,
+    78314,
+    78336,
+    78337,
+    78346,
+    78355,
+    78378,
+    78384,
+    78401,
+    78402,
+    78435,
+    78458,
+    78515,
+    78552,
+    78602,
+    78608,
+    78637,
+    78642,
+    78657,
+    78664,
+    78672,
+    78757,
+    78772,
+    78774,
+    78777,
+    78778,
+    78837,
+    78838,
+    78878,
+    78900,
+    78903,
+    78910,
+    78916,
+    78921,
+    78923,
+    78928,
+    78929,
+    78938,
+    78955,
+    78971,
+    78988,
+    79000,
+    79018,
+    79076,
+    79083,
+    79089,
+    79091,
+    79098,
+    79133,
+    79141,
+    79142,
+    79146,
+    79189,
+    79201,
+    79203,
+    79207,
+    79208,
+    79218,
+    79226,
+    79245,
+    79255,
+    79268,
+    79279,
+    79295,
+    79302,
+    79304,
+    79318,
+    79319,
+    79321,
+    79322,
+    79325,
+    79326,
+    79342,
+    79352,
+    79364,
+    79371,
+    79375,
+    79389,
+    79390,
+    79394,
+    79403,
+    79413,
+    79427,
+    79448,
+    79453,
+    79480,
+    79483,
+    79515,
+    79531,
+    79535,
+    79553,
+    79558,
+    79567,
+    79583,
+    79590,
+    79597,
+    79621,
+    79625,
+    79654,
+    79714,
+    79739,
+    79775,
+    79787,
+    79795,
+    79846,
+    79858,
+    79865,
+    79868,
+    79878,
+    79879,
+    79883,
+    79888,
+    79895,
+    79910,
+    79920,
+    79931,
+    79939,
+    79949,
+    79965,
+    79993,
+    80007,
+    80078,
+    80080,
+    80090,
+    80142,
+    80154,
+    80156,
+    80169,
+    80171,
+    80178,
+    80181,
+    80212,
+    80220,
+    80244,
+    80281,
+    80306,
+    80316,
+    80330,
+    80345,
+    80357,
+    80361,
+    80368,
+    80391,
+    80394,
+    80403,
+    80410,
+    80434,
+    80436,
+    80445,
+    80455,
+    80488,
+    80505,
+    80540,
+    80550,
+    80592,
+    80608,
+    80613,
+    80634,
+    80640,
+    80644,
+    80653,
+    80687,
+    80698,
+    80702,
+    80719,
+    80741,
+    80749,
+    80803,
+    80818,
+    80821,
+    80823,
+    80874,
+    80961,
+    80984,
+    81011,
+    81012,
+    81014,
+    81036,
+    81042,
+    81048,
+    81057,
+    81065,
+    81081,
+    81091,
+    81131,
+    81141,
+    81142,
+    81161,
+    81174,
+    81243,
+    81246,
+    81250,
+    81254,
+    81281,
+    81285,
+    81349,
+    81392,
+    81394,
+    81436,
+    81439,
+    81452,
+    81483,
+    81499,
+    81511,
+    81554,
+    81556,
+    81572,
+    81580,
+    81599,
+    81608,
+    81609,
+    81618,
+    81645,
+    81654,
+    81661,
+    81662,
+    81723,
+    81740,
+    81767,
+    81779,
+    81802,
+    81840,
+    81866,
+    81868,
+    81892,
+    81942,
+    81949,
+    81956,
+    81962,
+    81970,
+    82003,
+    82006,
+    82024,
+    82034,
+    82042,
+    82049,
+    82074,
+    82120,
+    82172,
+    82206,
+    82208,
+    82216,
+    82266,
+    82273,
+    82294,
+    82361,
+    82368,
+    82379,
+    82382,
+    82432,
+    82446,
+    82466,
+    82475,
+    82484,
+    82501,
+    82506,
+    82511,
+    82530,
+    82596,
+    82598,
+    82606,
+    82630,
+    82669,
+    82671,
+    82672,
+    82693,
+    82706,
+    82712,
+    82776,
+    82777,
+    82794,
+    82798,
+    82815,
+    82819,
+    82822,
+    82848,
+    82866,
+    82868,
+    82888,
+    82893,
+    82912,
+    82921,
+    82923,
+    82934,
+    82961,
+    82971,
+    82979,
+    82994,
+    82997,
+    83002,
+    83006,
+    83007,
+    83050,
+    83051,
+    83058,
+    83065,
+    83073,
+    83097,
+    83107,
+    83137,
+    83164,
+    83191,
+    83197,
+    83207,
+    83222,
+    83226,
+    83237,
+    83244,
+    83271,
+    83280,
+    83312,
+    83320,
+    83327,
+    83332,
+    83347,
+    83359,
+    83383,
+    83394,
+    83434,
+    83447,
+    83457,
+    83460,
+    83485,
+    83503,
+    83515,
+    83519,
+    83522,
+    83532,
+    83554,
+    83555,
+    83570,
+    83591,
+    83626,
+    83643,
+    83663,
+    83670,
+    83671,
+    83710,
+    83711,
+    83724,
+    83725,
+    83738,
+    83748,
+    83775,
+    83790,
+    83809,
+    83844,
+    83849,
+    83852,
+    83853,
+    83877,
+    83886,
+    83894,
+    83900,
+    83913,
+    83917,
+    83921,
+    83933,
+    83941,
+    83950,
+    83970,
+    84008,
+    84019,
+    84025,
+    84044,
+    84086,
+    84109,
+    84119,
+    84133,
+    84141,
+    84164,
+    84200,
+    84201,
+    84212,
+    84215,
+    84238,
+    84271,
+    84274,
+    84325,
+    84330,
+    84333,
+    84345,
+    84356,
+    84413,
+    84462,
+    84472,
+    84496,
+    84500,
+    84517,
+    84539,
+    84554,
+    84558,
+    84559,
+    84562,
+    84563,
+    84565,
+    84567,
+    84572,
+    84576,
+    84580,
+    84588,
+    84595,
+    84613,
+    84631,
+    84636,
+    84645,
+    84685,
+    84693,
+    84702,
+    84705,
+    84716,
+    84738,
+    84741,
+    84852,
+    84863,
+    84875,
+    84877,
+    84898,
+    84929,
+    84940,
+    84944,
+    84960,
+    84999,
+    85004,
+    85054,
+    85103,
+    85120,
+    85161,
+    85166,
+    85167,
+    85177,
+    85180,
+    85203,
+    85243,
+    85268,
+    85277,
+    85288,
+    85312,
+    85318,
+    85321,
+    85334,
+    85342,
+    85368,
+    85392,
+    85398,
+    85407,
+    85413,
+    85422,
+    85434,
+    85447,
+    85465,
+    85490,
+    85494,
+    85500,
+    85503,
+    85540,
+    85545,
+    85568,
+    85595,
+    85602,
+    85604,
+    85617,
+    85629,
+    85677,
+    85714,
+    85718,
+    85764,
+    85785,
+    85794,
+    85795,
+    85846,
+    85847,
+    85863,
+    85867,
+    85886,
+    85892,
+    85912,
+    85925,
+    85931,
+    85990,
+    85991,
+    85994,
+    86021,
+    86025,
+    86062,
+    86076,
+    86085,
+    86089,
+    86114,
+    86146,
+    86154,
+    86197,
+    86214,
+    86215,
+    86232,
+    86234,
+    86237,
+    86256,
+    86265,
+    86303,
+    86316,
+    86321,
+    86344,
+    86378,
+    86427,
+    86435,
+    86436,
+    86459,
+    86465,
+    86474,
+    86503,
+    86558,
+    86620,
+    86637,
+    86643,
+    86661,
+    86709,
+    86724,
+    86726,
+    86729,
+    86770,
+    86779,
+    86812,
+    86827,
+    86828,
+    86843,
+    86853,
+    86856,
+    86881,
+    86922,
+    86967,
+    87008,
+    87036,
+    87037,
+    87059,
+    87079,
+    87094,
+    87109,
+    87110,
+    87114,
+    87120,
+    87136,
+    87141,
+    87248,
+    87260,
+    87282,
+    87299,
+    87346,
+    87358,
+    87411,
+    87414,
+    87433,
+    87441,
+    87442,
+    87489,
+    87553,
+    87565,
+    87567,
+    87569,
+    87573,
+    87574,
+    87586,
+    87597,
+    87619,
+    87628,
+    87647,
+    87653,
+    87676,
+    87685,
+    87712,
+    87736,
+    87741,
+    87754,
+    87771,
+    87775,
+    87793,
+    87861,
+    87874,
+    87894,
+    87903,
+    87942,
+    87951,
+    87959,
+    87964,
+    88002,
+    88033,
+    88090,
+    88099,
+    88102,
+    88117,
+    88120,
+    88125,
+    88141,
+    88143,
+    88176,
+    88252,
+    88281,
+    88349,
+    88363,
+    88380,
+    88385,
+    88433,
+    88463,
+    88480,
+    88490,
+    88512,
+    88523,
+    88529,
+    88541,
+    88573,
+    88622,
+    88657,
+    88691,
+    88726,
+    88744,
+    88763,
+    88774,
+    88783,
+    88804,
+    88830,
+    88853,
+    88866,
+    88889,
+    88901,
+    88903,
+    88928,
+    88940,
+    88945,
+    88946,
+    88988,
+    88998,
+    89004,
+    89036,
+    89039,
+    89049,
+    89061,
+    89063,
+    89071,
+    89083,
+    89092,
+    89093,
+    89095,
+    89101,
+    89138,
+    89148,
+    89157,
+    89160,
+    89166,
+    89179,
+    89212,
+    89213,
+    89238,
+    89253,
+    89262,
+    89291,
+    89297,
+    89318,
+    89329,
+    89363,
+    89382,
+    89383,
+    89384,
+    89385,
+    89389,
+    89401,
+    89411,
+    89419,
+    89427,
+    89434,
+    89443,
+    89459,
+    89460,
+    89467,
+    89478,
+    89486,
+    89488,
+    89494,
+    89516,
+    89526,
+    89528,
+    89543,
+    89598,
+    89650,
+    89683,
+    89684,
+    89694,
+    89720,
+    89727,
+    89744,
+    89746,
+    89764,
+    89788,
+    89835,
+    89855,
+    89856,
+    89869,
+    89882,
+    89903,
+    89921,
+    89929,
+    89950,
+    89958,
+    89966,
+    89998,
+    90001,
+    90094,
+    90096,
+    90118,
+    90147,
+    90148,
+    90155,
+    90177,
+    90178,
+    90192,
+    90204,
+    90208,
+    90218,
+    90220,
+    90222,
+    90230,
+    90237,
+    90241,
+    90265,
+    90276,
+    90277,
+    90278,
+    90306,
+    90308,
+    90314,
+    90332,
+    90335,
+    90358,
+    90380,
+    90387,
+    90395,
+    90408,
+    90410,
+    90415,
+    90430,
+    90440,
+    90475,
+    90476,
+    90496,
+    90515,
+    90522,
+    90546,
+    90557,
+    90621,
+    90639,
+    90643,
+    90651,
+    90675,
+    90677,
+    90679,
+    90688,
+    90698,
+    90711,
+    90717,
+    90730,
+    90734,
+    90759,
+    90768,
+    90798,
+    90827,
+    90832,
+    90839,
+    90894,
+    90906,
+    90911,
+    90959,
+    90983,
+    91008,
+    91018,
+    91035,
+    91048,
+    91050,
+    91088,
+    91093,
+    91099,
+    91117,
+    91182,
+    91196,
+    91205,
+    91211,
+    91223,
+    91249,
+    91276,
+    91278,
+    91283,
+    91287,
+    91301,
+    91347,
+    91349,
+    91354,
+    91360,
+    91383,
+    91389,
+    91399,
+    91403,
+    91413,
+    91479,
+    91514,
+    91515,
+    91517,
+    91528,
+    91545,
+    91573,
+    91581,
+    91586,
+    91595,
+    91630,
+    91637,
+    91644,
+    91697,
+    91699,
+    91737,
+    91752,
+    91756,
+    91771,
+    91779,
+    91812,
+    91827,
+    91841,
+    91867,
+    91868,
+    91888,
+    91889,
+    91898,
+    91904,
+    91921,
+    91927,
+    91935,
+    91940,
+    91944,
+    91958,
+    91976,
+    92000,
+    92010,
+    92047,
+    92056,
+    92081,
+    92084,
+    92099,
+    92102,
+    92103,
+    92108,
+    92111,
+    92120,
+    92142,
+    92149,
+    92163,
+    92165,
+    92171,
+    92173,
+    92181,
+    92187,
+    92190,
+    92213,
+    92222,
+    92228,
+    92243,
+    92277,
+    92327,
+    92346,
+    92349,
+    92378,
+    92379,
+    92456,
+    92464,
+    92465,
+    92515,
+    92562,
+    92590,
+    92650,
+    92710,
+    92723,
+    92737,
+    92753,
+    92763,
+    92782,
+    92783,
+    92833,
+    92855,
+    92880,
+    92893,
+    92897,
+    92910,
+    92978,
+    92986,
+    93004,
+    93008,
+    93029,
+    93045,
+    93047,
+    93075,
+    93083,
+    93170,
+    93175,
+    93178,
+    93188,
+    93191,
+    93196,
+    93219,
+    93227,
+    93241,
+    93244,
+    93255,
+    93271,
+    93273,
+    93296,
+    93299,
+    93314,
+    93316,
+    93317,
+    93333,
+    93342,
+    93349,
+    93389,
+    93397,
+    93437,
+    93448,
+    93490,
+    93509,
+    93538,
+    93544,
+    93549,
+    93558,
+    93596,
+    93601,
+    93611,
+    93614,
+    93618,
+    93627,
+    93633,
+    93652,
+    93670,
+    93682,
+    93718,
+    93728,
+    93744,
+    93746,
+    93754,
+    93826,
+    93844,
+    93873,
+    93892,
+    93896,
+    93897,
+    93902,
+    93905,
+    93911,
+    93920,
+    93922,
+    93939,
+    93941,
+    93977,
+    93986,
+    94010,
+    94023,
+    94028,
+    94034,
+    94037,
+    94052,
+    94081,
+    94125,
+    94127,
+    94128,
+    94136,
+    94141,
+    94169,
+    94208,
+    94227,
+    94239,
+    94244,
+    94305,
+    94326,
+    94334,
+    94335,
+    94345,
+    94367,
+    94374,
+    94391,
+    94432,
+    94439,
+    94448,
+    94456,
+    94464,
+    94491,
+    94502,
+    94511,
+    94521,
+    94538,
+    94601,
+    94614,
+    94620,
+    94679,
+    94691,
+    94702,
+    94714,
+    94722,
+    94730,
+    94745,
+    94753,
+    94767,
+    94773,
+    94786,
+    94792,
+    94799,
+    94807,
+    94821,
+    94825,
+    94883,
+    94884,
+    94886,
+    94888,
+    94892,
+    94925,
+    94931,
+    94945,
+    94947,
+    94951,
+    94964,
+    94976,
+    94985,
+    95024,
+    95032,
+    95036,
+    95053,
+    95063,
+    95069,
+    95071,
+    95087,
+    95105,
+    95106,
+    95113,
+    95114,
+    95140,
+    95159,
+    95173,
+    95211,
+    95248,
+    95255,
+    95267,
+    95270,
+    95274,
+    95278,
+    95288,
+    95299,
+    95323,
+    95349,
+    95350,
+    95357,
+    95363,
+    95368,
+    95377,
+    95423,
+    95426,
+    95438,
+    95482,
+    95492,
+    95519,
+    95522,
+    95525,
+    95527,
+    95528,
+    95546,
+    95586,
+    95605,
+    95620,
+    95642,
+    95654,
+    95670,
+    95674,
+    95685,
+    95687,
+    95707,
+    95741,
+    95753,
+    95754,
+    95804,
+    95810,
+    95815,
+    95843,
+    95849,
+    95851,
+    95884,
+    95892,
+    95901,
+    95906,
+    95916,
+    95970,
+    95974,
+    95989,
+    96017,
+    96021,
+    96025,
+    96033,
+    96050,
+    96055,
+    96060,
+    96065,
+    96069,
+    96075,
+    96090,
+    96116,
+    96117,
+    96121,
+    96133,
+    96138,
+    96155,
+    96200,
+    96279,
+    96332,
+    96335,
+    96351,
+    96360,
+    96366,
+    96390,
+    96426,
+    96461,
+    96466,
+    96494,
+    96501,
+    96503,
+    96530,
+    96561,
+    96580,
+    96581,
+    96595,
+    96604,
+    96683,
+    96705,
+    96711,
+    96721,
+    96731,
+    96756,
+    96759,
+    96771,
+    96773,
+    96786,
+    96806,
+    96808,
+    96859,
+    96866,
+    96887,
+    96899,
+    96911,
+    96912,
+    96938,
+    96950,
+    96996,
+    97000,
+    97015,
+    97061,
+    97064,
+    97085,
+    97086,
+    97095,
+    97112,
+    97149,
+    97172,
+    97207,
+    97216,
+    97233,
+    97236,
+    97238,
+    97256,
+    97259,
+    97309,
+    97314,
+    97332,
+    97342,
+    97347,
+    97348,
+    97403,
+    97433,
+    97469,
+    97470,
+    97478,
+    97494,
+    97514,
+    97529,
+    97539,
+    97556,
+    97568,
+    97591,
+    97605,
+    97606,
+    97619,
+    97644,
+    97676,
+    97739,
+    97741,
+    97833,
+    97856,
+    97887,
+    97918,
+    97923,
+    97937,
+    97942,
+    97946,
+    97962,
+    97999,
+    98008,
+    98029,
+    98045,
+    98048,
+    98079,
+    98102,
+    98140,
+    98155,
+    98164,
+    98174,
+    98187,
+    98203,
+    98207,
+    98208,
+    98220,
+    98249,
+    98251,
+    98274,
+    98276,
+    98282,
+    98291,
+    98302,
+    98313,
+    98319,
+    98320,
+    98329,
+    98330,
+    98347,
+    98364,
+    98366,
+    98372,
+    98379,
+    98385,
+    98398,
+    98401,
+    98406,
+    98422,
+    98434,
+    98436,
+    98443,
+    98517,
+    98524,
+    98527,
+    98610,
+    98642,
+    98655,
+    98665,
+    98670,
+    98685,
+    98688,
+    98708,
+    98722,
+    98729,
+    98734,
+    98737,
+    98747,
+    98759,
+    98769,
+    98788,
+    98821,
+    98837,
+    98853,
+    98873,
+    98880,
+    98891,
+    98893,
+    98899,
+    98903,
+    98906,
+    98926,
+    98952,
+    98964,
+    98965,
+    98973,
+    98976,
+    98986,
+    98997,
+    99007,
+    99010,
+    99012,
+    99057,
+    99058,
+    99076,
+    99082,
+    99095,
+    99102,
+    99141,
+    99144,
+    99151,
+    99156,
+    99157,
+    99158,
+    99159,
+    99160,
+    99161,
+    99162,
+    99163,
+    99166,
+    99167,
+    99168,
+    99169,
+    99170,
+    99171,
+    99173,
+    99174,
+    99175,
+    99176,
+    99177,
+    99179,
+    99181,
+    99183,
+    99184,
+    99187,
+    99188,
+    99189,
+    99192,
+    99196,
+    99197,
+    99198,
+    99201,
+    99202,
+    99203,
+    99206,
+    99207,
+    99211,
+    99214,
+    99215,
+    99220,
+    99221,
+    99224,
+    99228,
+    99229,
+    99230,
+    99231,
+    99238,
+    99239,
+    99240,
+    99247,
+    99248,
+    99249,
+    99254,
+    99255,
+    99256,
+    99264,
+    99265,
+    99266,
+    99267,
+    99268,
+    99269,
+    99274,
+    99275,
+    99276,
+    99281,
+    99282,
+    99289,
+    99290,
+    99291,
+    99300,
+    99301,
+    99302,
+    99303,
+    99311,
+    99323,
+    99324,
+    99325,
+    99341,
+    99342,
+    99343,
+    99344,
+    99359,
+    99374,
+    99380,
+    99381,
+    99382,
+    99401,
+    99402,
+    99423,
+    99439,
+    99442,
+    99443,
+    99444,
+    99456,
+    99484,
+    99485,
+    99516,
+    99551,
+    99552,
+    99597,
+    99598,
+    99643,
+    99648,
+    99649,
+    99714,
+    99715,
+    99777,
+    99785,
+    99874,
+    99875,
+    99977,
+    99997,
+    100024,
+    100120,
+    100121,
+    100127,
+    100128,
+    100129,
+    100130,
+    100337,
+    100459,
+    100563,
+    100620,
+    100621,
+    100988,
+    101018,
+    101022,
+    101024,
+    101025,
+    101026,
+    101027,
+    101028,
+    101029,
+    101030,
+    101031,
+    101032,
+    101033,
+    101759,
+    101838,
+    101851,
+    101860,
+    101861,
+    101862,
+    101863,
+    101864,
+    101865,
+    101866,
+    101867,
+    101868,
+    101869,
+    101870,
+    101871,
+    101872,
+    101873,
+    101874,
+    101875,
+    101876,
+    101877,
+    101878,
+    101879,
+    101880,
+    101881,
+    103893,
+    103894,
+    103895,
+    103896,
+    103897,
+    103898,
+    103899,
+    103900,
+    103901,
+    103902,
+    103903,
+    103904,
+    103905,
+    103906,
+    103907,
+    103908,
+    103909,
+    103910,
+    103911,
+    103912,
+    103913,
+    103914,
+    103915,
+    103916,
+    103917,
+    104905,
+    105619,
+    109241,
+    109992,
+    110713,
+    111747,
+    112951,
+    113943,
+    114235,
+    114397,
+    115646,
+    117035,
+    119158,
+    119346,
+    119347,
+    119348,
+    119941,
+    119964,
+    120100,
+    120409,
+    121404,
+    121667,
+    121773,
+    122154,
+    122174,
+    122202,
+    122219,
+    122222,
+    122259,
+    122267,
+    122289,
+    122317,
+    122375,
+    122382,
+    122427,
+    122448,
+    122455,
+    122514,
+    122568,
+    122596,
+    122614,
+    122618,
+    122634,
+    122659,
+    122725,
+    122740,
+    122757,
+    122802,
+    122843,
+    122855,
+    122866,
+    122890,
+    122948,
+    122950,
+    122976,
+    122985,
+    122999,
+    123007,
+    123008,
+    123016,
+    123057,
+    123064,
+    123067,
+    123072,
+    123084,
+    123091,
+    123131,
+    123164,
+    123178,
+    123182,
+    123204,
+    123206,
+    123287,
+    123301,
+    123305,
+    123318,
+    123332,
+    123378,
+    123400,
+    123408,
+    123420,
+    123489,
+    123516,
+    123527,
+    123547,
+    123614,
+    123658,
+    123676,
+    123701,
+    123714,
+    123740,
+    123747,
+    123760,
+    123806,
+    123807,
+    123808,
+    123810,
+    123811,
+    123812,
+    123813,
+    123814,
+    123815,
+    123816,
+    123817,
+    123819,
+    123821,
+    123825,
+    123827,
+    123830,
+    123836,
+    123837,
+    123839,
+    123840,
+    123841,
+    123842,
+    123844,
+    123846,
+    123847,
+    123848,
+    123851,
+    123853,
+    123854,
+    123859,
+    123867,
+    123870,
+    123871,
+    123875,
+    123876,
+    123882,
+    123889,
+    123892,
+    123893,
+    123907,
+    123908,
+    123911,
+    123912,
+    123916,
+    123917,
+    123918,
+    123927,
+    123928,
+    123929,
+    123930,
+    123931,
+    123932,
+    123933,
+    123934,
+    123946,
+    123947,
+    123950,
+    123951,
+    123953,
+    123954,
+    123955,
+    123956,
+    123957,
+    123965,
+    123967,
+    123968,
+    123969,
+    123970,
+    123986,
+    123989,
+    123996,
+    124000,
+    124001,
+    124002,
+    124003,
+    124004,
+    124005,
+    124018,
+    124019,
+    124020,
+    124021,
+    124022,
+    124023,
+    124024,
+    124025,
+    124026,
+    124049,
+    124050,
+    124051,
+    124052,
+    124053,
+    124054,
+    124063,
+    124064,
+    124066,
+    124067,
+    124068,
+    124069,
+    124070,
+    124071,
+    124092,
+    124096,
+    124098,
+    124099,
+    124100,
+    124101,
+    124102,
+    124118,
+    124134,
+    124135,
+    124136,
+    124137,
+    124149,
+    124154,
+    124159,
+    124160,
+    124161,
+    124162,
+    124163,
+    124164,
+    124165,
+    124166,
+    124167,
+    124168,
+    124193,
+    124194,
+    124195,
+    124196,
+    124197,
+    124198,
+    124199,
+    124200,
+    124201,
+    124222,
+    124235,
+    124236,
+    124240,
+    124241,
+    124243,
+    124244,
+    124245,
+    124246,
+    124247,
+    124248,
+    124249,
+    124250,
+    124251,
+    124252,
+    124287,
+    124296,
+    124297,
+    124298,
+    124299,
+    124300,
+    124301,
+    124302,
+    124303,
+    124304,
+    124305,
+    124306,
+    124307,
+    124308,
+    124309,
+    124334,
+    124349,
+    124364,
+    124365,
+    124366,
+    124367,
+    124368,
+    124369,
+    124370,
+    124371,
+    124372,
+    124373,
+    124374,
+    124433,
+    124459,
+    124470,
+    124471,
+    124472,
+    124473,
+    124474,
+    124475,
+    124570,
+    124577,
+    124578,
+    124579,
+    124580,
+    124581,
+    124582,
+    124583,
+    124584,
+    124585,
+    124586,
+    124587,
+    124588,
+    124589,
+    124590,
+    124591,
+    124592,
+    124593,
+    124594,
+    124595,
+    124596,
+    124693,
+    124727,
+    124744,
+    124745,
+    124746,
+    124747,
+    124748,
+    124749,
+    124750,
+    124751,
+    124752,
+    124753,
+    124754,
+    124755,
+    124995,
+    124996,
+    124997,
+    124998,
+    124999,
+    125000,
+    125001,
+    125002,
+    125003,
+    125004,
+    125388,
+    125413,
+    125414,
+    125415,
+    125416,
+    125417,
+    125418,
+    125419,
+    125420,
+    125421,
+    125422,
+    125423,
+    125424,
+    125425,
+    125426,
+    125427,
+    125428,
+    125429,
+    125430,
+    125639,
+    125713,
+    126159,
+    126177,
+    126178,
+    126179,
+    126180,
+    126181,
+    126182,
+    126183,
+    126184,
+    126185,
+    126186,
+    126187,
+    126188,
+    126189,
+    126190,
+    126191,
+    126192,
+    126193,
+    126365,
+    126537,
+    127041,
+    127124,
+    127165,
+    127369,
+    127572,
+    127708,
+    127819,
+    127964,
+    128222,
+    128223,
+    128224,
+    128225,
+    128226,
+    128227,
+    128228,
+    128229,
+    128230,
+    128231,
+    128232,
+    128233,
+    128234,
+    128235,
+    128236,
+    128237,
+    128238,
+    128239,
+    128240,
+    128241,
+    128242,
+    129176,
+    130507,
+    130654,
+    132202,
+    132376,
+    133552,
+    134312,
+    134380,
+    134582,
+    135078,
+    136279,
+    136516,
+    136661,
+    136940,
+    136946,
+    137345,
+    137661,
+    137767,
+    137973,
+    138658,
+    138673,
+    139793,
+    139816,
+    140356,
+    141539,
+    142067,
+    142209,
+    142210,
+    142258,
+    142274,
+    142299,
+    142451,
+    142509,
+    142731,
+    142834,
+    143241,
+    143455,
+    143861,
+    143887,
+    145773,
+    146047,
+    146632,
+    148462,
+    148774,
+    148827,
+    148860,
+    148864,
+    148880,
+    149175,
+    149178,
+    149287,
+    149352,
+    149392,
+    149394,
+    149589,
+    149591,
+    149593,
+    149714,
+    149716,
+    149718,
+    149721,
+    149946,
+    149961,
+    149963,
+    149990,
+    150114,
+    150151,
+    150168,
+    150195,
+    150223,
+    150270,
+    150479,
+    150579,
+    150792,
+    150794,
+    150800,
+    150802,
+    150807,
+    150809,
+    150814,
+    150817,
+    150819,
+    150821,
+    150824,
+    150829,
+    150836,
+    151233,
+    151254,
+    151264,
+    151266,
+    151268,
+    151270,
+    151272,
+    151274,
+    151276,
+    151278,
+    151282,
+    151366,
+    151560
+  ],
+  "bos_token": "<|im_start|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "MiniCPMOProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": "<unk>",
+  "use_fast": true
+}

utils.py ADDED Viewed

	@@ -0,0 +1,2417 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2026 The OpenBMB Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from dataclasses import dataclass
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Literal
+from typing import Optional
+from typing import Tuple
+from typing import Union
+import torch
+import torch.nn.functional as F
+import torch.nn.utils.parametrize as P
+from transformers.cache_utils import DynamicCache
+logger = logging.getLogger(__name__)
+# text
+@dataclass
+class GenerateChunkOutput:
+    """Result of one ``ChunkPrefillChunkGenerate.chunk_generate`` call."""
+    # Ids of the tokens generated in this chunk, concatenated along dim 1; shape (1, 0) when the
+    # very first prediction was a terminator.
+    chunk_token_ids: torch.Tensor
+    # The chunk's input embeddings with the embedding of every generated token appended along dim 1.
+    current_inputs_embeds: torch.Tensor
+    # Last-layer hidden states of the prefill step of the first generate chunk, None otherwise.
+    input_last_hidden_states: Optional[torch.Tensor]  # for tts use_speaker_embedding
+    # Last-layer hidden states of the remaining decoding steps, concatenated along dim 1.
+    last_hidden_states: Optional[torch.Tensor]  # for tts input feature (projector_semantic)
+    # NOTE(review): annotated as a tensor, but chunk_generate stores the model's
+    # ``outputs.past_key_values`` here - presumably a cache object; confirm against callers.
+    past_key_values: Optional[torch.Tensor]
+    # True when a terminator token was predicted during this chunk.
+    finished: bool
+class ChunkPrefillChunkGenerate:
+    def __init__(self, model, tokenizer, terminators):
+        self.tokenizer = tokenizer
+        self.model = model
+        self.terminators = terminators
+        self.terminators_ids = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+        self.embedding_layer = self.model.get_input_embeddings()
+        self.forbidden_tokens = [
+            ":",
+            "：",
+            "；",
+            "#",
+            "“",
+            "”",
+            "‘",
+            "’",
+            "@",
+            "*",
+            "【",
+            "】",
+            "「",
+            "」",
+            "(",
+            ")",
+            "（",
+            "）",
+            "[",
+            "]",
+            "&",
+            "/",
+            "$",
+        ]
+        self.forbidden_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in self.forbidden_tokens]
+        bad_token_ids = getattr(tokenizer, "bad_token_ids", [])
+        if bad_token_ids:
+            self.forbidden_token_ids.extend(bad_token_ids)
+    @staticmethod
+    def prepare_generation_config(do_sample, max_new_tokens=50, min_new_tokens=0, **kwargs):
+        num_beams = kwargs.get("num_beams", 3)
+        generation_config = {
+            "num_beams": num_beams,
+            "top_p": 0.8,
+            "top_k": 100,
+            "temperature": 0.7,
+            "do_sample": True,
+            "repetition_penalty": 1.05,
+        }
+        if do_sample:
+            generation_config.update(
+                {
+                    "top_p": 0.8,
+                    "top_k": 100,
+                    "temperature": 0.7,
+                    "do_sample": True,
+                    "repetition_penalty": 1.05,
+                }
+            )
+        elif num_beams > 1:
+            generation_config.update({"num_beams": num_beams, "repetition_penalty": 1.2, "do_sample": False})
+        else:
+            generation_config.update({"do_sample": False, "repetition_penalty": 1.05})
+        generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys())
+        generation_config["min_new_tokens"] = min_new_tokens
+        generation_config["max_new_tokens"] = max_new_tokens
+        return generation_config
+    def chunk_generate(
+        self,
+        inputs_embeds: torch.Tensor,
+        past_key_values,
+        is_first_generate_chunk: bool,
+        chunk_size: int,
+        return_hidden_states: bool,
+        do_sample: bool,
+        temperature: float,
+        top_p: float,
+        top_k: int,
+        repetition_penalty: float = 1.05,
+        length_penalty: float = 1.0,
+        all_input_ids: Optional[torch.Tensor] = None,
+    ) -> GenerateChunkOutput:
+        """
+        Args:
+            inputs_embeds: [1, seq_len, hidden_dim], Input embeddings of current chunk.
+            past_key_values: [num_layers, 2, batch_size, num_heads, seq_len, head_dim], Past key values for llm.
+            is_first_generate_chunk: bool, Whether this is the first generate chunk.
+            chunk_size: int, The size of the current chunk, default is 10, and it is fixed during training.
+            return_hidden_states: bool Whether to return the hidden states, default is True.
+            do_sample: bool Whether to sample from the model, default is True.
+            temperature: float The temperature for the model, default is 0.7.
+            top_p: float The top-p for the model, default is 0.8.
+            top_k: int The top-k for the model, default is 100.
+            repetition_penalty: float, The repetition penalty for the model, default is 1.05.
+            length_penalty: float, The length penalty for the model, default is 1.0. Higher value means more detailed generation.
+            all_input_ids: Optional[torch.Tensor], The input ids for the current chunk.
+        """
+        finished = False
+        current_inputs_embeds = inputs_embeds.clone()
+        input_last_hidden_states = []
+        last_hidden_states = []
+        generated_tokens = []
+        for token_idx in range(chunk_size):
+            if is_first_generate_chunk and token_idx == 0:
+                # first generate chunk, prefill inputs_embeds
+                model_inputs = {
+                    "inputs_embeds": current_inputs_embeds,
+                    "past_key_values": past_key_values,
+                    "use_cache": True,
+                    "output_hidden_states": return_hidden_states,
+                }
+            else:  # for all other cases: prefill the latest generated token
+                model_inputs = {
+                    "inputs_embeds": current_inputs_embeds[:, -1:, :],
+                    "past_key_values": past_key_values,
+                    "use_cache": True,
+                    "output_hidden_states": return_hidden_states,
+                }
+            with torch.no_grad():
+                outputs = self.model(**model_inputs)
+            # last token's logits
+            logits = outputs.logits[:, -1, :].to(copy=True, dtype=torch.float32, device=inputs_embeds.device)
+            # forbid specific tokens decoding = model.generate@suppress_tokens
+            if self.forbidden_token_ids:
+                logits[:, self.forbidden_token_ids] = float("-inf")
+            past_key_values = outputs.past_key_values
+            PENALTY_WINDOW_SIZE = 128
+            # apply repetition penalty
+            if repetition_penalty != 1.0:
+                # get token ids for repetition penalty
+                if all_input_ids is not None:
+                    # use global input ids (including original input and generated part)
+                    if len(generated_tokens) > 0:
+                        generated_token_ids = torch.cat(generated_tokens, dim=1)
+                        current_sequence = torch.cat(
+                            [
+                                all_input_ids[:, -PENALTY_WINDOW_SIZE:],
+                                generated_token_ids,
+                            ],
+                            dim=1,
+                        )
+                    else:
+                        current_sequence = all_input_ids[:, -PENALTY_WINDOW_SIZE:]
+                    unique_token_ids = torch.unique(current_sequence.squeeze(0))
+                elif len(generated_tokens) > 0:
+                    # revert to original logic: only use generated tokens
+                    generated_token_ids = torch.cat(generated_tokens, dim=1).squeeze(0)
+                    unique_token_ids = torch.unique(generated_token_ids)
+                else:
+                    unique_token_ids = torch.tensor([], dtype=torch.long, device=logits.device)
+                # apply repetition penalty
+                for token_id in unique_token_ids:
+                    if logits[0, token_id] > 0:
+                        logits[0, token_id] = logits[0, token_id] / repetition_penalty
+                    else:
+                        logits[0, token_id] = logits[0, token_id] * repetition_penalty
+            # apply length penalty, higher value means more detailed generation
+            if length_penalty != 1.0:
+                for eos_token_id in self.terminators_ids:
+                    if logits[0, eos_token_id] > 0:
+                        logits[0, eos_token_id] = logits[0, eos_token_id] / length_penalty
+                    else:
+                        logits[0, eos_token_id] = logits[0, eos_token_id] * length_penalty
+            # apply temperature
+            if temperature != 1.0:
+                logits = logits / temperature
+            if do_sample:
+                # Top-k filtering
+                if top_k > 0:
+                    top_k_logits, top_k_indices = torch.topk(logits, min(top_k, logits.size(-1)))
+                    logits_filtered = torch.full_like(logits, float("-inf"))
+                    logits_filtered.scatter_(1, top_k_indices, top_k_logits)
+                    logits = logits_filtered
+                # Top-p filtering
+                if top_p < 1.0:
+                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                    # remove tokens with cumulative probability greater than top_p
+                    sorted_indices_to_remove = cumulative_probs > top_p
+                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                    sorted_indices_to_remove[..., 0] = 0
+                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                    logits[indices_to_remove] = float("-inf")
+                # sampling
+                probs = F.softmax(logits, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                next_token = torch.argmax(logits, dim=-1, keepdim=True)
+            if return_hidden_states:
+                if is_first_generate_chunk and token_idx == 0:
+                    input_last_hidden_states.append(outputs.hidden_states[-1])
+                else:
+                    last_hidden_states.append(outputs.hidden_states[-1])
+            # if terminator token, stop generating
+            if next_token.item() in self.terminators_ids:
+                finished = True
+                break
+            generated_tokens.append(next_token)
+            # convert new token to embeddings and concatenate
+            next_token_embed = self.embedding_layer(next_token)
+            # update inputs_embeds, add one
+            current_inputs_embeds = torch.cat([current_inputs_embeds, next_token_embed], dim=1)
+        if len(generated_tokens) > 0:
+            chunk_token_ids = torch.cat(generated_tokens, dim=1)
+        else:
+            # special case: if last chunk and first predict is eos token, return last token of previous chunk. return a tensor with shape (1, 0)
+            if finished:
+                chunk_token_ids = torch.zeros((1, 0), dtype=torch.long, device=current_inputs_embeds.device)
+            else:
+                raise Exception("this should not happen")
+        if len(last_hidden_states) > 0:
+            last_hidden_states = torch.cat(last_hidden_states, dim=1)
+        else:
+            # special case: if last chunk, return last token of previous chunk.
+            if finished:
+                last_hidden_states = torch.cat(last_hidden_states, dim=1)
+            else:
+                raise Exception("this should not happen")
+        if len(input_last_hidden_states) > 0:
+            input_last_hidden_states = torch.cat(input_last_hidden_states, dim=1)
+        else:
+            input_last_hidden_states = None
+        return GenerateChunkOutput(
+            chunk_token_ids=chunk_token_ids,
+            current_inputs_embeds=current_inputs_embeds,
+            input_last_hidden_states=input_last_hidden_states,
+            last_hidden_states=last_hidden_states,
+            past_key_values=past_key_values,
+            finished=finished,
+        )
+def streaming_token_decoder(token_iterator, tokenizer, skip_special_tokens=False):
+    """
+    Incrementally decode tokens from an iterator, handling partial multi-byte characters.
+    When streaming tokens, multi-byte characters (like Chinese) may be split across multiple
+    tokens. Decoding partial tokens results in replacement characters (U+FFFD). This function
+    buffers tokens and only yields complete characters.
+    Args:
+        token_iterator: An iterator yielding (token_ids, is_finished) tuples.
+                       token_ids can be torch.Tensor or any iterable of integers.
+        tokenizer: The tokenizer to use for decoding.
+        skip_special_tokens: Whether to skip special tokens during decoding.
+    Yields:
+        (decoded_text, is_finished) tuples where decoded_text is the new text since last yield.
+    """
+    accumulated_token_ids = []
+    yielded_text_len = 0
+    for token_ids, is_finished in token_iterator:
+        # Accumulate token IDs
+        if torch.is_tensor(token_ids):
+            accumulated_token_ids.extend(token_ids.reshape(-1).tolist())
+        else:
+            accumulated_token_ids.extend(list(token_ids) if hasattr(token_ids, "__iter__") else [token_ids])
+        # Decode all accumulated tokens
+        full_decoded = tokenizer.decode(accumulated_token_ids, skip_special_tokens=skip_special_tokens)
+        if is_finished:
+            # Final chunk - yield all remaining text
+            new_text = full_decoded[yielded_text_len:]
+            yield new_text, is_finished
+        else:
+            # Find safe prefix without incomplete multi-byte characters
+            # The replacement character '�' (U+FFFD) indicates incomplete decoding
+            new_text = full_decoded[yielded_text_len:]
+            # Hold back text ending with replacement character (incomplete UTF-8 sequence)
+            safe_end = len(new_text)
+            while safe_end > 0 and new_text[safe_end - 1] == "\ufffd":
+                safe_end -= 1
+            safe_text = new_text[:safe_end] if safe_end > 0 else ""
+            yielded_text_len += len(safe_text)
+            yield safe_text, is_finished
+def torch_clone_recursive(obj):
+    """Recursively clone nested containers of torch.Tensors.
+    Supported container types: dict, list, tuple. Non-container non-Tensor
+    objects are returned as-is.
+    """
+    if torch.is_tensor(obj):
+        return obj.clone()
+    elif isinstance(obj, dict):
+        return {k: torch_clone_recursive(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [torch_clone_recursive(v) for v in obj]
+    elif isinstance(obj, tuple):
+        return tuple(torch_clone_recursive(v) for v in obj)
+    else:
+        raise ValueError(f"Unsupported type: {type(obj)}")
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """Rotate half the hidden dims of the input for RoPE."""
+    dim = x.shape[-1]
+    x1 = x[..., : dim // 2]
+    x2 = x[..., dim // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+@dataclass
+class SpeculativeSnapshot:
+    """Speculative snapshot for VAD speculative rollback.
+    Used in VAD speculative execution: creates a snapshot after streaming_prefill
+    and before streaming_generate. If speculation fails (user continues speaking),
+    the state can be restored to continue streaming_prefill.
+    Implementation:
+    - LLM KV Cache: only record length, restore by truncation (zero extra VRAM)
+    - Audio KV Cache: requires cloning, as generate sets it to None
+    - Mel processor: save full state snapshot (including buffer)
+    """
+    # KV Cache length (for truncation recovery)
+    llm_cache_length: int
+    audio_cache_length: int
+    # session state
+    new_user_msg: bool
+    llm_generated: bool
+    llm_generate_completed: bool
+    # Round management
+    next_round_id: int
+    pending_round_id: Optional[int]
+    omni_chunk_history_length: int
+    # TTS state (requires cloning, but usually small)
+    tts_last_turn_tokens: Optional[torch.Tensor]
+    # Streaming processor state
+    audio_chunk_idx: int
+    # Mel processor state snapshot (including buffer)
+    mel_processor_snapshot: Optional[dict] = None
+    # Audio encoder KV cache (requires cloning to ensure determinism after recovery)
+    audio_past_key_values: Optional[tuple] = None
+    # timestamp (for debugging)
+    timestamp: float = 0.0
+    # debug field: for verifying correctness of recovery
+    llm_cache_checksum: Optional[float] = None  # LLM KV Cache first layer K sum
+    audio_cache_checksum: Optional[float] = None  # Audio KV Cache first layer K sum
+    mel_buffer_checksum: Optional[float] = None  # Mel buffer sum
+    # RNG state (key: for ensuring determinism of dithering etc. after recovery)
+    rng_state_cpu: Optional[torch.Tensor] = None  # torch CPU RNG state
+    rng_state_cuda: Optional[torch.Tensor] = None  # torch CUDA RNG state (if on GPU)
+    def summary(self) -> str:
+        mel_buf_len = 0
+        if self.mel_processor_snapshot:
+            buf = self.mel_processor_snapshot.get("buffer")
+            if buf is not None:
+                mel_buf_len = len(buf)
+        return (
+            f"llm_cache={self.llm_cache_length}, "
+            f"audio_cache={self.audio_cache_length}, "
+            f"audio_chunk_idx={self.audio_chunk_idx}, "
+            f"mel_buffer={mel_buf_len}, "
+            f"history_len={self.omni_chunk_history_length}, "
+            f"new_user_msg={self.new_user_msg}, "
+            f"llm_generated={self.llm_generated}"
+        )
+# TTS: sampling parameters and the streaming audio-token generator
+@dataclass
+class TTSSamplingParams:
+    # Bundle of default sampling hyperparameters for TTS audio-token generation.
+    # NOTE(review): this block does not show where the values are consumed; the per-field
+    # notes below are read off the field names and should be confirmed against the caller.
+    top_p: float = 0.85  # presumably the nucleus (top-p) probability mass cutoff
+    min_p: float = 0.01  # presumably a minimum-probability filter threshold
+    top_k: int = 25  # presumably the number of highest-probability tokens kept
+    repetition_penalty: float = 1.05  # presumably the penalty applied to already generated tokens
+    temperature: float = 0.8  # presumably the logits temperature
+    win_size: int = 16  # meaning not visible here (a window size) - TODO confirm
+    tau_r: float = 0.1  # meaning not visible here - TODO confirm
+class TTSStreamingGenerator:
+    """
+    Streaming generator for TTS that processes chunks and yields audio tokens in real-time.
+    Supported attention types:
+    - full_attention: Full attention, all tokens can attend to each other
+    - sliding_window: Sliding window attention, KV cache is truncated to fixed size (token_window_size)
+    - sliding_recompute: Sliding recompute, only keep previous chunk and recompute with current chunk
+    - reindex: Keep first chunk as sink, reindex sliding window positions via RoPE rotation
+    """
+    def __init__(
+        self,
+        model,
+        temperature: float,
+        eos_token: Union[int, torch.Tensor],
+        chunk_size: int = 25,  # s3tokenizer 1s = 25token
+        tts_last_turn_tokens: torch.Tensor = None,
+        logits_processors=None,
+        logits_warpers=None,
+    ):
+        self.tts = model
+        self.device = model.device
+        self.temperature = torch.tensor([temperature], dtype=torch.float, device=self.device)
+        self.eos_token = (
+            torch.tensor(eos_token, device=self.device) if isinstance(eos_token, int) else eos_token.to(self.device)
+        )
+        self.num_vq = model.num_vq
+        self.num_audio_tokens = model.num_audio_tokens
+        self.recomputed_chunks = model.recomputed_chunks
+        self.emb_code = model.emb_code
+        self.head_code = model.head_code
+        # Attention type and window sizes
+        self.attention_type = model.attention_type  # "full_attention", "sliding_window", "sliding_recompute", "reindex"
+        self.chunk_window_size = model.chunk_window_size  # chunk-level window for sliding_recompute (default 2)
+        self.token_window_size = model.token_window_size  # token-level window for sliding_window/reindex (default 300)
+        # RoPE config (for reindex mode)
+        self.rope_theta = model.model.config.rope_theta
+        self.head_dim = model.model.config.hidden_size // model.model.config.num_attention_heads
+        # Logits processors
+        self.logits_processors = logits_processors if logits_processors is not None else []
+        # Logits warpers (like TopP/TopK), separate from processors
+        self.logits_warpers = logits_warpers if logits_warpers is not None else []
+        # initialize state
+        self.past_key_values = None
+        self.text_start_pos = 0
+        self.idx = -1  # start from -1, become 0 when first called
+        self.all_conditions = []
+        self.all_generated_tokens = []
+        self.tts_last_turn_tokens = tts_last_turn_tokens
+        self.spk_emb = None
+        audio_bos = [self.tts.audio_bos_token_id]
+        audio_bos = torch.Tensor(audio_bos).to(self.tts.emb_text.weight.device, dtype=torch.long)
+        self.audio_bos_embeds = self.tts.emb_text(audio_bos).unsqueeze(0)
+        self.text_eos_embed = self.tts.emb_text(
+            torch.tensor(
+                [self.tts.config.text_eos_token_id],
+                device=self.tts.emb_text.weight.device,
+                dtype=torch.long,
+            )
+        ).unsqueeze(0)
+        # buffer related, used to fill up chunk_size and yield to outside
+        self.chunk_size = chunk_size
+        self._token_buffer: List[torch.Tensor] = []
+        # Chunk info tracking for sliding_recompute and reindex
+        self._chunk_info: List[dict] = []
+        self._total_seq_len = 0
+        # Reindex mode: track sink (first chunk) length
+        self._sink_kv_len = 0
+    def _build_recompute_inputs(self, current_condition: torch.Tensor) -> torch.Tensor:
+        """Build recompute inputs for sliding_recompute mode."""
+        if len(self._chunk_info) == 0:
+            return current_condition
+        prev_chunk = self._chunk_info[-1]
+        prev_condition = prev_chunk["condition"]
+        prev_audio_tokens = prev_chunk["audio_tokens"]
+        recompute_list = [prev_condition]
+        if len(prev_audio_tokens) > 0:
+            prev_audio_embeds = torch.cat([self.emb_code[0](tok) for tok in prev_audio_tokens], dim=1)
+            recompute_list.append(prev_audio_embeds)
+        recompute_list.append(current_condition)
+        return torch.cat(recompute_list, dim=1)
+    def _truncate_kv_cache_sliding_window(self):
+        """Truncate KV cache for sliding_window mode."""
+        if self.past_key_values is None:
+            return
+        if hasattr(self.past_key_values, "get_seq_length"):
+            current_kv_len = self.past_key_values.get_seq_length()
+        else:
+            current_kv_len = self.past_key_values[0][0].shape[2]
+        if current_kv_len <= self.token_window_size:
+            return
+        new_cache = DynamicCache()
+        num_layers = (
+            len(self.past_key_values.key_cache)
+            if hasattr(self.past_key_values, "key_cache")
+            else len(self.past_key_values)
+        )
+        for layer_idx in range(num_layers):
+            if hasattr(self.past_key_values, "key_cache"):
+                key = self.past_key_values.key_cache[layer_idx][:, :, -self.token_window_size :, :]
+                value = self.past_key_values.value_cache[layer_idx][:, :, -self.token_window_size :, :]
+            else:
+                key = self.past_key_values[layer_idx][0][:, :, -self.token_window_size :, :]
+                value = self.past_key_values[layer_idx][1][:, :, -self.token_window_size :, :]
+            new_cache.update(key, value, layer_idx)
+        self.past_key_values = new_cache
+    @staticmethod
+    def _apply_rope_rotation(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+        """Apply RoPE rotation to tensor."""
+        return x * cos + rotate_half(x) * sin
+    def _compute_rope_cos_sin(self, positions: torch.Tensor, device: torch.device, dtype: torch.dtype):
+        """Compute RoPE cos and sin for given positions."""
+        dim_half = self.head_dim // 2
+        freq_seq = torch.arange(0, dim_half, dtype=torch.float32, device=device)
+        inv_freq = 1.0 / (self.rope_theta ** (freq_seq / dim_half))
+        # positions: [seq_len]
+        angles = positions.float().unsqueeze(-1) * inv_freq.unsqueeze(0)  # [seq_len, dim_half]
+        angles = torch.cat([angles, angles], dim=-1)  # [seq_len, head_dim]
+        cos = angles.cos().to(dtype)
+        sin = angles.sin().to(dtype)
+        return cos, sin
+    def _reindex_kv_cache(self):
+        """
+        Reindex KV cache for reindex mode:
+        1. Keep first chunk as attention sink
+        2. Keep last chunk
+        3. Discard middle chunks
+        4. Reindex the last chunk's key positions to be right after sink via RoPE rotation
+        """
+        if self.past_key_values is None or len(self._chunk_info) < 2:
+            return
+        # Get current KV cache length
+        if hasattr(self.past_key_values, "get_seq_length"):
+            current_kv_len = self.past_key_values.get_seq_length()
+        else:
+            current_kv_len = self.past_key_values[0][0].shape[2]
+        # Calculate sink length (first chunk)
+        sink_len = self._chunk_info[0]["condition_len"] + self._chunk_info[0]["audio_token_count"]
+        # Last chunk length
+        last_chunk = self._chunk_info[-1]
+        last_chunk_len = last_chunk["condition_len"] + last_chunk["audio_token_count"]
+        keep_len = sink_len + last_chunk_len
+        # Get device and dtype
+        device = self.past_key_values.key_cache[0].device
+        dtype = self.past_key_values.key_cache[0].dtype
+        if current_kv_len <= keep_len:
+            last_chunk_kv_len = current_kv_len - sink_len
+            if last_chunk_kv_len <= 0:
+                return
+            self.text_start_pos = current_kv_len
+            return
+        # Step 1: Truncate KV cache - keep sink and last chunk
+        new_cache = DynamicCache()
+        num_layers = len(self.past_key_values.key_cache)
+        original_start_pos = current_kv_len - last_chunk_len
+        new_start_pos = sink_len
+        delta = new_start_pos - original_start_pos  # This is a scalar constant
+        delta_positions = torch.full((last_chunk_len,), delta, dtype=torch.float32, device=device)
+        # Compute rotation cos/sin
+        cos, sin = self._compute_rope_cos_sin(delta_positions, device, dtype)
+        cos = cos.unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len, head_dim]
+        sin = sin.unsqueeze(0).unsqueeze(0)
+        for layer_idx in range(num_layers):
+            key_full = self.past_key_values.key_cache[layer_idx]
+            value_full = self.past_key_values.value_cache[layer_idx]
+            # Extract sink and last chunk
+            key_sink = key_full[:, :, :sink_len, :]
+            value_sink = value_full[:, :, :sink_len, :]
+            key_last = key_full[:, :, -last_chunk_len:, :]
+            value_last = value_full[:, :, -last_chunk_len:, :]
+            # Apply RoPE rotation to reindex key positions
+            key_last_reindexed = self._apply_rope_rotation(key_last, cos, sin)
+            # Concatenate sink and reindexed last chunk
+            key = torch.cat([key_sink, key_last_reindexed], dim=2)
+            value = torch.cat([value_sink, value_last], dim=2)
+            new_cache.update(key, value, layer_idx)
+        self.past_key_values = new_cache
+        # Update text_start_pos to reflect new positions
+        self.text_start_pos = sink_len + last_chunk_len
+    @torch.inference_mode()
+    def generate_with_buffer(
+        self,
+        condition: torch.Tensor,
+        text_finished: bool = False,
+        max_new_token: int = 500,
+    ):
+        """Generate audio tokens for one condition chunk, yielding them in fixed-size batches.
+        The condition embedding chunk gets the Audio BOS embedding appended (preceded by the
+        Text EOS embedding when ``text_finished`` is True) and is used as the prefill of this
+        call. Audio tokens are then sampled one at a time and accumulated in
+        ``self._token_buffer``; a batch is yielded whenever the buffer holds ``chunk_size``
+        tokens. Tokens still in the buffer when the call ends stay there for the next call,
+        unless ``text_finished`` is True, in which case the remainder is flushed.
+        Args:
+            condition: Condition embeddings for this chunk; concatenated with the special
+                token embeddings along dim=1.
+            text_finished: True when this is the last text chunk.
+            max_new_token: Upper bound on the number of sampling steps in this call.
+        Yields:
+            (tokens, is_last) tuples. Regular batches are ``[1, chunk_size]`` with
+            is_last=False. The final flush (is_last=True, only when ``text_finished``) holds
+            the remaining tokens, or is an empty ``[1, 0]`` tensor when nothing is buffered.
+        """
+        self.idx += 1
+        self.device = self.tts.device
+        # if text finished, first concatenate Text EOS
+        if text_finished:
+            condition = torch.cat([condition, self.text_eos_embed], dim=1)
+        # always concatenate Audio BOS
+        condition = torch.cat([condition, self.audio_bos_embeds], dim=1).to(self.device)
+        self.all_conditions.append(condition)
+        # Initialize current chunk info
+        current_chunk_info = {
+            "condition_len": condition.shape[1],
+            "audio_token_count": 0,
+            "condition": condition.clone(),
+            "audio_tokens": [],
+        }
+        # Handle different attention types
+        if self.attention_type == "sliding_recompute" and self.idx >= 1:
+            # sliding_recompute: discard KV cache, recompute with previous + current chunk
+            self.past_key_values = None
+            current_condition = self._build_recompute_inputs(condition)
+            self.text_start_pos = 0
+        elif self.attention_type == "reindex" and self.idx >= 1:
+            # reindex: truncate KV cache keeping sink + last chunk, reindex positions via RoPE
+            self._reindex_kv_cache()
+            current_condition = condition
+            # Always update text_start_pos based on actual KV cache length (like reference code)
+            if self.past_key_values is not None:
+                if hasattr(self.past_key_values, "get_seq_length"):
+                    kv_len = self.past_key_values.get_seq_length()
+                else:
+                    kv_len = self.past_key_values[0][0].shape[2]
+                self.text_start_pos = kv_len
+        else:
+            current_condition = condition
+        # prefill_len is the length actually fed at step 0 (it includes the recomputed
+        # previous chunk in sliding_recompute mode)
+        condition_length = current_condition.shape[1]
+        prefill_len = condition_length
+        finished = torch.zeros(1, dtype=torch.bool, device=self.device)
+        chunk_generated_tokens = []
+        for t in range(max_new_token):
+            if t == 0:
+                # step 0: prefill with the whole condition
+                inputs_embeds = current_condition
+                pos_ids = torch.arange(
+                    self.text_start_pos,
+                    self.text_start_pos + condition_length,
+                    dtype=torch.long,
+                    device=self.device,
+                ).unsqueeze(0)
+            else:
+                # later steps: feed back the most recently generated audio token
+                last = self.all_generated_tokens[-1]
+                # last: [1,1], directly as code id
+                inputs_embeds = self.emb_code[0](last)
+                pos_ids = torch.tensor(
+                    [self.text_start_pos + prefill_len + t - 1],
+                    dtype=torch.long,
+                    device=self.device,
+                ).unsqueeze(0)
+            outputs = self.tts.model(
+                position_ids=pos_ids,
+                past_key_values=self.past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+            )
+            hidden_states = outputs.last_hidden_state
+            # Handle KV cache based on attention type
+            if self.attention_type == "sliding_window":
+                self.past_key_values = outputs.past_key_values
+                self._truncate_kv_cache_sliding_window()
+            else:
+                self.past_key_values = outputs.past_key_values
+            # Project hidden states to per-codebook logits: [batch, seq, num_audio_tokens, num_vq]
+            with P.cached():
+                logits = torch.empty(
+                    hidden_states.size(0),
+                    hidden_states.size(1),
+                    self.num_audio_tokens,
+                    self.num_vq,
+                    dtype=torch.float,
+                    device=self.device,
+                )
+                for num_vq_iter in range(self.num_vq):
+                    x: torch.Tensor = self.head_code[num_vq_iter](hidden_states)
+                    logits[..., num_vq_iter] = x
+                    del x
+            del hidden_states
+            # Keep the last position only and flatten to [(batch * num_vq), num_audio_tokens]
+            logits = logits[:, -1].float()
+            logits = logits.permute(0, 2, 1)
+            logits = logits.reshape(-1, logits.size(2))
+            logits /= self.temperature
+            # Processors/warpers are skipped only for the very first token of the session,
+            # when there is no token history to pass them
+            audio_bos = len(self.all_generated_tokens) == 0 and t == 0
+            if not audio_bos:
+                # pass every token generated so far (across calls) to processor/warper (align with modeling_minicpmo)
+                all_generated_tokens = torch.cat(self.all_generated_tokens, dim=1).to(self.device)  # [1, T]
+                for processor in self.logits_processors:
+                    logits = processor(all_generated_tokens, logits)
+                for warper in self.logits_warpers:
+                    logits = warper(all_generated_tokens, logits)
+                del all_generated_tokens
+            # sample next token (only use first codebook, same as generate)
+            scores = F.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(scores, num_samples=1)  # [(B*num_vq), 1]
+            next_id = idx_next.view(-1, self.num_vq)[:, 0:1]  # only take first codebook → [B, 1]
+            del scores
+            if next_id.eq(
+                self.eos_token
+            ).any():  # generated audio eos token, means this chunk is finished, no longer generate new tokens
+                finished[:] = True
+            else:  # the eos token is never buffered: it carries no audio
+                # convert next_id to correct shape [1, 1], no num_vq dimension
+                if next_id.dim() == 0:  # if scalar
+                    next_tok = next_id.unsqueeze(0).unsqueeze(0)  # [1, 1]
+                elif next_id.dim() == 1:  # if 1D [1]
+                    next_tok = next_id.unsqueeze(0)  # [1, 1]
+                else:
+                    next_tok = next_id
+                self.all_generated_tokens.append(next_tok)
+                chunk_generated_tokens.append(next_tok)
+                # Update chunk info for sliding_recompute
+                current_chunk_info["audio_tokens"].append(next_tok.clone())
+                current_chunk_info["audio_token_count"] += 1
+                self._token_buffer.append(next_tok)
+            if len(self._token_buffer) == 0:
+                # case 1: if last text chunk, yield an empty [1, 0] tensor flagged as last
+                if text_finished:
+                    yield torch.empty(1, 0, dtype=torch.long, device=self.device), True
+                    break
+                # case 2: if not last text chunk, break directly
+                else:
+                    break
+            else:  # buffer has something
+                # case 1: if buffer is larger/equal to chunk_size, yield out
+                if len(self._token_buffer) >= self.chunk_size:
+                    batch = torch.cat(self._token_buffer[: self.chunk_size], dim=1)  # [1, chunk_size]
+                    yield batch, False  # → [1, chunk_size]
+                    # discard yielded part
+                    self._token_buffer = self._token_buffer[self.chunk_size :]
+                # case 2: if buffer is smaller than chunk_size
+                else:
+                    # if generation finished, and is the last text chunk, yield all remaining tokens, then break
+                    if finished.all():
+                        if text_finished:
+                            batch = torch.cat(self._token_buffer, dim=1)  # [1, remaining], fewer than chunk_size
+                            yield batch, True  # final flush
+                            self._token_buffer = []
+                            break
+                        else:
+                            # not the last text chunk, need to wait for next text chunk to fill up buffer, then this call ends
+                            break
+                    else:  # generation of this audio chunk is not finished, continue generating
+                        continue
+        # Save current chunk info for sliding_recompute and reindex
+        self._chunk_info.append(current_chunk_info)
+        self._total_seq_len += condition.shape[1] + len(chunk_generated_tokens)
+        # Update text_start_pos based on attention type
+        if self.attention_type == "sliding_recompute":
+            # sliding_recompute: will be reset at next chunk start, update normally here
+            self.text_start_pos += prefill_len + len(chunk_generated_tokens)
+        elif self.attention_type == "reindex":
+            # reindex: position based on actual KV cache length (positions have been reindexed to be continuous)
+            if self.past_key_values is not None:
+                if hasattr(self.past_key_values, "get_seq_length"):
+                    self.text_start_pos = self.past_key_values.get_seq_length()
+                else:
+                    self.text_start_pos = self.past_key_values[0][0].shape[2]
+            else:
+                self.text_start_pos += condition.shape[1] + len(chunk_generated_tokens)
+        else:
+            self.text_start_pos += condition.shape[1] + len(chunk_generated_tokens)
+        # note: remaining tokens in buffer will be kept, and accumulated next time
+# Sliding-window configuration and KV-cache helpers
+@dataclass
+class StreamingWindowConfig:
+    # Token thresholds for the streaming text window.
+    # NOTE(review): the consumer is not visible in this block; by analogy with the
+    # basic_window_* fields of DuplexWindowConfig these look like a high/low watermark pair
+    # (slide when the high value is exceeded, shrink back to the low value) - confirm.
+    text_window_high_tokens: int = 8000
+    text_window_low_tokens: int = 6000
+@dataclass
+class DuplexWindowConfig:
+    """Sliding-window configuration for duplex mode.
+    Sliding window modes:
+    - "off": sliding window disabled
+    - "basic": basic sliding window, triggered by cache length
+    - "context": sliding window with context, triggered by the number of units; generated
+      text is preserved as "previous" context
+    """
+    # Which sliding-window mode is active
+    sliding_window_mode: str = "off"  # "off" / "basic" / "context"
+    # Parameters of the "basic" mode
+    basic_window_high_tokens: int = 8000  # high watermark: the window slides once this is exceeded
+    basic_window_low_tokens: int = 6000  # low watermark: the cache is reduced to this size after sliding
+    # Parameters of the "context" mode
+    context_previous_max_tokens: int = 500  # maximum number of tokens kept as "previous" context
+    context_max_units: int = 24  # maximum number of units; the window slides once this is exceeded
+    # Verification mode (for comparison tests)
+    verify_mode: bool = False  # whether verification logging is enabled
+def as_dynamic_cache(past_key_values):
+    """Convert legacy tuple cache to DynamicCache if needed."""
+    if isinstance(past_key_values, DynamicCache):
+        return past_key_values
+    if isinstance(past_key_values, tuple):
+        return DynamicCache.from_legacy_cache(past_key_values)
+    return past_key_values
+def get_kv_cache_length(cache) -> int:
+    """Get the sequence length of a KV cache.
+    Args:
+        cache: DynamicCache or tuple-based cache
+    Returns:
+        The number of tokens in the cache
+    """
+    if cache is None:
+        return 0
+    if isinstance(cache, DynamicCache):
+        if not cache.key_cache or not cache.key_cache[0].numel():
+            return 0
+        return cache.key_cache[0].shape[-2]
+    if isinstance(cache, tuple):
+        return cache[0][0].shape[2]
+    return 0
+def get_rotary_cos_sin(
+    head_dim: int,
+    positions: torch.Tensor,
+    device: torch.device,
+    dtype: torch.dtype,
+    rope_theta: float = 10000.0,
+    inv_freq_cache: Optional[Dict[Tuple, torch.Tensor]] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Compute RoPE cos and sin components for given positions.
+    Args:
+        head_dim: Dimension of each attention head
+        positions: Position indices tensor
+        device: Target device
+        dtype: Target dtype
+        rope_theta: RoPE base frequency (default 10000.0)
+        inv_freq_cache: Optional cache dict for inverse frequencies
+    Returns:
+        Tuple of (cos, sin) tensors with shape [1, 1, seq_len, head_dim]
+    """
+    cache_key = (head_dim, device)
+    inv_freq = inv_freq_cache.get(cache_key) if inv_freq_cache is not None else None
+    if inv_freq is None or inv_freq.device != device or inv_freq.shape[0] != head_dim // 2:
+        exponent = torch.arange(0, head_dim, 2, device=device, dtype=torch.float32) / head_dim
+        inv_freq = 1.0 / (rope_theta**exponent)
+        if inv_freq_cache is not None:
+            inv_freq_cache[cache_key] = inv_freq
+    positions = positions.to(device=device, dtype=torch.float32)
+    angles = torch.einsum("i,j->ij", positions, inv_freq)
+    cos = torch.cos(angles)
+    sin = torch.sin(angles)
+    # Use cat instead of repeat_interleave, consistent with model's original RotaryEmbedding
+    # Original: emb = torch.cat((freqs, freqs), dim=-1) -> [f0, f1, ..., f_{d/2}, f0, f1, ..., f_{d/2}]
+    cos_full = torch.cat([cos, cos], dim=-1).to(dtype=dtype)
+    sin_full = torch.cat([sin, sin], dim=-1).to(dtype=dtype)
+    cos_full = cos_full.unsqueeze(0).unsqueeze(0)
+    sin_full = sin_full.unsqueeze(0).unsqueeze(0)
+    return cos_full, sin_full
+def realign_rotary_suffix(
+    suffix_keys: torch.Tensor,
+    old_positions: torch.Tensor,
+    new_positions: torch.Tensor,
+    rope_theta: float = 10000.0,
+    inv_freq_cache: Optional[Dict[Tuple, torch.Tensor]] = None,
+) -> torch.Tensor:
+    """Realign RoPE position encoding after cache eviction.
+    When tokens are dropped from the middle of a cache, the suffix tokens
+    need their RoPE embeddings recalculated with new position indices.
+    Args:
+        suffix_keys: Key tensor to realign, shape [batch, heads, seq_len, head_dim]
+        old_positions: Original position indices
+        new_positions: New position indices after eviction
+        rope_theta: RoPE base frequency
+        inv_freq_cache: Optional cache dict for inverse frequencies
+    Returns:
+        Realigned key tensor with same shape as input
+    """
+    if suffix_keys.numel() == 0:
+        return suffix_keys
+    head_dim = suffix_keys.shape[-1]
+    device = suffix_keys.device
+    dtype = suffix_keys.dtype
+    # Compute old position cos/sin
+    cos_old, sin_old = get_rotary_cos_sin(head_dim, old_positions, device, dtype, rope_theta, inv_freq_cache)
+    # Inverse transform: recover original key
+    base = cos_old * suffix_keys - sin_old * rotate_half(suffix_keys)
+    # Compute new position cos/sin
+    cos_new, sin_new = get_rotary_cos_sin(head_dim, new_positions, device, dtype, rope_theta, inv_freq_cache)
+    # Forward transform: re-encode with new positions
+    return cos_new * base + sin_new * rotate_half(base)
+def drop_tokens_from_cache(
+    cache: Optional[DynamicCache | Tuple],
+    length: int,
+    preserve: int,
+    position_offset: int,
+    rope_theta: float = 10000.0,
+    inv_freq_cache: Optional[Dict[Tuple, torch.Tensor]] = None,
+) -> Tuple[Optional[DynamicCache], int, bool]:
+    """Drop tokens from a KV cache while preserving system prompt.
+    Removes tokens in the range [preserve, preserve + length) from the cache,
+    realigning RoPE embeddings for the suffix.
+    Args:
+        cache: DynamicCache or tuple-based cache (will be converted to DynamicCache)
+        length: Number of tokens to drop
+        preserve: Number of tokens to preserve at the start (system prompt)
+        position_offset: Current position offset for RoPE calculation
+        rope_theta: RoPE base frequency
+        inv_freq_cache: Optional cache dict for inverse frequencies
+    Returns:
+        Tuple of (cache, new_position_offset, success)
+        Note: Tuple cache will be converted to DynamicCache. Modification is in-place.
+    """
+    if cache is None or length <= 0:
+        return cache, position_offset, False
+    cache = as_dynamic_cache(cache)
+    total_len = get_kv_cache_length(cache)
+    if total_len <= 0:
+        return cache, position_offset, False
+    preserve = min(preserve, total_len)
+    available = total_len - preserve
+    if available < length:
+        logger.warning(
+            "Cannot drop %d tokens: only %d available (total=%d, preserve=%d)",
+            length,
+            available,
+            total_len,
+            preserve,
+        )
+        return cache, position_offset, False
+    suffix_len = total_len - preserve - length
+    # note: after RoPE reindex, the position of cache has been compressed (from preserve start)
+    # so here should not add position_offset, but use the actual layout of current cache
+    suffix_offset = preserve + length  # suffix current position in cache
+    prefix_offset = preserve  # suffix new position (follow preserve)
+    # Prepare position tensors for RoPE realignment
+    old_positions = None
+    new_positions = None
+    if suffix_len > 0:
+        device = cache.key_cache[0].device
+        old_positions = torch.arange(
+            suffix_offset,
+            suffix_offset + suffix_len,
+            device=device,
+            dtype=torch.long,
+        )
+        new_positions = torch.arange(
+            prefix_offset,
+            prefix_offset + suffix_len,
+            device=device,
+            dtype=torch.long,
+        )
+    keep_len = total_len - length
+    # Process each layer (in-place modification)
+    for layer_idx in range(len(cache.key_cache)):
+        key_tensor = cache.key_cache[layer_idx]
+        value_tensor = cache.value_cache[layer_idx]
+        if not key_tensor.numel():
+            continue
+        # Preserve prefix (system prompt)
+        prefix_keys = key_tensor[:, :, :preserve, :]
+        prefix_values = value_tensor[:, :, :preserve, :]
+        if suffix_len > 0:
+            # Keep and realign suffix
+            suffix_keys = key_tensor[:, :, preserve + length :, :]
+            suffix_values = value_tensor[:, :, preserve + length :, :]
+            if old_positions is not None and new_positions is not None and suffix_keys.numel():
+                suffix_keys = realign_rotary_suffix(
+                    suffix_keys,
+                    old_positions,
+                    new_positions,
+                    rope_theta,
+                    inv_freq_cache,
+                )
+            cache.key_cache[layer_idx] = torch.cat([prefix_keys, suffix_keys], dim=-2).contiguous()
+            cache.value_cache[layer_idx] = torch.cat([prefix_values, suffix_values], dim=-2).contiguous()
+        else:
+            cache.key_cache[layer_idx] = prefix_keys.contiguous()
+            cache.value_cache[layer_idx] = prefix_values.contiguous()
+    cache.crop(keep_len)
+    cache._seen_tokens = max(keep_len, 0)
+    new_offset = position_offset + length
+    logger.debug("Dropped %d tokens from cache, new length=%d", length, keep_len)
+    return cache, new_offset, True
+# stream decoder
+def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float("inf")):
+    logits = logits.clone()
+    # Top-k filtering
+    if top_k > 0:
+        top_k = min(top_k, logits.size(-1))
+        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+        logits[indices_to_remove] = filter_value
+    # Top-p (nucleus) filtering
+    if top_p > 0.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        probs = F.softmax(sorted_logits, dim=-1)
+        cumulative_probs = torch.cumsum(probs, dim=-1)
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # keep the first token that exceeds top_p
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+        indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        logits[0, indices_to_remove] = filter_value
+    return logits
+class StreamDecoder:
+    def __init__(self, llm, tokenizer, special_token_ids=None, forbidden_token_ids=None):
+        self.m = llm
+        self.tokenizer = tokenizer
+        self.listen_id = self.tokenizer.eos_token_id
+        self.chunk_eos_id = self.tokenizer.convert_tokens_to_ids("<|chunk_eos|>")
+        self.chunk_tts_eos_id = self.tokenizer.convert_tokens_to_ids("<|chunk_tts_eos|>")
+        self.turn_eos_id = self.tokenizer.convert_tokens_to_ids("<|turn_eos|>")
+        self.speak_id = self.tokenizer.convert_tokens_to_ids("<|speak|>")
+        self.special_token_ids = special_token_ids if special_token_ids is not None else []
+        # cache special tokens (used for context sliding window filtering)
+        self._all_special_ids = set()
+        self._all_special_tokens_text = set()
+        if self.tokenizer:
+            if hasattr(self.tokenizer, "all_special_ids"):
+                self._all_special_ids = set(self.tokenizer.all_special_ids)
+            if hasattr(self.tokenizer, "all_special_tokens"):
+                self._all_special_tokens_text = set(self.tokenizer.all_special_tokens)
+        custom_special_tokens = [
+            "<unit>",
+            "</unit>",
+            "<image>",
+            "</image>",
+            "<slice>",
+            "</slice>",
+            "<|listen|>",
+            "<|speak|>",
+            "<|tts_bos|>",
+            "<|tts_eos|>",
+            "<|audio_start|>",
+            "<|audio_end|>",
+            "<|chunk_eos|>",
+            "<|chunk_tts_eos|>",
+            "<|turn_eos|>",
+            "<|audio_start|>",
+            "<|audio_end|>",
+        ]
+        self._all_special_tokens_text.update(custom_special_tokens)
+        for token in custom_special_tokens:
+            token_id = self.tokenizer.convert_tokens_to_ids(token)
+            if token_id is not None and token_id != self.tokenizer.unk_token_id:
+                self._all_special_ids.add(token_id)
+        if forbidden_token_ids is None:
+            self.forbidden_token_ids = []
+        elif isinstance(forbidden_token_ids, int):
+            self.forbidden_token_ids = [self.forbidden_token_ids]
+        else:
+            self.forbidden_token_ids = forbidden_token_ids
+        self.forbidden_token_ids.append(self.chunk_eos_id)
+        assert isinstance(self.forbidden_token_ids, list)
+        self.cache = None
+        self.context = ""
+        self.generated_tokens = []  # track generated tokens
+        self.generated_special_tokens = []  # track generated special tokens
+        self.reset()
+        self.embeds = None
+        self.system_embeds = None
+        # sliding window related states
+        self._unit_history: List[Dict[str, Any]] = []
+        self._next_unit_id: int = 0
+        self._pending_unit_id: Optional[int] = None
+        self._pending_unit_start_cache_len: int = 0
+        self._system_preserve_length: int = 0
+        self._position_offset: int = 0
+        self._window_config = DuplexWindowConfig()
+        self._window_enabled: bool = True
+        self._rope_inv_freq_cache: Dict[Tuple, torch.Tensor] = {}
+        # context preserving sliding window states
+        # initial cache layout: [prefix] [suffix] [units...]
+        # after first sliding window: [prefix] [previous_marker + content] [suffix] [units...]
+        #                              fixed     dynamic sliding region      fixed
+        self._preserve_prefix_length: int = 0  # original prefix length (fixed)
+        self._previous_content_length: int = 0  # previous content length (dynamic, including marker)
+        self._suffix_token_ids: List[int] = []  # suffix token ids (e.g. <|im_end|>)
+        # previous marker (added dynamically after first sliding window)
+        self._previous_marker: str = "\n\nprevious: "  # fixed prefix marker
+        self._previous_marker_token_ids: List[int] = []  # marker token ids (initialized)
+        self._has_previous: bool = False  # whether previous marker has been added
+        # previous content
+        self._previous_text: str = ""  # accumulated generated text (without marker)
+        self._previous_token_ids: List[int] = []  # previous full token ids (including marker)
+        # validation statistics
+        self._sliding_event_count: int = 0  # sliding window trigger count
+        self._total_dropped_tokens: int = 0  # total dropped token count
+        self._total_dropped_units: int = 0  # total dropped unit count
+    def sliding_embeds(self):
+        # tmp = system_embeds
+        # tmp +-》 embeds after 5s
+        # reset
+        # feed
+        pass
+    def reset(self):
+        self.context = ""
+        self.cache = None
+        self.generated_tokens = []
+        self.generated_special_tokens = []
+        self.embeds = None
+        self.system_embeds = None
+        # sliding window state reset
+        old_unit_count = len(self._unit_history) if hasattr(self, "_unit_history") else 0
+        self._unit_history = []
+        self._next_unit_id = 0
+        self._pending_unit_id = None
+        self._pending_unit_start_cache_len = 0
+        self._system_preserve_length = 0
+        self._position_offset = 0
+        self._rope_inv_freq_cache = {}
+        # context preserving sliding window state reset
+        self._preserve_prefix_length = 0
+        self._previous_content_length = 0
+        self._suffix_token_ids = []
+        self._previous_marker = "\n\nprevious: "
+        self._previous_marker_token_ids = []
+        self._has_previous = False
+        self._previous_text = ""
+        self._previous_token_ids = []
+        # validation statistics
+        self._sliding_event_count = 0  # sliding window trigger count
+        self._total_dropped_tokens = 0  # total dropped token count
+        self._total_dropped_units = 0  # total dropped unit count
+    def get_cache_length(self) -> int:
+        if self.cache is None:
+            return 0
+        if isinstance(self.cache, DynamicCache):
+            if len(self.cache.key_cache) > 0 and self.cache.key_cache[0].numel() > 0:
+                return self.cache.key_cache[0].shape[2]
+            return 0
+        # Tuple cache format
+        return self.cache[0][0].shape[2]
+    def get_total_generated_tokens(self) -> int:
+        return sum(len(u.get("generated_tokens", [])) for u in self._unit_history)
+    def register_unit_start(self) -> int:
+        self._pending_unit_id = self._next_unit_id
+        self._pending_unit_start_cache_len = self.get_cache_length()
+        return self._pending_unit_id
+    def register_unit_end(
+        self,
+        input_type: str,
+        generated_tokens: Optional[List[int]] = None,
+        is_listen: bool = False,
+        generated_text: Optional[str] = None,
+    ):
+        """Call when unit ends, record unit information
+        Should be called after feeding </unit> token
+        Args:
+            input_type: "audio" / "video" / "omni" / "system"
+            generated_tokens: tokens generated by the unit (token ids)
+            is_listen: whether the unit is in listen state
+            generated_text: text generated by the unit (used for context preserving mode)
+        """
+        if self._pending_unit_id is None:
+            logger.warning("register_unit_end called without register_unit_start")
+            return
+        # calculate the length of the unit
+        current_cache_len = self.get_cache_length()
+        unit_len = current_cache_len - self._pending_unit_start_cache_len
+        if unit_len > 0:
+            entry = {
+                "unit_id": self._pending_unit_id,
+                "length": unit_len,
+                "type": input_type,
+                "generated_tokens": generated_tokens or [],
+                "generated_text": generated_text or "",  # used for context preserving mode
+                "is_listen": is_listen,
+            }
+            self._unit_history.append(entry)
+        self._pending_unit_id = None
+        self._pending_unit_start_cache_len = 0
+        self._next_unit_id += 1
+    def register_system_prompt(self):
+        """Call after system prompt prefill, record preserve length"""
+        self._system_preserve_length = self.get_cache_length()
+    # sliding window core methods
+    def _get_rope_theta(self) -> float:
+        """get model rope_theta configuration"""
+        return float(getattr(self.m.config, "rope_theta", 10000.0))
+    def _drop_tokens_from_cache(self, length: int) -> bool:
+        """remove specified number of tokens from cache (protect system prompt)
+        remove tokens in the range [preserve, preserve + length)
+        supports DynamicCache and tuple cache formats
+        """
+        if self.cache is None or length <= 0:
+            return False
+        cache_type = "DynamicCache" if isinstance(self.cache, DynamicCache) else "TupleCache"
+        cache_len_before = self.get_cache_length()
+        offset_before = self._position_offset
+        new_cache, new_offset, success = drop_tokens_from_cache(
+            cache=self.cache,
+            length=length,
+            preserve=self._system_preserve_length,
+            position_offset=self._position_offset,
+            rope_theta=self._get_rope_theta(),
+            inv_freq_cache=self._rope_inv_freq_cache,
+        )
+        if success:
+            self.cache = new_cache  # For DynamicCache this is the same object (in-place)
+            self._position_offset = new_offset
+        return success
+    def _drop_unit(self, unit_id: int) -> bool:
+        """remove specified unit"""
+        entries = [u for u in self._unit_history if u["unit_id"] == unit_id]
+        if not entries:
+            return False
+        total_len = sum(e["length"] for e in entries)
+        if total_len <= 0:
+            for e in entries:
+                self._unit_history.remove(e)
+            return False
+        if not self._drop_tokens_from_cache(total_len):
+            return False
+        for e in entries:
+            self._unit_history.remove(e)
+        return True
+    def _drop_next_unit(self) -> bool:
+        """remove the earliest non-system unit"""
+        for entry in self._unit_history:
+            unit_id = entry.get("unit_id")
+            if unit_id is None:
+                continue
+            # skip system type
+            if entry.get("type") == "system":
+                continue
+            if self._drop_unit(unit_id):
+                return True
+        return False
+    def enforce_window(self) -> bool:
+        """enforce sliding window strategy (same as single-mode, only look at cache length)
+        when cache length exceeds high water line, loop to remove the earliest unit,
+        until cache length drops below the low water line.
+        """
+        if not self._window_enabled:
+            return False
+        cfg = self._window_config
+        cache_len_before = self.get_cache_length()
+        if cache_len_before <= cfg.basic_window_high_tokens:
+            return False  # not above high water line, no trigger
+        dropped_count = 0
+        cache_len = cache_len_before
+        while cache_len > cfg.basic_window_low_tokens:
+            if not self._drop_next_unit():
+                break
+            dropped_count += 1
+            cache_len = self.get_cache_length()
+        if dropped_count > 0:
+            # update statistics counters
+            self._sliding_event_count += 1
+            self._total_dropped_tokens += cache_len_before - cache_len
+            self._total_dropped_units += dropped_count
+            # consistency check
+            expected = self._system_preserve_length + sum(u["length"] for u in self._unit_history)
+            is_consistent = expected == cache_len
+            if not is_consistent:
+                logger.error(
+                    "CONSISTENCY ERROR! preserve=%d + sum(units)=%d != cache=%d, offset=%d",
+                    self._system_preserve_length,
+                    sum(u["length"] for u in self._unit_history),
+                    cache_len,
+                    self._position_offset,
+                )
+        return dropped_count > 0
+    # context preserving sliding window methods
+    def register_system_prompt_with_context(
+        self,
+        suffix_token_ids: Optional[List[int]] = None,
+        context_previous_marker: str = "\n\nprevious: ",
+    ):
+        """register system prompt (with context preserving mode)
+        initial cache layout: [prefix] [suffix] [units...]
+        after first sliding window: [prefix] [context_previous_marker + content] [suffix] [units...]
+        when calling this method, cache should only have prefix (without previous marker)
+        suffix will be fed in later
+        Args:
+            suffix_token_ids: suffix token ids (e.g. id of <|im_end|>)
+            context_previous_marker: previous marker prefix, e.g. "\\n\\nprevious: "
+        """
+        # prefix = current cache content (fixed, without previous marker)
+        self._preserve_prefix_length = self.get_cache_length()
+        self._previous_content_length = 0  # initially no previous content
+        self._suffix_token_ids = suffix_token_ids or []
+        # total preserve length = prefix + suffix (initially no previous)
+        self._system_preserve_length = self._preserve_prefix_length + len(self._suffix_token_ids)
+        # initialize previous related states
+        self._previous_marker = context_previous_marker
+        self._previous_marker_token_ids = (
+            self.tokenizer.encode(context_previous_marker, add_special_tokens=False) if self.tokenizer else []
+        )
+        self._has_previous = False
+        self._previous_text = ""
+        self._previous_token_ids = []
+    def _extract_generated_text(self, units: List[Dict[str, Any]]) -> Tuple[str, List[int]]:
+        """extract generated text and token ids from units
+        Args:
+            units: list of units to extract
+        Returns:
+            (text, token_ids): concatenated text and token ids (filtered out special tokens)
+        """
+        text_parts = []
+        token_ids = []
+        for u in units:
+            # only keep generated content of non-listen units
+            if u.get("is_listen", False):
+                continue
+            gen_text = u.get("generated_text", "")
+            gen_tokens = u.get("generated_tokens", [])
+            # filter out special tokens from text
+            if gen_text:
+                clean_text = gen_text
+                for st in self._all_special_tokens_text:
+                    clean_text = clean_text.replace(st, "")
+                if clean_text.strip():
+                    text_parts.append(clean_text)
+            # filter out special tokens
+            if gen_tokens:
+                filtered_tokens = [t for t in gen_tokens if t not in self._all_special_ids]
+                token_ids.extend(filtered_tokens)
+        return "".join(text_parts), token_ids
+    def _rebuild_cache_with_previous(
+        self,
+        new_previous_tokens: List[int],
+        units_to_keep_len: Optional[int] = None,
+    ) -> bool:
+        """rebuild cache, insert new previous content between prefix and suffix
+        cache layout change:
+        [prefix] [old_prev] [suffix] [old_units]  →  [prefix] [new_prev] [suffix] [remaining_units]
+        Args:
+            new_previous_tokens: new previous token ids
+            units_to_keep_len: length of units to keep (from cache end backwards)
+                                if None, calculate based on unit_history
+        Returns:
+            whether successful rebuild
+        """
+        if self.cache is None:
+            return False
+        old_previous_len = self._previous_content_length
+        new_previous_len = len(new_previous_tokens)
+        suffix_len = len(self._suffix_token_ids)
+        total_cache_len = self.get_cache_length()
+        # calculate length of units to keep
+        if units_to_keep_len is None:
+            units_to_keep_len = sum(u["length"] for u in self._unit_history)
+        # special case: if previous is unchanged (new and old are empty), no need to rebuild prefix+suffix part of cache
+        # but still need to reindex units RoPE (because a unit was deleted, position changed)
+        if new_previous_len == 0 and old_previous_len == 0:
+            # cache layout: [prefix(7)] [suffix(1)] [units...]
+            # only keep prefix + suffix + remaining_units
+            preserve_len = self._preserve_prefix_length + suffix_len
+            # simply slice cache: [prefix+suffix] + [remaining_units]
+            # remaining_units in cache end
+            if units_to_keep_len > 0:
+                # [0:preserve_len] + [total-units_to_keep_len:total]
+                prefix_suffix_cache = self._slice_cache(0, preserve_len)
+                units_cache = self._slice_cache(total_cache_len - units_to_keep_len, None)
+                # calculate number of dropped tokens
+                dropped_tokens = total_cache_len - preserve_len - units_to_keep_len
+                # reindex units RoPE: position from (preserve_len + dropped_tokens) to preserve_len
+                # note: no position_offset, because cache position has been compressed (from 0 start)
+                if dropped_tokens > 0:
+                    old_start = preserve_len + dropped_tokens
+                    new_start = preserve_len
+                    units_cache = self._reindex_rope_for_cache(units_cache, old_start, new_start, units_to_keep_len)
+                self.cache = self._concat_caches(prefix_suffix_cache, units_cache)
+            else:
+                self.cache = self._slice_cache(0, preserve_len)
+            return True
+        # 1. get prefix cache (fixed)
+        prefix_end = self._preserve_prefix_length
+        prefix_cache = self._slice_cache(0, prefix_end)
+        # 2. get units cache to keep (from end)
+        units_start_in_old_cache = total_cache_len - units_to_keep_len
+        units_cache = None
+        if units_to_keep_len > 0:
+            units_cache = self._slice_cache(units_start_in_old_cache, None)
+        # 3. calculate new previous + suffix cache (needs forward)
+        # merge previous tokens and suffix tokens
+        prev_suffix_tokens = new_previous_tokens + self._suffix_token_ids
+        prev_suffix_len = len(prev_suffix_tokens)
+        new_prefix_prev_suffix_cache = prefix_cache
+        if prev_suffix_len > 0:
+            # Embed tokens
+            prev_suffix_embeds = self.embed_tokens(prev_suffix_tokens)
+            # calculate start position (after prefix)
+            start_pos = self._preserve_prefix_length + self._position_offset
+            # forward calculate KV cache
+            with torch.no_grad():
+                device = prev_suffix_embeds.device
+                position_ids = torch.arange(
+                    start_pos,
+                    start_pos + prev_suffix_len,
+                    device=device,
+                ).unsqueeze(0)
+                # use prefix cache as past_key_values
+                outputs = self.m(
+                    inputs_embeds=(
+                        prev_suffix_embeds.unsqueeze(0) if prev_suffix_embeds.dim() == 2 else prev_suffix_embeds
+                    ),
+                    position_ids=position_ids,
+                    past_key_values=prefix_cache,
+                    use_cache=True,
+                    return_dict=True,
+                )
+                # new cache contains prefix + new_previous + suffix
+                new_prefix_prev_suffix_cache = outputs.past_key_values
+        # 4. adjust units cache RoPE
+        # new layout: [prefix] [new_prev] [suffix] [units]
+        # note: no position_offset, because cache position has been compressed (from 0 start)
+        new_system_total = prefix_end + new_previous_len + suffix_len
+        if units_cache is not None and self._get_cache_len(units_cache) > 0:
+            old_start = units_start_in_old_cache
+            new_start = new_system_total
+            if old_start != new_start:
+                units_cache = self._reindex_rope_for_cache(units_cache, old_start, new_start, units_to_keep_len)
+        # 5. concatenate new cache
+        if units_cache is not None and self._get_cache_len(units_cache) > 0:
+            self.cache = self._concat_caches(new_prefix_prev_suffix_cache, units_cache)
+        else:
+            self.cache = new_prefix_prev_suffix_cache
+        # 6. update length
+        self._previous_content_length = new_previous_len
+        # total preserve length = prefix + previous + suffix
+        self._system_preserve_length = prefix_end + new_previous_len + suffix_len
+        # print detailed cache layout information
+        prev_text_preview = self._previous_text[:50] + "..." if len(self._previous_text) > 50 else self._previous_text
+        suffix_preview = self.tokenizer.decode(self._suffix_token_ids) if self._suffix_token_ids else ""
+        return True
+    def _slice_cache(self, start: int, end: Optional[int], clone: bool = True):
+        """slice cache
+        Args:
+            start: start position
+            end: end position (None means to end)
+            clone: whether to clone (default True, to prevent shared memory issues)
+        """
+        if self.cache is None:
+            return None
+        if isinstance(self.cache, DynamicCache):
+            # DynamicCache
+            new_key_cache = [
+                k[:, :, start:end, :].clone() if clone else k[:, :, start:end, :] for k in self.cache.key_cache
+            ]
+            new_value_cache = [
+                v[:, :, start:end, :].clone() if clone else v[:, :, start:end, :] for v in self.cache.value_cache
+            ]
+            new_cache = DynamicCache()
+            new_cache.key_cache = new_key_cache
+            new_cache.value_cache = new_value_cache
+            return new_cache
+        else:
+            # Tuple cache
+            if clone:
+                return tuple(
+                    (layer[0][:, :, start:end, :].clone(), layer[1][:, :, start:end, :].clone()) for layer in self.cache
+                )
+            else:
+                return tuple((layer[0][:, :, start:end, :], layer[1][:, :, start:end, :]) for layer in self.cache)
+    @staticmethod
+    def _get_cache_len(cache) -> int:
+        if cache is None:
+            return 0
+        if isinstance(cache, DynamicCache):
+            if len(cache.key_cache) > 0 and cache.key_cache[0].numel() > 0:
+                return cache.key_cache[0].shape[2]
+            return 0
+        if cache and cache[0] and cache[0][0] is not None:
+            return cache[0][0].shape[2]
+        return 0
+    @staticmethod
+    def _concat_caches(cache1, cache2):
+        if cache1 is None:
+            return cache2
+        if cache2 is None:
+            return cache1
+        if isinstance(cache1, DynamicCache):
+            new_cache = DynamicCache()
+            new_cache.key_cache = [torch.cat([k1, k2], dim=2) for k1, k2 in zip(cache1.key_cache, cache2.key_cache)]
+            new_cache.value_cache = [
+                torch.cat([v1, v2], dim=2) for v1, v2 in zip(cache1.value_cache, cache2.value_cache)
+            ]
+            return new_cache
+        else:
+            return tuple(
+                (
+                    torch.cat([layer1[0], layer2[0]], dim=2),
+                    torch.cat([layer1[1], layer2[1]], dim=2),
+                )
+                for layer1, layer2 in zip(cache1, cache2)
+            )
+    def _reindex_rope_for_cache(self, cache, old_start: int, new_start: int, length: int):
+        """reindex RoPE position for cache"""
+        if cache is None or length <= 0:
+            return cache
+        if isinstance(cache, DynamicCache):
+            device = cache.key_cache[0].device if cache.key_cache else None
+        else:
+            device = cache[0][0].device if cache and cache[0] else None
+        if device is None:
+            return cache
+        old_positions = torch.arange(old_start, old_start + length, device=device, dtype=torch.long)
+        new_positions = torch.arange(new_start, new_start + length, device=device, dtype=torch.long)
+        rope_theta = self._get_rope_theta()
+        if isinstance(cache, DynamicCache):
+            new_key_cache = []
+            for k in cache.key_cache:
+                new_k = realign_rotary_suffix(k, old_positions, new_positions, rope_theta, self._rope_inv_freq_cache)
+                new_key_cache.append(new_k)
+            cache.key_cache = new_key_cache
+            return cache
+        else:
+            new_cache = []
+            for layer in cache:
+                new_k = realign_rotary_suffix(
+                    layer[0], old_positions, new_positions, rope_theta, self._rope_inv_freq_cache
+                )
+                new_cache.append((new_k, layer[1]))
+            return tuple(new_cache)
+    def _update_previous(
+        self,
+        new_text: str,
+        new_tokens: List[int],
+        max_tokens: int,
+    ) -> None:
+        """update previous context (also update cache)
+        when first sliding window, dynamically add marker + text, subsequent sliding window append text
+        when content exceeds max_tokens, truncate content (keep marker)
+        rebuild cache to maintain consistency
+        Args:
+            new_text: new text
+            new_tokens: new token ids
+            max_tokens: previous content maximum token count (without marker)
+        """
+        marker_len = len(self._previous_marker_token_ids)
+        tokens_to_drop = 0
+        # if no new content, do not add marker, but still need to rebuild cache
+        if not new_tokens and not new_text:
+            # still need to rebuild cache (because a unit was deleted)
+            self._rebuild_cache_with_previous(self._previous_token_ids)
+            return
+        if not self._has_previous:
+            # when first has actual content: add marker + text
+            self._previous_text = new_text
+            self._previous_token_ids = self._previous_marker_token_ids.copy() + new_tokens
+            self._has_previous = True
+        else:
+            # subsequent sliding window: append text to previous
+            self._previous_text += new_text
+            self._previous_token_ids.extend(new_tokens)
+        # calculate token count of content (without marker)
+        content_token_count = len(self._previous_token_ids) - marker_len
+        # check if need to truncate content (keep marker)
+        if content_token_count > max_tokens:
+            # truncate left content, keep marker + latest max_tokens content
+            tokens_to_drop = content_token_count - max_tokens
+            old_text = self._previous_text
+            # keep marker + truncated content
+            content_tokens = self._previous_token_ids[marker_len + tokens_to_drop :]
+            self._previous_token_ids = self._previous_marker_token_ids.copy() + content_tokens
+            # redecode text (only decode content part)
+            try:
+                self._previous_text = self.tokenizer.decode(
+                    content_tokens,
+                    skip_special_tokens=True,
+                )
+            except Exception as e:
+                logger.warning("_update_previous: decode failed: %s", e)
+        # rebuild cache
+        self._rebuild_cache_with_previous(self._previous_token_ids)
+    def _drop_unit_with_context(
+        self,
+        unit_id: int,
+        max_previous_tokens: int,
+    ) -> Tuple[bool, str, List[int]]:
+        """remove specified unit and return its generated content (for context preserving)
+        process:
+        1. extract generated content of unit
+        2. remove unit from cache (without prefix+previous)
+        3. append generated content to previous
+        4. rebuild cache (in _update_previous)
+        Args:
+            unit_id: unit ID to remove
+            max_previous_tokens: previous maximum token count
+        Returns:
+            (success, extracted_text, extracted_tokens): whether successful, extracted text and tokens
+        """
+        entries = [u for u in self._unit_history if u["unit_id"] == unit_id]
+        if not entries:
+            return False, "", []
+        # extract generated content
+        extracted_text, extracted_tokens = self._extract_generated_text(entries)
+        # calculate total length
+        total_len = sum(e["length"] for e in entries)
+        if total_len <= 0:
+            for e in entries:
+                self._unit_history.remove(e)
+            return False, extracted_text, extracted_tokens
+        cache_before = self.get_cache_length()
+        # remove from unit_history (record for later processing)
+        for e in entries:
+            self._unit_history.remove(e)
+        # note: here no longer call _drop_tokens_from_cache
+        # because _update_previous will rebuild the entire cache
+        # update previous (also rebuild cache)
+        self._update_previous(extracted_text, extracted_tokens, max_previous_tokens)
+        return True, extracted_text, extracted_tokens
+    def _drop_next_unit_with_context(self, max_previous_tokens: int) -> bool:
+        """remove the earliest non-system unit (with context preserving)"""
+        for entry in self._unit_history:
+            unit_id = entry.get("unit_id")
+            if unit_id is None:
+                continue
+            if entry.get("type") == "system":
+                continue
+            success, _, _ = self._drop_unit_with_context(unit_id, max_previous_tokens)
+            if success:
+                return True
+        return False
+    def enforce_window_with_context(self) -> bool:
+        """context preserving sliding window execution
+        when unit count exceeds max_units, remove the earliest unit,
+        and accumulate its generated content to previous.
+        Cache will be automatically rebuilt in _update_previous.
+        Returns:
+            whether sliding window is executed
+        """
+        if not self._window_enabled:
+            return False
+        cfg = self._window_config
+        if cfg.sliding_window_mode != "context":
+            # if not context mode, fallback to basic sliding window
+            return self.enforce_window()
+        cache_len_before = self.get_cache_length()
+        units_before = len(self._unit_history)
+        # context preserving mode: only check if unit count exceeds limit
+        # (previous exceeds limit in _update_previous will automatically truncate left)
+        if units_before <= cfg.context_max_units:
+            return False
+        # sliding window loop: remove unit until count ≤ max_units
+        dropped_count = 0
+        while len(self._unit_history) > cfg.context_max_units:
+            if not self._drop_next_unit_with_context(cfg.context_previous_max_tokens):
+                break
+            dropped_count += 1
+        cache_len_after = self.get_cache_length()
+        if dropped_count > 0:
+            # update statistics counter
+            self._sliding_event_count += 1
+            self._total_dropped_tokens += cache_len_before - cache_len_after
+            self._total_dropped_units += dropped_count
+            # consistency check
+            expected = self._system_preserve_length + sum(u["length"] for u in self._unit_history)
+        return dropped_count > 0
+    def get_previous_context(self) -> Tuple[str, List[int]]:
+        """get current accumulated previous context
+        Returns:
+            (previous_text, previous_token_ids): current accumulated text and token ids
+        """
+        return self._previous_text, self._previous_token_ids.copy()
+    def get_window_stats(self) -> Dict[str, Any]:
+        """get sliding window statistics"""
+        unit_lengths = [u["length"] for u in self._unit_history]
+        return {
+            "cache_length": self.get_cache_length(),
+            "unit_count": len(self._unit_history),
+            "unit_lengths": unit_lengths,
+            "unit_total_length": sum(unit_lengths),
+            "system_preserve_length": self._system_preserve_length,
+            "position_offset": self._position_offset,
+            "window_enabled": self._window_enabled,
+            "total_generated_tokens": self.get_total_generated_tokens(),
+            "pending_unit_id": self._pending_unit_id,
+            "next_unit_id": self._next_unit_id,
+            "config": {
+                "sliding_window_mode": self._window_config.sliding_window_mode,
+                "basic_window_high_tokens": self._window_config.basic_window_high_tokens,
+                "basic_window_low_tokens": self._window_config.basic_window_low_tokens,
+                "context_previous_max_tokens": self._window_config.context_previous_max_tokens,
+                "context_max_units": self._window_config.context_max_units,
+            },
+            # context preserving related
+            "preserve_prefix_length": self._preserve_prefix_length,
+            "previous_content_length": self._previous_content_length,
+            "suffix_token_count": len(self._suffix_token_ids),
+            "previous_text_length": len(self._previous_text),
+            "previous_token_count": len(self._previous_token_ids),
+            "has_system_template": self._system_prompt_template is not None,
+        }
+    def _verify_consistency(self) -> bool:
+        """verify unit history and cache length consistency"""
+        expected = self._system_preserve_length + sum(u["length"] for u in self._unit_history)
+        actual = self.get_cache_length()
+        return expected == actual
+    def print_verification_summary(self) -> Dict[str, Any]:
+        """print verification summary (for comparing off/basic/context mode)
+        Returns:
+            dictionary containing key verification data
+        """
+        cfg = self._window_config
+        # collect all generated text
+        all_generated_text = []
+        all_generated_tokens = []
+        for u in self._unit_history:
+            if not u.get("is_listen", False):
+                gen_text = u.get("generated_text", "")
+                gen_tokens = u.get("generated_tokens", [])
+                if gen_text:
+                    all_generated_text.append(gen_text)
+                if gen_tokens:
+                    all_generated_tokens.extend(gen_tokens)
+        combined_text = "".join(all_generated_text)
+        summary = {
+            "mode": cfg.sliding_window_mode,
+            "final_cache_length": self.get_cache_length(),
+            "final_unit_count": len(self._unit_history),
+            "sliding_event_count": self._sliding_event_count,
+            "total_dropped_tokens": self._total_dropped_tokens,
+            "total_dropped_units": self._total_dropped_units,
+            "total_generated_tokens": len(all_generated_tokens),
+            "generated_text": combined_text,
+            "previous_text": self._previous_text,
+            "previous_token_count": len(self._previous_token_ids),
+            "position_offset": self._position_offset,
+            "system_preserve_length": self._system_preserve_length,
+        }
+        return summary
+    def set_window_config(self, config: DuplexWindowConfig) -> None:
+        """set sliding window configuration"""
+        self._window_config = config
+    def set_window_enabled(self, enabled: bool) -> None:
+        """enable/disable sliding window"""
+        old_enabled = self._window_enabled
+        self._window_enabled = enabled
+    def get_context(self):
+        return self.context
+    def embed_token(self, tid):
+        if isinstance(tid, int):
+            tid = torch.tensor([tid], device=self.m.device)
+        return self.m.model.embed_tokens(tid)
+    def embed_tokens(self, token_ids: List[int]) -> torch.Tensor:
+        """batch embed multiple tokens
+        Args:
+            token_ids: list of token ids
+        Returns:
+            embeddings tensor [L, H]
+        """
+        if not token_ids:
+            return torch.empty(0, self.m.config.hidden_size, device=self.m.device)
+        tids = torch.tensor(token_ids, device=self.m.device)
+        return self.m.model.embed_tokens(tids)
+    @torch.no_grad()
+    def feed(self, embeds: torch.Tensor, return_logits: bool = False):
+        """
+        embeds : [L, H]   —— new embedding sequence fed into model at once
+        """
+        L = embeds.size(0)
+        device = embeds.device
+        past_len = self.get_cache_length()
+        pos_ids = torch.arange(past_len, past_len + L, device=device).unsqueeze(0)  # [1, L]
+        out = self.m(
+            inputs_embeds=embeds.unsqueeze(0),  # [1, L, H]
+            position_ids=pos_ids,
+            past_key_values=self.cache,
+            # use_cache = True,
+            return_dict=True,
+            output_hidden_states=True,
+            # attention_mask=attention_mask
+        )
+        self.cache = out.past_key_values
+        if return_logits:
+            logits = self.m.lm_head(out.hidden_states[-1])[:, -1]  # [1, vocab]
+            return logits, out.hidden_states[-1]
+    @torch.no_grad()
+    def decode(
+        self,
+        logits,
+        mode: Literal["sampling", "greedy"] = "sampling",
+        temperature=0.7,
+        top_k=20,
+        top_p=0.8,
+        listen_top_k=None,
+        listen_prob_scale=1.0,
+        text_repetition_penalty=1.05,
+        text_repetition_window_size=512,
+    ):
+        """
+        Args:
+            logits:
+            mode: sampling or greedy
+            temperature:
+            top_k:
+            top_p:
+            listen_top_k: force listen_id to be in top-k to keep
+            listen_prob_scale: multiply listen_id probability by a weight (<1 means decrease, >1 means increase)
+            text_repetition_penalty: repetition penalty coefficient, >1.0 means decrease repetition, <1.0 means increase repetition
+            text_repetition_window_size: repetition penalty window size
+        Sampling strategy:
+            1. first sample all tokens with original logits (apply temperature)
+            2. if sampled chunk_eos, return directly (keep the original model's decision of when to stop)
+            3. if not sampled chunk_eos, mask it (set logit to -inf), continue sampling text tokens
+            4. apply repetition penalty, top-k, top-p, etc. to the text tokens for the final sampling
+        """
+        logits = logits.clone()
+        # 0. independently check chunk_eos before sampling
+        eos_id = self.chunk_eos_id
+        with torch.no_grad():
+            if mode == "greedy":
+                sampled_token = torch.argmax(logits[0]).item()
+            else:
+                original_probs = F.softmax(logits[0], dim=-1)
+                sampled_token = torch.multinomial(original_probs, num_samples=1).item()
+            # if sampled chunk_eos, return directly
+            if sampled_token == eos_id:
+                next_token_id = torch.tensor([eos_id], device=logits.device)
+                next_token_str = self.tokenizer.decode(next_token_id)
+                return next_token_id
+        # if not sampled chunk_eos, set its logit to -inf
+        if self.forbidden_token_ids:
+            logits[:, self.forbidden_token_ids] = float("-inf")
+        # 1. apply repetition penalty
+        if text_repetition_penalty != 1.0 and len(self.generated_tokens) > 0:
+            # get recent tokens (within window size) considering special tokens and normal tokens
+            recent_tokens = self.generated_tokens[-text_repetition_window_size:]
+            # make it unique
+            recent_tokens = list(set(recent_tokens))
+            # apply penalty to repeated tokens
+            for token_id in recent_tokens:
+                if token_id < logits.size(-1):  # ensure token_id is in vocabulary range
+                    if text_repetition_penalty > 1.0:
+                        # penalize repetition: decrease logits
+                        logits[0, token_id] /= text_repetition_penalty
+                    else:
+                        # encourage repetition: increase logits
+                        logits[0, token_id] *= 1.0 / text_repetition_penalty
+        if listen_prob_scale != 1.0:  # modify listen token logit separately
+            logits[0, self.listen_id] *= listen_prob_scale
+        listen_rank = (logits[0] > logits[0, self.listen_id]).sum().item()
+        if listen_top_k is not None and listen_rank < listen_top_k:  # listen_id is in top-k, return directly
+            next_token_id = torch.tensor([self.listen_id], device=logits.device)
+            next_token_str = self.tokenizer.decode(next_token_id)
+            if next_token_str == "<|listen|>":
+                self.context += " "
+            else:
+                self.context += next_token_str
+            return next_token_id
+        if mode == "greedy":
+            next_token_id = torch.argmax(logits, dim=-1)
+        elif mode == "sampling":
+            logits = logits / temperature
+            logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
+            probs = F.softmax(logits, dim=-1)
+            next_token_id = torch.multinomial(probs, num_samples=1).squeeze(1)
+        else:
+            raise ValueError(f"Unsupported decode mode: {mode}")
+        if next_token_id.item() not in self.special_token_ids:
+            self.generated_tokens.append(next_token_id.item())
+        else:
+            self.generated_special_tokens.append(next_token_id.item())
+        return next_token_id
+def _download_url_to_tempfile(url: str, suffix: str = "", timeout: int = 60) -> str:
+    """
+    Download a URL to a temporary file and return the path.
+    Args:
+        url: HTTP/HTTPS URL to download
+        suffix: File suffix (e.g., ".jpg", ".wav", ".mp4")
+        timeout: Download timeout in seconds
+    Returns:
+        Path to the downloaded temporary file
+    """
+    import tempfile
+    import requests
+    response = requests.get(url, timeout=timeout)
+    response.raise_for_status()
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
+        f.write(response.content)
+        return f.name
+def _is_url(path: str) -> bool:
+    return path.startswith(("http://", "https://"))
+def normalize_content_item(item) -> Union[str, Any, List[Any]]:
+    """Normalize structured content item to native format.
+    Supports:
+    - Native format: str, PIL.Image, np.ndarray (pass through)
+    - OpenAI structured format:
+        - {"type": "text", "text": "..."} -> str
+        - {"type": "image_url", "image_url": {"url": "..."}} -> PIL.Image
+        - {"type": "audio_url", "audio_url": {"url": "..."}} -> np.ndarray
+        - {"type": "video_url", "video_url": {"url": "...", ...}} -> List[Image, ndarray, ...]
+    URL formats supported:
+        - Local file path: "/path/to/file.jpg"
+        - HTTP/HTTPS URL: "https://example.com/image.jpg"
+    Args:
+        item: Content item to normalize
+    Returns:
+        Normalized item. For video_url, returns a tuple ("__video_contents__", list)
+        that will be flattened by normalize_content().
+    Raises:
+        ValueError: If content type is unknown or unsupported
+    """
+    import os
+    import numpy as np
+    from PIL import Image
+    if isinstance(item, str):
+        return item
+    if isinstance(item, Image.Image):
+        return item
+    if isinstance(item, np.ndarray):
+        return item
+    if isinstance(item, dict):
+        item_type = item.get("type")
+        if item_type == "text":
+            return item.get("text", "")
+        elif item_type == "image_url":
+            image_url_obj = item.get("image_url", {})
+            url = image_url_obj.get("url", "") if isinstance(image_url_obj, dict) else image_url_obj
+            if _is_url(url):
+                # Download to temp file
+                temp_path = _download_url_to_tempfile(url, suffix=".jpg", timeout=30)
+                img = Image.open(temp_path)
+                os.unlink(temp_path)
+                return img
+            else:
+                return Image.open(url)
+        elif item_type == "audio_url":
+            import librosa
+            audio_url_obj = item.get("audio_url", {})
+            url = audio_url_obj.get("url", "") if isinstance(audio_url_obj, dict) else audio_url_obj
+            if _is_url(url):
+                # Download to temp file
+                temp_path = _download_url_to_tempfile(url, suffix=".wav", timeout=60)
+                audio_np, _ = librosa.load(temp_path, sr=16000, mono=True)
+                os.unlink(temp_path)
+                return audio_np
+            else:
+                audio_np, _ = librosa.load(url, sr=16000, mono=True)
+                return audio_np
+        elif item_type == "video_url":
+            # Video processing - returns a LIST of items (frames + audio segments)
+            # Note: Unlike image_url/audio_url which return single items,
+            # video_url returns a list that will be flattened into the content
+            from minicpmo.utils import get_video_frame_audio_segments
+            video_url_obj = item.get("video_url", {})
+            if isinstance(video_url_obj, dict):
+                video_url = video_url_obj.get("url", "")
+                # Get optional parameters from video_url object (OpenAI style)
+                stack_frames = video_url_obj.get("stack_frames", 1)
+                use_ffmpeg = video_url_obj.get("use_ffmpeg", False)
+                use_audio = video_url_obj.get("use_audio", True)
+            else:
+                video_url = video_url_obj
+                stack_frames = 1
+                use_ffmpeg = False
+                use_audio = True
+            # Handle HTTP/HTTPS URL - download to temp file
+            temp_video_path = None
+            if _is_url(video_url):
+                temp_video_path = _download_url_to_tempfile(video_url, suffix=".mp4", timeout=120)
+                video_path = temp_video_path
+            else:
+                video_path = video_url
+            # Extract frames and audio segments
+            video_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(
+                video_path,
+                stack_frames=stack_frames,
+                use_ffmpeg=use_ffmpeg,
+                use_audio=use_audio
+            )
+            # Clean up temp file if downloaded
+            if temp_video_path is not None:
+                os.unlink(temp_video_path)
+            # Build omni_contents (interleaved frames and audio, or frames only)
+            omni_contents = []
+            for i in range(len(video_frames)):
+                omni_contents.append(video_frames[i])
+                if use_audio and audio_segments is not None:
+                    omni_contents.append(audio_segments[i])
+                if stacked_frames is not None and i < len(stacked_frames) and stacked_frames[i] is not None:
+                    omni_contents.append(stacked_frames[i])
+            # Return as a special marker to be flattened later
+            return "__video_contents__", omni_contents
+        else:
+            raise ValueError(f"Unknown content type: {item_type}")
+    raise ValueError(f"Cannot normalize content item of type: {type(item)}")
+def normalize_content(content) -> list:
+    """Normalize message content to list of native items.
+    Input formats:
+    - str: "hello" -> ["hello"]
+    - list of native items: [str, Image, np.ndarray] -> pass through with normalization
+    - list of structured items: [{"type": "text", ...}] -> normalize each
+    - video type: automatically expanded to omni_contents
+    - mixed: works too
+    Args:
+        content: Message content in any supported format
+    Returns:
+        List of native items (str, PIL.Image, np.ndarray)
+    Examples:
+        >>> normalize_content("hello")
+        ["hello"]
+        >>> normalize_content([{"type": "text", "text": "hi"}])
+        ["hi"]
+        >>> normalize_content([{"type": "video", "video": "/path/to/video.mp4"}])
+        [<PIL.Image>, <np.ndarray>, <PIL.Image>, <np.ndarray>, ...]
+    """
+    import numpy as np
+    from PIL import Image
+    if isinstance(content, str):
+        return [content]
+    if isinstance(content, list):
+        result = []
+        for item in content:
+            normalized = normalize_content_item(item)
+            # Handle video content (returns tuple with marker)
+            if isinstance(normalized, tuple) and len(normalized) == 2 and normalized[0] == "__video_contents__":
+                # Flatten video contents into result
+                result.extend(normalized[1])
+            else:
+                result.append(normalized)
+        return result
+    # Single non-list item (Image or np.ndarray)
+    if isinstance(content, (Image.Image, np.ndarray)):
+        return [content]
+    normalized = normalize_content_item(content)
+    if isinstance(normalized, tuple) and len(normalized) == 2 and normalized[0] == "__video_contents__":
+        return normalized[1]
+    return [normalized]

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff