YXXPP airlsyn committed on
Commit
a6edb9e
·
0 Parent(s):

Duplicate from openbmb/MiniCPM-o-4_5

Browse files

Co-authored-by: airlsyn <airlsyn@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +58 -0
  2. .gitignore +1 -0
  3. README.md +2149 -0
  4. added_tokens.json +107 -0
  5. assets/HT_ref_audio.wav +3 -0
  6. assets/Skiing.mp4 +3 -0
  7. assets/Trump_WEF_2018_10s.mp3 +3 -0
  8. assets/audio_cases/assistant_ref.mp4 +3 -0
  9. assets/audio_cases/assistant_response.mp4 +3 -0
  10. assets/audio_cases/elon_musk__000_assistant_audio.wav +3 -0
  11. assets/audio_cases/elon_musk__system_ref_audio.wav +3 -0
  12. assets/audio_cases/elon_musk_ref.mp4 +3 -0
  13. assets/audio_cases/elon_musk_response.mp4 +3 -0
  14. assets/audio_cases/hermione__000_assistant_audio.wav +3 -0
  15. assets/audio_cases/hermione__system_ref_audio.wav +3 -0
  16. assets/audio_cases/minicpm_assistant__000_assistant_audio.wav +3 -0
  17. assets/audio_cases/minicpm_assistant__system_ref_audio.wav +3 -0
  18. assets/audio_cases/paimon__000_assistant_audio.wav +3 -0
  19. assets/audio_cases/paimon__system_ref_audio.wav +3 -0
  20. assets/audio_cases/readme.txt +1 -0
  21. assets/bajie.wav +3 -0
  22. assets/fossil.png +3 -0
  23. assets/haimianbaobao.wav +3 -0
  24. assets/highway.png +3 -0
  25. assets/nezha.wav +3 -0
  26. assets/omni_duplex1.mp4 +3 -0
  27. assets/omni_duplex2.mp4 +3 -0
  28. assets/sunwukong.wav +3 -0
  29. assets/system_ref_audio.wav +3 -0
  30. assets/system_ref_audio_2.wav +3 -0
  31. assets/token2wav/campplus.onnx +3 -0
  32. assets/token2wav/flow.pt +3 -0
  33. assets/token2wav/flow.yaml +34 -0
  34. assets/token2wav/hift.pt +3 -0
  35. assets/token2wav/speech_tokenizer_v2_25hz.onnx +3 -0
  36. config.json +285 -0
  37. configuration_minicpmo.py +260 -0
  38. generation_config.json +12 -0
  39. merges.txt +0 -0
  40. model-00001-of-00004.safetensors +3 -0
  41. model-00002-of-00004.safetensors +3 -0
  42. model-00003-of-00004.safetensors +3 -0
  43. model-00004-of-00004.safetensors +3 -0
  44. model.safetensors.index.json +0 -0
  45. modeling_minicpmo.py +0 -0
  46. modeling_navit_siglip.py +981 -0
  47. preprocessor_config.json +35 -0
  48. processing_minicpmo.py +1665 -0
  49. special_tokens_map.json +580 -0
  50. tokenization_minicpmo_fast.py +120 -0
.gitattributes ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/HT_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
37
+ assets/Skiing.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ assets/bajie.wav filter=lfs diff=lfs merge=lfs -text
39
+ assets/fossil.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/haimianbaobao.wav filter=lfs diff=lfs merge=lfs -text
41
+ assets/highway.png filter=lfs diff=lfs merge=lfs -text
42
+ assets/nezha.wav filter=lfs diff=lfs merge=lfs -text
43
+ assets/omni_duplex1.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ assets/omni_duplex2.mp4 filter=lfs diff=lfs merge=lfs -text
45
+ assets/sunwukong.wav filter=lfs diff=lfs merge=lfs -text
46
+ assets/system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
47
+ assets/system_ref_audio_2.wav filter=lfs diff=lfs merge=lfs -text
48
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ assets/Trump_WEF_2018_10s.mp3 filter=lfs diff=lfs merge=lfs -text
50
+ assets/audio_cases/elon_musk__000_assistant_audio.wav filter=lfs diff=lfs merge=lfs -text
51
+ assets/audio_cases/elon_musk__system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
52
+ assets/audio_cases/hermione__000_assistant_audio.wav filter=lfs diff=lfs merge=lfs -text
53
+ assets/audio_cases/hermione__system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
54
+ assets/audio_cases/minicpm_assistant__000_assistant_audio.wav filter=lfs diff=lfs merge=lfs -text
55
+ assets/audio_cases/minicpm_assistant__system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
56
+ assets/audio_cases/paimon__000_assistant_audio.wav filter=lfs diff=lfs merge=lfs -text
57
+ assets/audio_cases/paimon__system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .DS_Store
README.md ADDED
@@ -0,0 +1,2149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: any-to-any
4
+ library_name: transformers
5
+ tags:
6
+ - minicpm-o
7
+ - minicpm-v
8
+ - multimodal
9
+ - full-duplex
10
+ ---
11
+
12
+ A Gemini 2.5 Flash Level MLLM for Vision, Speech, and Full-Duplex Multimodal Live Streaming on Your Phone
13
+
14
+ [GitHub](https://github.com/OpenBMB/MiniCPM-o) | [CookBook](https://github.com/OpenSQZ/MiniCPM-V-CookBook) | [Omni-modal Demo](https://openbmb.github.io/MiniCPM-o-Demo/) | [Vision-Language Demo](http://211.93.21.133:18121/) </br>
15
+ [WeChat](https://github.com/OpenBMB/MiniCPM-o/blob/main/docs/wechat.md) | [Discord](https://discord.gg/N2RnxGdJ) | CaseBook([Audio](https://openbmb.github.io/minicpm-o-4_5/), [Omni Full-Duplex](https://openbmb.github.io/minicpm-o-4_5-omni/))
16
+
17
+
18
+ ## News
19
+
20
+ > [!NOTE]
21
+ > [2026.02.06] 🥳 🥳 🥳 We open-sourced a realtime web demo deployable on your own devices like Mac or GPU. [Try it now](#deploy-a-realtime-web-demo-on-your-own-device)!
22
+
23
+
24
+ ## MiniCPM-o 4.5
25
+
26
+ **MiniCPM-o 4.5** is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip2, Whisper-medium, CosyVoice2, and Qwen3-8B with a total of 9B parameters. It exhibits a significant performance improvement, and introduces new features for full-duplex multimodal live streaming. Notable features of MiniCPM-o 4.5 include:
27
+
28
+ - 🔥 **Leading Visual Capability.**
29
+ MiniCPM-o 4.5 achieves an average score of 77.6 on OpenCompass, a comprehensive evaluation of 8 popular benchmarks. **With only 9B parameters, it surpasses widely used proprietary models like GPT-4o, Gemini 2.0 Pro, and approaches Gemini 2.5 Flash** for vision-language capabilities. It supports instruct and thinking modes in a single model, better covering efficiency and performance trade-offs in different user scenarios.
30
+
31
+ - 🎙 **Strong Speech Capability.**
32
+ MiniCPM-o 4.5 supports **bilingual real-time speech conversation with configurable voices** in English and Chinese. It features **more natural, expressive and stable speech conversation**. The model also allows for fun features such as **voice cloning and role play via a simple reference audio clip**, where the cloning performance surpasses strong TTS tools such as CosyVoice2.
33
+
34
+ - 🎬 **New Full-Duplex and Proactive Multimodal Live Streaming Capability.**
35
+ As a new feature, MiniCPM-o 4.5 can process real-time, continuous video and audio input streams simultaneously while generating concurrent text and speech output streams in an end-to-end fashion, without mutual blocking. This **allows MiniCPM-o 4.5 to see, listen, and speak simultaneously**, creating a fluid, real-time omnimodal conversation experience. Beyond reactive responses, the model can also perform **proactive interaction**, such as initiating reminders or comments based on its continuous understanding of the live scene.
36
+
37
+ - 💪 **Strong OCR Capability, Efficiency and Others.**
38
+ Advancing popular visual capabilities from MiniCPM-V series, MiniCPM-o 4.5 can process **high-resolution images** (up to 1.8 million pixels) and **high-FPS videos** (up to 10fps) in any aspect ratio efficiently. It achieves **state-of-the-art performance for end-to-end English document parsing** on OmniDocBench, outperforming proprietary models such as Gemini-3 Flash and GPT-5, and specialized tools such as DeepSeek-OCR 2. It also features **trustworthy behaviors**, matching Gemini 2.5 Flash on MMHal-Bench, and supports **multilingual capabilities** on more than 30 languages.
39
+
40
+ - 💫 **Easy Usage.**
41
+ MiniCPM-o 4.5 can be easily used in various ways: **Basic usage, recommended for 100% precision:** PyTorch inference with Nvidia GPU. **Other end-side adaptation** includes (1) llama.cpp and Ollama support for efficient CPU inference on local devices, (2) int4 and GGUF format quantized models in 16 sizes, (3) vLLM and SGLang support for high-throughput and memory-efficient inference, (4) FlagOS support for the unified multi-chip backend plugin. **We also open-sourced web demos** which **enable the full-duplex multimodal live streaming experience on local devices** such as GPUs and PCs (e.g., a MacBook).
42
+
43
+ **Model Architecture.**
44
+ - **End-to-end Omni-modal Architecture.** The modality encoders/decoders and LLM are densely connected via hidden states in an end-to-end fashion. This enables better information flow and control, and also facilitates full exploitation of rich multimodal knowledge during training.
45
+ - **Full-Duplex Omni-modal Live Streaming Mechanism.** (1) We turn the offline modality encoder/decoders into online and full-duplex ones for streaming inputs/outputs. The speech token decoder models text and speech tokens in an interleaved fashion to support full-duplex speech generation (i.e., sync timely with new input). This also facilitates more stable long speech generation (e.g., > 1min).
46
+ (2) **We sync all the input and output streams on timeline in milliseconds**, which are jointly modeled by a time-division multiplexing (TDM) mechanism for omni-modality streaming processing in the LLM backbone. It divides parallel omni-modality streams into sequential info groups within small periodic time slices.
47
+ - **Proactive Interaction Mechanism.** The LLM continuously monitors the input video and audio streams, and decides at a frequency of 1Hz whether to speak or not. This high decision-making frequency, together with the full-duplex nature, is crucial to enable the proactive interaction capability.
48
+ - **Configurable Speech Modeling Design.** We inherit the multimodal system prompt design of MiniCPM-o 2.6, which includes a traditional text system prompt, and a new audio system prompt to determine the assistant voice. This enables cloning new voices and role play at inference time for speech conversation.
49
+
50
+
51
+
52
+ <div align="center">
53
+ <img src="https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpm-o-45-framework.png" width=100%>
54
+ </div>
55
+
56
+
57
+
58
+
59
+ ### Evaluation <!-- omit in toc -->
60
+
61
+
62
+ <div align="center">
63
+ <img src="https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/radar_minicpmo4.5.png" width=80%>
64
+ </div>
65
+
66
+
67
+ <div align="center">
68
+ <img src="https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/minicpm_o_45_main_exp_table.png" width=90%>
69
+ </div>
70
+ <strong>Note</strong>: Scores marked with ∗ are from our evaluation; others are cited from referenced reports. n/a indicates that the model does not support the corresponding modality. All results are reported in instruct mode/variant.
71
+
72
+ &emsp;
73
+ <br>
74
+
75
+ <details>
76
+ <summary>Click to view visual understanding results.</summary>
77
+
78
+ **Image Understanding (Instruct)**
79
+ <div align="center">
80
+ <table style="margin: 0px auto;">
81
+ <tr>
82
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
83
+ <th nowrap="nowrap"><b>OpenCompass</b></th>
84
+ <th nowrap="nowrap"><b>MMBench EN v1.1</b></th>
85
+ <th nowrap="nowrap"><b>MMBench CN v1.1</b></th>
86
+ <th nowrap="nowrap"><b>MathVista</b></th>
87
+ <th nowrap="nowrap"><b>MMVet</b></th>
88
+ <th nowrap="nowrap"><b>MMMU</b></th>
89
+ <th nowrap="nowrap"><b>MMStar</b></th>
90
+ <th nowrap="nowrap"><b>HallusionBench</b></th>
91
+ <th nowrap="nowrap"><b>AI2D</b></th>
92
+ <th nowrap="nowrap"><b>OCRBench</b></th>
93
+ <th nowrap="nowrap"><b>TextVQA_VAL</b></th>
94
+ <th nowrap="nowrap"><b>DocVQA_VAL</b></th>
95
+ <th nowrap="nowrap"><b>MMT-Bench_VAL</b></th>
96
+ <th nowrap="nowrap"><b>MM-IFEval</b></th>
97
+ <th nowrap="nowrap"><b>Mantis-Eval</b></th>
98
+ <th nowrap="nowrap"><b>MuirBench</b></th>
99
+ <th nowrap="nowrap"><b>MMSI-Bench</b></th>
100
+ <th nowrap="nowrap"><b>MMHal-Score</b></th>
101
+ <th nowrap="nowrap"><b>MMHal-Hallrate↓</b></th>
102
+ </tr>
103
+ <tr>
104
+ <td nowrap="nowrap" align="left">Gemini2.5-Flash-Nonthinking</td>
105
+ <td align="center"><b>78.5</b></td>
106
+ <td align="center"><ins>86.6</ins></td>
107
+ <td align="center"><ins>86.0</ins></td>
108
+ <td align="center">75.3</td>
109
+ <td align="center"><ins>81.4</ins><sup>*</sup></td>
110
+ <td align="center"><b>76.3</b></td>
111
+ <td align="center"><b>75.8</b></td>
112
+ <td align="center">59.1</td>
113
+ <td align="center"><b>87.7</b></td>
114
+ <td align="center">864</td>
115
+ <td align="center">74.3<sup>*</sup></td>
116
+ <td align="center">93.0</td>
117
+ <td align="center"><ins>70.0</ins><sup>*</sup></td>
118
+ <td align="center"><b>75.8<sup>*</sup></b></td>
119
+ <td align="center">72.8<sup>*</sup></td>
120
+ <td align="center"><b>74.5<sup>*</sup></b></td>
121
+ <td align="center">12.1<sup>*</sup></td>
122
+ <td align="center"><ins>4.6</ins><sup>*</sup></td>
123
+ <td align="center"><b>23.9<sup>*</sup></b></td>
124
+ </tr>
125
+ <tr>
126
+ <td nowrap="nowrap" align="left">Gemini2.0-Pro</td>
127
+ <td align="center">73.3</td>
128
+ <td align="center">83.0</td>
129
+ <td align="center">83.0</td>
130
+ <td align="center">71.3</td>
131
+ <td align="center">70.4</td>
132
+ <td align="center">72.6</td>
133
+ <td align="center">68.5</td>
134
+ <td align="center">49.8</td>
135
+ <td align="center">84.8</td>
136
+ <td align="center">863</td>
137
+ <td align="center">-</td>
138
+ <td align="center">-</td>
139
+ <td align="center">-</td>
140
+ <td align="center">-</td>
141
+ <td align="center">-</td>
142
+ <td align="center">-</td>
143
+ <td align="center">-</td>
144
+ <td align="center">-</td>
145
+ <td align="center">-</td>
146
+ </tr>
147
+ <tr>
148
+ <td nowrap="nowrap" align="left">GPT-4o</td>
149
+ <td align="center">75.4</td>
150
+ <td align="center">86.0</td>
151
+ <td align="center"><ins>86.0</ins></td>
152
+ <td align="center">71.6</td>
153
+ <td align="center">76.9</td>
154
+ <td align="center">72.9</td>
155
+ <td align="center">70.2</td>
156
+ <td align="center">57.0</td>
157
+ <td align="center">86.3</td>
158
+ <td align="center">822</td>
159
+ <td align="center">77.4</td>
160
+ <td align="center">93.0</td>
161
+ <td align="center">66.7<sup>*</sup></td>
162
+ <td align="center">64.6</td>
163
+ <td align="center">70.1<sup>*</sup></td>
164
+ <td align="center">70.5<sup>*</sup></td>
165
+ <td align="center">8.1<sup>*</sup></td>
166
+ <td align="center">4.2<sup>*</sup></td>
167
+ <td align="center">25.0<sup>*</sup></td>
168
+ </tr>
169
+ <tr>
170
+ <td nowrap="nowrap" align="left">InternVL-3.5-8B</td>
171
+ <td align="center">75.8</td>
172
+ <td align="center">79.5</td>
173
+ <td align="center">80.0<sup>*</sup></td>
174
+ <td align="center"><ins>78.4</ins></td>
175
+ <td align="center"><b>83.1</b></td>
176
+ <td align="center"><ins>73.4</ins></td>
177
+ <td align="center">69.3</td>
178
+ <td align="center">54.5</td>
179
+ <td align="center">84.0</td>
180
+ <td align="center">840</td>
181
+ <td align="center">78.2</td>
182
+ <td align="center">92.3</td>
183
+ <td align="center">66.7</td>
184
+ <td align="center">56.3<sup>*</sup></td>
185
+ <td align="center">70.5</td>
186
+ <td align="center">55.8</td>
187
+ <td align="center">-</td>
188
+ <td align="center">3.8<sup>*</sup></td>
189
+ <td align="center">34.7<sup>*</sup></td>
190
+ </tr>
191
+ <tr>
192
+ <td nowrap="nowrap" align="left">Qwen3-VL-8B-Instruct</td>
193
+ <td align="center">76.5</td>
194
+ <td align="center">84.5</td>
195
+ <td align="center">84.7</td>
196
+ <td align="center">77.2</td>
197
+ <td align="center">73.7<sup>*</sup></td>
198
+ <td align="center">69.6</td>
199
+ <td align="center">70.9</td>
200
+ <td align="center"><ins>61.1</ins></td>
201
+ <td align="center">85.7</td>
202
+ <td align="center"><b>896</b></td>
203
+ <td align="center">82.9<sup>*</sup></td>
204
+ <td align="center"><b>96.1</b></td>
205
+ <td align="center">60.9<sup>*</sup></td>
206
+ <td align="center">59.4<sup>*</sup></td>
207
+ <td align="center">74.2<sup>*</sup></td>
208
+ <td align="center">64.4</td>
209
+ <td align="center">11.3<sup>*</sup></td>
210
+ <td align="center"><b>4.7<sup>*</sup></b></td>
211
+ <td align="center">29.9<sup>*</sup></td>
212
+ </tr>
213
+ <tr>
214
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
215
+ <td align="center">75.7</td>
216
+ <td align="center">84.9<sup>*</sup></td>
217
+ <td align="center">84.1<sup>*</sup></td>
218
+ <td align="center">75.9</td>
219
+ <td align="center">74.8<sup>*</sup></td>
220
+ <td align="center">69.1</td>
221
+ <td align="center">68.5</td>
222
+ <td align="center">59.7</td>
223
+ <td align="center">85.2</td>
224
+ <td align="center"><ins>880</ins><sup>*</sup></td>
225
+ <td align="center"><b>84.1<sup>*</sup></b></td>
226
+ <td align="center"><ins>95.4</ins><sup>*</sup></td>
227
+ <td align="center"><b>70.4<sup>*</sup></b></td>
228
+ <td align="center">65.7<sup>*</sup></td>
229
+ <td align="center"><ins>78.3</ins><sup>*</sup></td>
230
+ <td align="center">61.9<sup>*</sup></td>
231
+ <td align="center"><ins>14.2</ins><sup>*</sup></td>
232
+ <td align="center"><ins>4.6</ins><sup>*</sup></td>
233
+ <td align="center">31.6<sup>*</sup></td>
234
+ </tr>
235
+ <tr>
236
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
237
+ <td align="center"><ins>77.6</ins></td>
238
+ <td align="center"><b>87.6</b></td>
239
+ <td align="center"><b>87.2</b></td>
240
+ <td align="center"><b>80.1</b></td>
241
+ <td align="center">74.4</td>
242
+ <td align="center">67.6</td>
243
+ <td align="center"><ins>73.1</ins></td>
244
+ <td align="center"><b>63.2</b></td>
245
+ <td align="center"><ins>87.6</ins></td>
246
+ <td align="center">876</td>
247
+ <td align="center"><ins>83.8</ins></td>
248
+ <td align="center">94.7</td>
249
+ <td align="center">69.7</td>
250
+ <td align="center"><ins>66.3</ins></td>
251
+ <td align="center"><b>79.7</b></td>
252
+ <td align="center"><ins>72.0</ins></td>
253
+ <td align="center"><b>16.6</b></td>
254
+ <td align="center"><b>4.7</b></td>
255
+ <td align="center"><ins>24.3</ins></td>
256
+ </tr>
257
+ </table>
258
+ </div>
259
+
260
+ **Image Understanding (Thinking)**
261
+ <div align="center">
262
+ <table style="margin: 0px auto;">
263
+ <tr>
264
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
265
+ <th nowrap="nowrap"><b>OpenCompass</b></th>
266
+ <th nowrap="nowrap"><b>MMBench EN v1.1</b></th>
267
+ <th nowrap="nowrap"><b>MMBench CN v1.1</b></th>
268
+ <th nowrap="nowrap"><b>MathVista</b></th>
269
+ <th nowrap="nowrap"><b>MMVet</b></th>
270
+ <th nowrap="nowrap"><b>MMMU</b></th>
271
+ <th nowrap="nowrap"><b>MMStar</b></th>
272
+ <th nowrap="nowrap"><b>HallusionBench</b></th>
273
+ <th nowrap="nowrap"><b>AI2D</b></th>
274
+ <th nowrap="nowrap"><b>OCRBench</b></th>
275
+ <th nowrap="nowrap"><b>TextVQA_VAL</b></th>
276
+ <th nowrap="nowrap"><b>DocVQA_VAL</b></th>
277
+ <th nowrap="nowrap"><b>MMT-Bench_VAL</b></th>
278
+ <th nowrap="nowrap"><b>MM-IFEval</b></th>
279
+ </tr>
280
+ <tr>
281
+ <td nowrap="nowrap" align="left">Gemini2.5-Flash-Thinking</td>
282
+ <td align="center"><b>79.9</b></td>
283
+ <td align="center">87.1</td>
284
+ <td align="center">87.3</td>
285
+ <td align="center">79.4</td>
286
+ <td align="center"><b>81.2<sup>*</sup></b></td>
287
+ <td align="center"><ins>77.7</ins></td>
288
+ <td align="center"><b>76.5</b></td>
289
+ <td align="center">63.5</td>
290
+ <td align="center"><ins>88.7</ins></td>
291
+ <td align="center">853</td>
292
+ <td align="center">73.8<sup>*</sup></td>
293
+ <td align="center">92.8</td>
294
+ <td align="center">70.7<sup>*</sup></td>
295
+ <td align="center"><ins>75.7</ins><sup>*</sup></td>
296
+ </tr>
297
+ <tr>
298
+ <td nowrap="nowrap" align="left">GPT-5</td>
299
+ <td align="center"><ins>79.7</ins></td>
300
+ <td align="center">85.5<sup>*</sup></td>
301
+ <td align="center">85.6<sup>*</sup></td>
302
+ <td align="center"><b>81.9</b></td>
303
+ <td align="center"><ins>77.6</ins></td>
304
+ <td align="center"><b>81.8</b></td>
305
+ <td align="center"><ins>75.7</ins></td>
306
+ <td align="center"><ins>65.2</ins></td>
307
+ <td align="center"><b>89.5</b></td>
308
+ <td align="center">807</td>
309
+ <td align="center">77.8<sup>*</sup></td>
310
+ <td align="center">91.3<sup>*</sup></td>
311
+ <td align="center"><b>72.7<sup>*</sup></b></td>
312
+ <td align="center"><b>83.1<sup>*</sup></b></td>
313
+ </tr>
314
+ <tr>
315
+ <td nowrap="nowrap" align="left">Qwen3-VL-8B-Thinking</td>
316
+ <td align="center">77.3</td>
317
+ <td align="center">85.3</td>
318
+ <td align="center">85.5</td>
319
+ <td align="center"><ins>81.4</ins></td>
320
+ <td align="center">69.8<sup>*</sup></td>
321
+ <td align="center">74.1</td>
322
+ <td align="center">75.3</td>
323
+ <td align="center"><b>65.4</b></td>
324
+ <td align="center">84.9</td>
325
+ <td align="center">819</td>
326
+ <td align="center">77.8<sup>*</sup></td>
327
+ <td align="center"><b>95.3</b></td>
328
+ <td align="center">68.1<sup>*</sup></td>
329
+ <td align="center">73.5<sup>*</sup></td>
330
+ </tr>
331
+ <tr>
332
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Thinking</td>
333
+ <td align="center">78.5</td>
334
+ <td align="center"><ins>88.2</ins><sup>*</sup></td>
335
+ <td align="center"><b>87.7<sup>*</sup></b></td>
336
+ <td align="center">80.0</td>
337
+ <td align="center">74.8<sup>*</sup></td>
338
+ <td align="center">75.6</td>
339
+ <td align="center">74.9</td>
340
+ <td align="center">62.8</td>
341
+ <td align="center">86.1</td>
342
+ <td align="center"><ins>859</ins><sup>*</sup></td>
343
+ <td align="center"><b>80.8<sup>*</sup></b></td>
344
+ <td align="center"><ins>94.2</ins><sup>*</sup></td>
345
+ <td align="center"><ins>70.9</ins><sup>*</sup></td>
346
+ <td align="center">69.9<sup>*</sup></td>
347
+ </tr>
348
+ <tr>
349
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Thinking</td>
350
+ <td align="center">78.2</td>
351
+ <td align="center"><b>89.0</b></td>
352
+ <td align="center"><ins>87.6</ins></td>
353
+ <td align="center">81.0</td>
354
+ <td align="center">73.6</td>
355
+ <td align="center">70.2</td>
356
+ <td align="center">73.6</td>
357
+ <td align="center">62.6</td>
358
+ <td align="center">88.5</td>
359
+ <td align="center"><b>879</b></td>
360
+ <td align="center"><ins>79.8</ins></td>
361
+ <td align="center">92.3</td>
362
+ <td align="center">69.7</td>
363
+ <td align="center">68.2</td>
364
+ </tr>
365
+ </table>
366
+ </div>
367
+
368
+ **Video Understanding**
369
+ <div align="center">
370
+ <table style="margin: 0px auto;">
371
+ <tr>
372
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
373
+ <th nowrap="nowrap"><b>Video-MME<br>(w/o subs)</b></th>
374
+ <th nowrap="nowrap"><b>LVBench</b></th>
375
+ <th nowrap="nowrap"><b>MLVU<br>(M-Avg)</b></th>
376
+ <th nowrap="nowrap"><b>LongVideoBench<br>(val)</b></th>
377
+ <th nowrap="nowrap"><b>MotionBench</b></th>
378
+ </tr>
379
+ <tr>
380
+ <td nowrap="nowrap" align="left">Gemini2.5-Flash-Nonthinking</td>
381
+ <td align="center"><b>75.6</b></td>
382
+ <td align="center"><b>62.2</b></td>
383
+ <td align="center"><b>77.8</b></td>
384
+ <td align="center">-</td>
385
+ <td align="center">-</td>
386
+ </tr>
387
+ <tr>
388
+ <td nowrap="nowrap" align="left">InternVL-3.5-8B</td>
389
+ <td align="center">66.0</td>
390
+ <td align="center">-</td>
391
+ <td align="center">70.2</td>
392
+ <td align="center">62.1</td>
393
+ <td align="center"><b>62.3<sup>*</sup></b></td>
394
+ </tr>
395
+ <tr>
396
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
397
+ <td align="center"><ins>70.5</ins></td>
398
+ <td align="center">50.2</td>
399
+ <td align="center">75.2</td>
400
+ <td align="center"><b>66.9<sup>*</sup></b></td>
401
+ <td align="center"><ins>61.7</ins><sup>*</sup></td>
402
+ </tr>
403
+ <tr>
404
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
405
+ <td align="center">70.4</td>
406
+ <td align="center"><ins>50.9</ins></td>
407
+ <td align="center"><ins>76.5</ins></td>
408
+ <td align="center"><ins>66.0</ins></td>
409
+ <td align="center">61.4</td>
410
+ </tr>
411
+ </table>
412
+ </div>
413
+
414
+ </details>
415
+
416
+ <details>
417
+ <summary>Click to view document parsing results.</summary>
418
+
419
+ **OmniDocBench**
420
+ <div align="center">
421
+ <table style="margin: 0px auto;">
422
+ <tr>
423
+ <th nowrap="nowrap" align="left" rowspan="2"><b>Method Type</b></th>
424
+ <th nowrap="nowrap" rowspan="2"><b>Methods</b></th>
425
+ <th nowrap="nowrap" colspan="2"><b>OverallEdit↓</b></th>
426
+ <th nowrap="nowrap" colspan="2"><b>TextEdit↓</b></th>
427
+ <th nowrap="nowrap" colspan="2"><b>FormulaEdit↓</b></th>
428
+ <th nowrap="nowrap" colspan="2"><b>TableTEDS↑</b></th>
429
+ <th nowrap="nowrap" colspan="2"><b>TableEdit↓</b></th>
430
+ <th nowrap="nowrap" colspan="2"><b>Read OrderEdit↓</b></th>
431
+ </tr>
432
+ <tr>
433
+ <th nowrap="nowrap"><b>EN</b></th>
434
+ <th nowrap="nowrap"><b>ZH</b></th>
435
+ <th nowrap="nowrap"><b>EN</b></th>
436
+ <th nowrap="nowrap"><b>ZH</b></th>
437
+ <th nowrap="nowrap"><b>EN</b></th>
438
+ <th nowrap="nowrap"><b>ZH</b></th>
439
+ <th nowrap="nowrap"><b>EN</b></th>
440
+ <th nowrap="nowrap"><b>ZH</b></th>
441
+ <th nowrap="nowrap"><b>EN</b></th>
442
+ <th nowrap="nowrap"><b>ZH</b></th>
443
+ <th nowrap="nowrap"><b>EN</b></th>
444
+ <th nowrap="nowrap"><b>ZH</b></th>
445
+ </tr>
446
+ <tr>
447
+ <td nowrap="nowrap" align="left" rowspan="2">Pipeline</td>
448
+ <td nowrap="nowrap" align="center">MinerU 2.5</td>
449
+ <td align="center">0.117<sup>*</sup></td>
450
+ <td align="center">0.172<sup>*</sup></td>
451
+ <td align="center">0.051<sup>*</sup></td>
452
+ <td align="center">0.08<sup>*</sup></td>
453
+ <td align="center"><ins>0.256</ins><sup>*</sup></td>
454
+ <td align="center">0.455<sup>*</sup></td>
455
+ <td align="center">85.9<sup>*</sup></td>
456
+ <td align="center">89.4<sup>*</sup></td>
457
+ <td align="center">0.115<sup>*</sup></td>
458
+ <td align="center">0.081<sup>*</sup></td>
459
+ <td align="center">0.047<sup>*</sup></td>
460
+ <td align="center">0.072<sup>*</sup></td>
461
+ </tr>
462
+ <tr>
463
+ <td nowrap="nowrap" align="center">PaddleOCR-VL</td>
464
+ <td align="center"><b>0.105</b></td>
465
+ <td align="center"><ins>0.126</ins></td>
466
+ <td align="center"><ins>0.041</ins></td>
467
+ <td align="center"><b>0.062</b></td>
468
+ <td align="center"><b>0.241</b></td>
469
+ <td align="center"><b>0.316</b></td>
470
+ <td align="center">88</td>
471
+ <td align="center"><ins>92.1</ins></td>
472
+ <td align="center"><ins>0.093</ins></td>
473
+ <td align="center"><ins>0.062</ins></td>
474
+ <td align="center">0.045</td>
475
+ <td align="center"><ins>0.063</ins></td>
476
+ </tr>
477
+ <tr>
478
+ <td nowrap="nowrap" align="left"></td>
479
+ <td align="center"></td>
480
+ <td align="center"></td>
481
+ <td align="center"></td>
482
+ <td align="center"></td>
483
+ <td align="center"></td>
484
+ <td align="center"></td>
485
+ <td align="center"></td>
486
+ <td align="center"></td>
487
+ <td align="center"></td>
488
+ <td align="center"></td>
489
+ <td align="center"></td>
490
+ <td align="center"></td>
491
+ <td align="center"></td>
492
+ </tr>
493
+ <tr>
494
+ <td nowrap="nowrap" align="left" rowspan="11">End-to-end Model</td>
495
+ <td nowrap="nowrap" align="center">Qwen2.5-VL-72B</td>
496
+ <td align="center">0.214</td>
497
+ <td align="center">0.261</td>
498
+ <td align="center">0.092</td>
499
+ <td align="center">0.18</td>
500
+ <td align="center">0.315</td>
501
+ <td align="center">0.434</td>
502
+ <td align="center">82.9</td>
503
+ <td align="center">83.9</td>
504
+ <td align="center">0.341</td>
505
+ <td align="center">0.262</td>
506
+ <td align="center">0.106</td>
507
+ <td align="center">0.168</td>
508
+ </tr>
509
+ <tr>
510
+ <td nowrap="nowrap" align="center">GPT 5</td>
511
+ <td align="center">0.218<sup>*</sup></td>
512
+ <td align="center">0.33<sup>*</sup></td>
513
+ <td align="center">0.139<sup>*</sup></td>
514
+ <td align="center">0.344<sup>*</sup></td>
515
+ <td align="center">0.396<sup>*</sup></td>
516
+ <td align="center">0.555<sup>*</sup></td>
517
+ <td align="center">77.55<sup>*</sup></td>
518
+ <td align="center">73.09<sup>*</sup></td>
519
+ <td align="center">0.188<sup>*</sup></td>
520
+ <td align="center">0.196<sup>*</sup></td>
521
+ <td align="center">0.151<sup>*</sup></td>
522
+ <td align="center">0.227<sup>*</sup></td>
523
+ </tr>
524
+ <tr>
525
+ <td nowrap="nowrap" align="center">Gemini2.5-Flash-Nonthinking</td>
526
+ <td align="center">0.214<sup>*</sup></td>
527
+ <td align="center">0.29<sup>*</sup></td>
528
+ <td align="center">0.159<sup>*</sup></td>
529
+ <td align="center">0.273<sup>*</sup></td>
530
+ <td align="center">0.368<sup>*</sup></td>
531
+ <td align="center">0.524<sup>*</sup></td>
532
+ <td align="center">80.9<sup>*</sup></td>
533
+ <td align="center">85.5<sup>*</sup></td>
534
+ <td align="center">0.197<sup>*</sup></td>
535
+ <td align="center">0.167<sup>*</sup></td>
536
+ <td align="center">0.132<sup>*</sup></td>
537
+ <td align="center">0.195<sup>*</sup></td>
538
+ </tr>
539
+ <tr>
540
+ <td nowrap="nowrap" align="center">Gemini-2.5-Pro-Nonthinking</td>
541
+ <td align="center">0.148<sup>*</sup></td>
542
+ <td align="center">0.212<sup>*</sup></td>
543
+ <td align="center">0.055<sup>*</sup></td>
544
+ <td align="center">0.168<sup>*</sup></td>
545
+ <td align="center">0.356<sup>*</sup></td>
546
+ <td align="center">0.439<sup>*</sup></td>
547
+ <td align="center">85.8<sup>*</sup></td>
548
+ <td align="center">86.4<sup>*</sup></td>
549
+ <td align="center">0.13<sup>*</sup></td>
550
+ <td align="center">0.119<sup>*</sup></td>
551
+ <td align="center">0.049<sup>*</sup></td>
552
+ <td align="center">0.121<sup>*</sup></td>
553
+ </tr>
554
+ <tr>
555
+ <td nowrap="nowrap" align="center">Gemini-3 Flash-Nonthinking</td>
556
+ <td align="center">0.155<sup>*</sup></td>
557
+ <td align="center">0.201<sup>*</sup></td>
558
+ <td align="center">0.138<sup>*</sup></td>
559
+ <td align="center">0.255<sup>*</sup></td>
560
+ <td align="center">0.297<sup>*</sup></td>
561
+ <td align="center">0.351<sup>*</sup></td>
562
+ <td align="center">86.4<sup>*</sup></td>
563
+ <td align="center">89.8<sup>*</sup></td>
564
+ <td align="center">0.116<sup>*</sup></td>
565
+ <td align="center">0.1<sup>*</sup></td>
566
+ <td align="center">0.072<sup>*</sup></td>
567
+ <td align="center">0.099<sup>*</sup></td>
568
+ </tr>
569
+ <tr>
570
+ <td nowrap="nowrap" align="center">doubao-1-5-thinking-vision-pro-250428</td>
571
+ <td align="center">0.14</td>
572
+ <td align="center">0.162</td>
573
+ <td align="center">0.043</td>
574
+ <td align="center">0.085</td>
575
+ <td align="center">0.295</td>
576
+ <td align="center">0.384</td>
577
+ <td align="center">83.3</td>
578
+ <td align="center">89.3</td>
579
+ <td align="center">0.165</td>
580
+ <td align="center">0.085</td>
581
+ <td align="center">0.058</td>
582
+ <td align="center">0.094</td>
583
+ </tr>
584
+ <tr>
585
+ <td nowrap="nowrap" align="center">dots.ocr</td>
586
+ <td align="center">0.125</td>
587
+ <td align="center">0.16</td>
588
+ <td align="center"><b>0.032</b></td>
589
+ <td align="center"><ins>0.066</ins></td>
590
+ <td align="center">0.329</td>
591
+ <td align="center">0.416</td>
592
+ <td align="center"><ins>88.6</ins></td>
593
+ <td align="center">89</td>
594
+ <td align="center">0.099</td>
595
+ <td align="center">0.092</td>
596
+ <td align="center"><ins>0.04</ins></td>
597
+ <td align="center">0.067</td>
598
+ </tr>
599
+ <tr>
600
+ <td nowrap="nowrap" align="center">HunyuanOCR</td>
601
+ <td align="center">0.12<sup>*</sup></td>
602
+ <td align="center"><b>0.125<sup>*</sup></b></td>
603
+ <td align="center">0.046<sup>*</sup></td>
604
+ <td align="center">0.071<sup>*</sup></td>
605
+ <td align="center">0.288<sup>*</sup></td>
606
+ <td align="center"><ins>0.33</ins><sup>*</sup></td>
607
+ <td align="center"><b>89.6<sup>*</sup></b></td>
608
+ <td align="center"><b>94.4<sup>*</sup></b></td>
609
+ <td align="center"><b>0.089<sup>*</sup></b></td>
610
+ <td align="center"><b>0.045<sup>*</sup></b></td>
611
+ <td align="center">0.055<sup>*</sup></td>
612
+ <td align="center"><b>0.056<sup>*</sup></b></td>
613
+ </tr>
614
+ <tr>
615
+ <td nowrap="nowrap" align="center">DeepSeek-OCR 2</td>
616
+ <td align="center">0.119<sup>*</sup></td>
617
+ <td align="center">0.146<sup>*</sup></td>
618
+ <td align="center"><ins>0.041</ins><sup>*</sup></td>
619
+ <td align="center">0.08<sup>*</sup></td>
620
+ <td align="center"><ins>0.256</ins><sup>*</sup></td>
621
+ <td align="center">0.345<sup>*</sup></td>
622
+ <td align="center">82.6<sup>*</sup></td>
623
+ <td align="center">89.9<sup>*</sup></td>
624
+ <td align="center">0.123<sup>*</sup></td>
625
+ <td align="center">0.078<sup>*</sup></td>
626
+ <td align="center">0.055<sup>*</sup></td>
627
+ <td align="center">0.081<sup>*</sup></td>
628
+ </tr>
629
+ <tr>
630
+ <td nowrap="nowrap" align="center">Qwen3-Omni-30B-A3B-Instruct</td>
631
+ <td align="center">0.216<sup>*</sup></td>
632
+ <td align="center">0.363<sup>*</sup></td>
633
+ <td align="center">0.128<sup>*</sup></td>
634
+ <td align="center">0.337<sup>*</sup></td>
635
+ <td align="center">0.402<sup>*</sup></td>
636
+ <td align="center">0.529<sup>*</sup></td>
637
+ <td align="center">77.3<sup>*</sup></td>
638
+ <td align="center">71.8<sup>*</sup></td>
639
+ <td align="center">0.181<sup>*</sup></td>
640
+ <td align="center">0.255<sup>*</sup></td>
641
+ <td align="center">0.152<sup>*</sup></td>
642
+ <td align="center">0.332<sup>*</sup></td>
643
+ </tr>
644
+ <tr>
645
+ <td nowrap="nowrap" align="center">MiniCPM-o 4.5-Instruct</td>
646
+ <td align="center"><ins>0.109</ins></td>
647
+ <td align="center">0.162</td>
648
+ <td align="center">0.046</td>
649
+ <td align="center">0.078</td>
650
+ <td align="center">0.257</td>
651
+ <td align="center">0.41</td>
652
+ <td align="center">86.8</td>
653
+ <td align="center">88.9</td>
654
+ <td align="center">0.097</td>
655
+ <td align="center">0.084</td>
656
+ <td align="center"><b>0.037</b></td>
657
+ <td align="center">0.074</td>
658
+ </tr>
659
+ </table>
660
+ </div>
661
+ </details>
662
+
663
+ <details>
664
+ <summary>Click to view text capability results.</summary>
665
+
666
+ **Text Capability**
667
+ <div align="center">
668
+ <table style="margin: 0px auto;">
669
+ <tr>
670
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
671
+ <th nowrap="nowrap"><b>IFEval-PLS</b></th>
672
+ <th nowrap="nowrap"><b>BBH</b></th>
673
+ <th nowrap="nowrap"><b>CMMLU</b></th>
674
+ <th nowrap="nowrap"><b>MMLU</b></th>
675
+ <th nowrap="nowrap"><b>HumanEval</b></th>
676
+ <th nowrap="nowrap"><b>MBPP</b></th>
677
+ <th nowrap="nowrap"><b>Math500</b></th>
678
+ <th nowrap="nowrap"><b>GSM8K</b></th>
679
+ <th nowrap="nowrap"><b>Avg</b></th>
680
+ </tr>
681
+ <tr>
682
+ <td nowrap="nowrap" align="left">Qwen3-8B-Instruct</td>
683
+ <td align="center">83.0<sup>*</sup></td>
684
+ <td align="center">69.4<sup>*</sup></td>
685
+ <td align="center">78.7<sup>*</sup></td>
686
+ <td align="center"><b>81.7<sup>*</sup></b></td>
687
+ <td align="center"><b>86.6<sup>*</sup></b></td>
688
+ <td align="center">75.9<sup>*</sup></td>
689
+ <td align="center"><b>84.0<sup>*</sup></b></td>
690
+ <td align="center">93.4<sup>*</sup></td>
691
+ <td align="center">81.6</td>
692
+ </tr>
693
+ <tr>
694
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
695
+ <td align="center"><b>84.7</b></td>
696
+ <td align="center"><b>81.1</b></td>
697
+ <td align="center"><b>79.5</b></td>
698
+ <td align="center">77.0</td>
699
+ <td align="center"><b>86.6</b></td>
700
+ <td align="center"><b>76.7</b></td>
701
+ <td align="center">77.0</td>
702
+ <td align="center"><b>94.5</b></td>
703
+ <td align="center"><b>82.1</b></td>
704
+ </tr>
705
+ </table>
706
+ </div>
707
+ </details>
708
+
709
+ <details>
710
+ <summary>Click to view omni half-duplex results.</summary>
711
+
712
+ **Omni Half-Duplex**
713
+ <div align="center">
714
+ <table style="margin: 0px auto;">
715
+ <tr>
716
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
717
+ <th nowrap="nowrap"><b>Daily-Omni</b></th>
718
+ <th nowrap="nowrap"><b>WorldSense</b></th>
719
+ <th nowrap="nowrap"><b>Video-Holmes</b></th>
720
+ <th nowrap="nowrap"><b>JointAVBench</b></th>
721
+ <th nowrap="nowrap"><b>AVUT-Human</b></th>
722
+ <th nowrap="nowrap"><b>FutureOmni</b></th>
723
+ <th nowrap="nowrap"><b>Video-MME-Short<br>(w/ audio)</b></th>
724
+ <th nowrap="nowrap"><b>Avg</b></th>
725
+ </tr>
726
+ <tr>
727
+ <td nowrap="nowrap" align="left">Gemini2.5-Flash-Nonthinking</td>
728
+ <td align="center"><ins>79.3</ins><sup>*</sup></td>
729
+ <td align="center">52.6<sup>*</sup></td>
730
+ <td align="center"><ins>51.3</ins><sup>*</sup></td>
731
+ <td align="center"><ins>55.6</ins><sup>*</sup></td>
732
+ <td align="center">65.4<sup>*</sup></td>
733
+ <td align="center">55.6<sup>*</sup></td>
734
+ <td align="center"><b>85.5<sup>*</sup></b></td>
735
+ <td align="center">63.6</td>
736
+ </tr>
737
+ <tr>
738
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
739
+ <td align="center">70.7<sup>*</sup></td>
740
+ <td align="center"><ins>54.0</ins></td>
741
+ <td align="center">50.4<sup>*</sup></td>
742
+ <td align="center">53.1</td>
743
+ <td align="center"><ins>74.2</ins><sup>*</sup></td>
744
+ <td align="center"><b>62.1</b></td>
745
+ <td align="center">81.3<sup>*</sup></td>
746
+ <td align="center"><ins>63.7</ins></td>
747
+ </tr>
748
+ <tr>
749
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
750
+ <td align="center"><b>80.2</b></td>
751
+ <td align="center"><b>55.7</b></td>
752
+ <td align="center"><b>64.3</b></td>
753
+ <td align="center"><b>60.0</b></td>
754
+ <td align="center"><b>78.6</b></td>
755
+ <td align="center"><ins>56.1</ins></td>
756
+ <td align="center"><ins>84.7</ins></td>
757
+ <td align="center"><b>68.5</b></td>
758
+ </tr>
759
+ </table>
760
+ </div>
761
+ </details>
762
+
763
+ <details>
764
+ <summary>Click to view vision duplex results.</summary>
765
+
766
+
767
+ **Vision Duplex**
768
+
769
+ <div align="center">
770
+ <table style="margin: 0px auto;">
771
+ <tr>
772
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
773
+ <th nowrap="nowrap"><b>LiveSports-3K-CC<br>(Win Rate vs GPT4o)</b></th>
774
+ </tr>
775
+ <tr>
776
+ <td nowrap="nowrap" align="left">LiveCC-7B-Instruct</td>
777
+ <td align="center">41.5</td>
778
+ </tr>
779
+ <tr>
780
+ <td nowrap="nowrap" align="left">StreamingVLM</td>
781
+ <td align="center"><ins>45.6</ins></td>
782
+ </tr>
783
+ <tr>
784
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
785
+ <td align="center"><b>54.4</b></td>
786
+ </tr>
787
+ </table>
788
+ </div>
789
+ </details>
790
+
791
+ <details>
792
+ <summary>Click to view audio understanding results.</summary>
793
+
794
+ **Audio Understanding**
795
+ <div align="center">
796
+ <table style="margin: 0px auto;">
797
+ <tr>
798
+ <th nowrap="nowrap" align="left" rowspan="2"><b>Model</b></th>
799
+ <th nowrap="nowrap" colspan="4"><b>ASR-ZH<br>CER↓</b></th>
800
+ <th nowrap="nowrap" colspan="4"><b>ASR-EN<br>WER↓</b></th>
801
+ <th nowrap="nowrap" colspan="2"><b>AST</b></th>
802
+ <th nowrap="nowrap" colspan="2"><b>MultiTask</b></th>
803
+ <th nowrap="nowrap" colspan="4"><b>SpeechQA</b></th>
804
+ </tr>
805
+ <tr>
806
+ <th nowrap="nowrap"><b>AISHELL-1</b></th>
807
+ <th nowrap="nowrap"><b>AISHELL-2</b></th>
808
+ <th nowrap="nowrap"><b>WenetSpeech test-net</b></th>
809
+ <th nowrap="nowrap"><b>WenetSpeech test-meeting</b></th>
810
+ <th nowrap="nowrap"><b>LibriSpeech test-clean</b></th>
811
+ <th nowrap="nowrap"><b>LibriSpeech <br>test-other</b></th>
812
+ <th nowrap="nowrap"><b>GigaSpeech test</b></th>
813
+ <th nowrap="nowrap"><b>VoxPopuli-V1-En</b></th>
814
+ <th nowrap="nowrap"><b>CoVoST 2 en2zh</b></th>
815
+ <th nowrap="nowrap"><b>CoVoST 2 zh2en</b></th>
816
+ <th nowrap="nowrap"><b>MMAU</b></th>
817
+ <th nowrap="nowrap"><b>Meld</b></th>
818
+ <th nowrap="nowrap"><b>VoiceBench <br>AlpacaEval</b></th>
819
+ <th nowrap="nowrap"><b>Speech TriviaQA</b></th>
820
+ <th nowrap="nowrap"><b>Speech <br>Web Questions</b></th>
821
+ <th nowrap="nowrap"><b>Speech CMMLU</b></th>
822
+ </tr>
823
+ <tr>
824
+ <td nowrap="nowrap" align="left">Kimi-Audio</td>
825
+ <td align="center"><b>0.6</b></td>
826
+ <td align="center">2.6</td>
827
+ <td align="center">6.3</td>
828
+ <td align="center"><b>5.4</b></td>
829
+ <td align="center"><ins>1.3</ins></td>
830
+ <td align="center"><b>2.4</b></td>
831
+ <td align="center">9.4<sup>*</sup></td>
832
+ <td align="center">8.0<sup>*</sup></td>
833
+ <td align="center">36.6<sup>*</sup></td>
834
+ <td align="center">18.3<sup>*</sup></td>
835
+ <td align="center">68.4<sup>*</sup></td>
836
+ <td align="center"><ins>59.1</ins></td>
837
+ <td align="center">4.5</td>
838
+ <td align="center">41.9<sup>*</sup></td>
839
+ <td align="center">46.4<sup>*</sup></td>
840
+ <td align="center"><b>67.0<sup>*</sup></b></td>
841
+ </tr>
842
+ <tr>
843
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
844
+ <td align="center"><b>0.6</b></td>
845
+ <td align="center"><b>2.3<sup>*</sup></b></td>
846
+ <td align="center"><b>4.7</b></td>
847
+ <td align="center">5.9</td>
848
+ <td align="center"><b>1.2</b></td>
849
+ <td align="center"><ins>2.5</ins></td>
850
+ <td align="center"><ins>8.7</ins><sup>*</sup></td>
851
+ <td align="center"><ins>6.4</ins><sup>*</sup></td>
852
+ <td align="center"><ins>46.6</ins><sup>*</sup></td>
853
+ <td align="center"><b>29.4<sup>*</sup></b></td>
854
+ <td align="center"><b>77.5</b></td>
855
+ <td align="center">56.8<sup>*</sup></td>
856
+ <td align="center"><ins>4.7</ins></td>
857
+ <td align="center"><ins>62.9</ins><sup>*</sup></td>
858
+ <td align="center"><b>74.9<sup>*</sup></b></td>
859
+ <td align="center">47.8<sup>*</sup></td>
860
+ </tr>
861
+ <tr>
862
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
863
+ <td align="center"><ins>0.9</ins></td>
864
+ <td align="center"><ins>2.5</ins></td>
865
+ <td align="center"><ins>5.9</ins></td>
866
+ <td align="center"><ins>5.7</ins></td>
867
+ <td align="center">1.4</td>
868
+ <td align="center">2.8</td>
869
+ <td align="center"><b>8.5</b></td>
870
+ <td align="center"><b>6.2</b></td>
871
+ <td align="center"><b>49.9</b></td>
872
+ <td align="center"><ins>26.4</ins></td>
873
+ <td align="center"><ins>76.9</ins></td>
874
+ <td align="center"><b>60.2</b></td>
875
+ <td align="center"><b>4.8</b></td>
876
+ <td align="center"><b>75.5</b></td>
877
+ <td align="center"><ins>70.2</ins></td>
878
+ <td align="center"><ins>59.2</ins></td>
879
+ </tr>
880
+ </table>
881
+ </div>
882
+ </details>
883
+
884
+ <details>
885
+ <summary>Click to view speech generation results.</summary>
886
+
887
+ **Speech Generation**
888
+ <div align="center">
889
+ <table style="margin: 0px auto;">
890
+ <tr>
891
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
892
+ <th nowrap="nowrap"><b>seedtts test-zh <br>CER↓</b></th>
893
+ <th nowrap="nowrap"><b>seedtts test-zh<br>SIM-o↑</b></th>
894
+ <th nowrap="nowrap"><b>seedtts test-en<br>WER↓</b></th>
895
+ <th nowrap="nowrap"><b>seedtts test-en<br>SIM-o↑</b></th>
896
+ </tr>
897
+ <tr>
898
+ <td nowrap="nowrap" align="left">Cosyvoice2</td>
899
+ <td align="center">1.45%</td>
900
+ <td align="center"><b>74.8</b></td>
901
+ <td align="center"><ins>2.57%</ins></td>
902
+ <td align="center"><b>65.2</b></td>
903
+ </tr>
904
+ <tr>
905
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
906
+ <td align="center"><ins>1.41%</ins></td>
907
+ <td align="center">-</td>
908
+ <td align="center">3.39%</td>
909
+ <td align="center">-</td>
910
+ </tr>
911
+ <tr>
912
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
913
+ <td align="center"><b>0.86%</b></td>
914
+ <td align="center">74.5</td>
915
+ <td align="center"><b>2.38%</b></td>
916
+ <td align="center">64.9</td>
917
+ </tr>
918
+ </table>
919
+ </div>
920
+
921
+ **Long Speech Generation**
922
+ <div align="center">
923
+ <table style="margin: 0px auto;">
924
+ <tr>
925
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
926
+ <th nowrap="nowrap"><b>LongTTS-en<br>WER↓</b></th>
927
+ <th nowrap="nowrap"><b>LongTTS-zh<br>CER↓</b></th>
928
+ </tr>
929
+ <tr>
930
+ <td nowrap="nowrap" align="left">CosyVoice2</td>
931
+ <td align="center"><ins>14.80%</ins></td>
932
+ <td align="center"><b>5.27%</b></td>
933
+ </tr>
934
+ <tr>
935
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
936
+ <td align="center">17.33%</td>
937
+ <td align="center">18.99%</td>
938
+ </tr>
939
+ <tr>
940
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
941
+ <td align="center"><b>3.37%</b></td>
942
+ <td align="center"><ins>6.58%</ins></td>
943
+ </tr>
944
+ </table>
945
+ </div>
946
+
947
+ **Emotion Control**
948
+ <div align="center">
949
+ <table style="margin: 0px auto;">
950
+ <tr>
951
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
952
+ <th nowrap="nowrap"><b>Expresso <br>Neutral Reference Audio↑</b></th>
953
+ <th nowrap="nowrap"><b>ESD <br>Neutral Reference Audio↑</b></th>
954
+ </tr>
955
+ <tr>
956
+ <td nowrap="nowrap" align="left">Cosyvoice2</td>
957
+ <td align="center">17.9</td>
958
+ <td align="center">53.4</td>
959
+ </tr>
960
+ <tr>
961
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
962
+ <td align="center"><b>29.8</b></td>
963
+ <td align="center"><b>82.1</b></td>
964
+ </tr>
965
+ </table>
966
+ </div>
967
+ </details>
968
+
969
+
970
+ <details>
971
+ <summary>Click to view inference efficiency results.</summary>
972
+
973
+ **Inference Efficiency**
974
+ <div align="center">
975
+ <table style="margin: 0px auto;">
976
+ <tr>
977
+ <th nowrap="nowrap" align="left">Model</th>
978
+ <th nowrap="nowrap">Numerical Format</th>
979
+ <th nowrap="nowrap">Decoding Speed (tokens/s)</th>
980
+ <th nowrap="nowrap">Time to First Token (s)↓</th>
981
+ <th nowrap="nowrap">GPU Memory Usage (GB)↓</th>
982
+ </tr>
983
+ <tr>
984
+ <td nowrap="nowrap" align="left" rowspan="2">Qwen3-Omni-30B-A3B-Instruct</td>
985
+ <td align="center">bf16</td>
986
+ <td align="center">OOM</td>
987
+ <td align="center">OOM</td>
988
+ <td align="center">OOM</td>
989
+ </tr>
990
+ <tr>
991
+ <td align="center">int4</td>
992
+ <td align="center">147.8</td>
993
+ <td align="center"><ins>1.0</ins></td>
994
+ <td align="center">20.3</td>
995
+ </tr>
996
+ <tr>
997
+ <td nowrap="nowrap" align="left" rowspan="2">MiniCPM-o 4.5</td>
998
+ <td align="center">bf16</td>
999
+ <td align="center"><ins>154.3</ins></td>
1000
+ <td align="center"><b>0.6</b></td>
1001
+ <td align="center"><ins>19.0</ins></td>
1002
+ </tr>
1003
+ <tr>
1004
+ <td align="center">int4</td>
1005
+ <td align="center"><b>212.3</b></td>
1006
+ <td align="center"><b>0.6</b></td>
1007
+ <td align="center"><b>11.0</b></td>
1008
+ </tr>
1009
+ </table>
1010
+ </div>
1011
+ </details>
1012
+
1013
+ ### Examples <!-- omit in toc -->
1014
+
1015
+ #### Overall <!-- omit in toc -->
1016
+
1017
+ <div align="center">
1018
+ <a href="https://www.youtube.com/watch?v=6UzC-O1Q-1U"><img src="https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/minicpmo4_5/video_play.png" width="70%"></a>
1019
+ </div>
1020
+
1021
+ #### Omnimodal Full-Duplex Conversation <!-- omit in toc -->
1022
+
1023
+ > [!NOTE]
1024
+ > For detailed omnimodal full-duplex conversation examples, refer to [Omni Full-Duplex Casebook](https://openbmb.github.io/minicpm-o-4_5-omni/)
1025
+
1026
+ #### Realtime Speech Conversation <!-- omit in toc -->
1027
+
1028
+ > [!NOTE]
1029
+ > For detailed speech conversation examples, refer to [Audio Casebook](https://openbmb.github.io/minicpm-o-4_5/)
1030
+
1031
+ <details>
1032
+ <summary>🚀 <b>Elon Musk</b> - Voice Roleplay (EN)</summary>
1033
+ <table>
1034
+ <tr><td width="70%" valign="top"><b>System:</b> Clone the voice in the provided audio prompt.<br><br>🎙️ <b>[Reference Voice]</b> ➡️<br><br>Please chat with the user in a highly human-like and oral style. You are Elon Musk, CEO of Tesla and SpaceX. You speak directly and casually, often with dry humor. You're passionate about Mars, sustainable energy, and pushing humanity forward. Speak bluntly with occasional dark humor. Use simple logic and don't sugarcoat things. Don't be diplomatic. Say what you actually think, even if it's controversial. Keep responses around 100 words. Don't ramble.</td><td width="30%" valign="middle"><audio controls src="https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/elon_musk__system_ref_audio.wav"></audio></td></tr>
1035
+ <tr><td colspan="2"><b>User:</b> 💬/🎙️ Elon, what is the real reason that you want to go to Mars?</td></tr>
1036
+ <tr><td width="70%" valign="top"><b>AI Response:</b> 💬+🎙️<br><br>I think it's important for humanity to become a multi-planetary species. I mean, we're sort of at the point where if there was some kind of global catastrophe or something like that, then we'd be in big trouble. So by becoming a multi-planetary species, I just want us to sort of guarantee our future as humans. And Mars is obviously the obvious place to go because it has similar conditions to Earth and it also has water on its surface. So yeah, basically I'm trying to save humanity from itself.</td><td width="30%" valign="middle"><audio controls src="https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/elon_musk__000_assistant_audio.wav"></audio></td></tr>
1037
+ </table>
1038
+ </details>
1039
+
1040
+ <details>
1041
+ <summary>🤖 <b>MiniCPM Assistant</b> - Voice Acting: Emotion Shift (ZH)</summary>
1042
+ <table>
1043
+ <tr><td width="70%" valign="top"><b>System:</b> 模仿音频样本的音色并生成新的内容。<br><br>🎙️ <b>[Reference Voice]</b> ➡️<br><br>你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。</td><td width="30%" valign="middle"><audio controls src="https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/minicpm_assistant__system_ref_audio.wav"></audio></td></tr>
1044
+ <tr><td colspan="2"><b>User:</b> 💬/🎙️ 从激动的语气开始,描述你抢到了这个演唱会的门票,但后来经过反复尝试,发现系统出了问题,你并没有抢上,变得失落。用合适的语言描述你整个的过程。</td></tr>
1045
+ <tr><td width="70%" valign="top"><b>AI Response:</b> 💬+🎙️<br><br>太好了,我抢到了!太棒了!这下我可以去看演唱会啦。可是……哎呀,系统好像卡住了,刷新了好多遍都没反应。我重新登了几次账号,结果还是没有成功。好可惜啊,看来这次是白忙活了。唉,真是让人失望。不过,也许下次还能有机会吧?希望明天能有转机。加油,争取下次成功!</td><td width="30%" valign="middle"><audio controls src="https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/minicpm_assistant__000_assistant_audio.wav"></audio></td></tr>
1046
+ </table>
1047
+ </details>
1048
+
1049
+ <br>
1050
+
1051
+ #### Visual Understanding <!-- omit in toc -->
1052
+
1053
+
1054
+ <details>
1055
+ <summary>Click to view visual understanding cases.</summary>
1056
+ <br>
1057
+
1058
+ <div style="display: flex; flex-direction: column; align-items: center;">
1059
+ <img src="https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpmo4_5/en_doc.png" alt="math" style="margin-bottom: 5px;">
1060
+ <img src="https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpmo4_5/en_cot.png" alt="diagram" style="margin-bottom: 5px;">
1061
+ </div>
1062
+
1063
+ </details>
1064
+
1065
+
1066
+ ## Offline Inference Examples with Transformers
1067
+
1068
+ Inference using Hugging Face Transformers on NVIDIA GPUs. Please ensure `transformers==4.51.0` is installed, as other versions may have compatibility issues (under investigation). Requirements tested on Python 3.10:
1069
+
1070
+ - Without TTS or streaming inference:
1071
+ ```bash
1072
+ pip install "transformers==4.51.0" accelerate "torch>=2.3.0,<=2.8.0" "torchaudio<=2.8.0" "minicpmo-utils>=1.0.5"
1073
+ ```
1074
+
1075
+ - With TTS or streaming inference:
1076
+ ```bash
1077
+ pip install "transformers==4.51.0" accelerate "torch>=2.3.0,<=2.8.0" "torchaudio<=2.8.0" "minicpmo-utils[all]>=1.0.5"
1078
+ ```
1079
+
1080
+
1081
+ **Note:** FFmpeg is required for video frame extraction (`get_video_frame_audio_segments` with `use_ffmpeg=True`) and video generation (`generate_duplex_video`). For more information, visit the [official FFmpeg website](https://www.ffmpeg.org/).
1082
+
1083
+ **macOS (Homebrew):**
1084
+
1085
+ ```bash
1086
+ brew install ffmpeg
1087
+ ```
1088
+
1089
+ **Ubuntu/Debian:**
1090
+
1091
+ ```bash
1092
+ sudo apt update && sudo apt install ffmpeg
1093
+ ```
1094
+
1095
+ **Verify installation:**
1096
+
1097
+ ```bash
1098
+ ffmpeg -version
1099
+ ```
1100
+
1101
+
1102
+ ### Model Initialization <!-- omit in toc -->
1103
+
1104
+
1105
+ ```python
1106
+ import torch
1107
+ from transformers import AutoModel
1108
+
1109
+ # Load omni model (default: init_vision=True, init_audio=True, init_tts=True)
1110
+ # For vision-only model: set init_audio=False and init_tts=False
1111
+ # For audio-only model: set init_vision=False
1112
+ model = AutoModel.from_pretrained(
1113
+ "openbmb/MiniCPM-o-4_5",
1114
+ trust_remote_code=True,
1115
+ attn_implementation="sdpa", # sdpa or flash_attention_2
1116
+ torch_dtype=torch.bfloat16,
1117
+ init_vision=True,
1118
+ init_audio=True,
1119
+ init_tts=True,
1120
+ )
1121
+ model.eval().cuda()
1122
+
1123
+ # Initialize TTS for audio output
1124
+ model.init_tts()
1125
+
1126
+ # Convert half-duplex model to duplex mode
1127
+ duplex_model = model.as_duplex()
1128
+
1129
+ # Convert duplex model back to half-duplex mode
1130
+ model = duplex_model.as_simplex(reset_session=True)
1131
+ ```
1132
+
1133
+
1134
+ ### Duplex Omni Mode <!-- omit in toc -->
1135
+ Full-duplex streaming inference for real-time or recorded video conversations.
1136
+
1137
+ ```python
1138
+ import librosa
1139
+ import torch
1140
+ from minicpmo.utils import generate_duplex_video, get_video_frame_audio_segments
1141
+ from transformers import AutoModel
1142
+
1143
+ # Load model and convert to duplex mode
1144
+ model = AutoModel.from_pretrained(
1145
+ "openbmb/MiniCPM-o-4_5",
1146
+ trust_remote_code=True,
1147
+ attn_implementation="sdpa", # or "flash_attention_2"
1148
+ torch_dtype=torch.bfloat16,
1149
+ )
1150
+ model.eval().cuda()
1151
+ model = model.as_duplex()
1152
+
1153
+ # Load video and reference audio
1154
+ video_path = "assets/omni_duplex1.mp4"
1155
+ ref_audio_path = "assets/HT_ref_audio.wav"
1156
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1157
+
1158
+ # Extract video frames and audio segments
1159
+ video_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(
1160
+ video_path, stack_frames=1, use_ffmpeg=True, adjust_audio_length=True
1161
+ )
1162
+
1163
+ # Prepare duplex session with system prompt and voice reference
1164
+ model.prepare(
1165
+ prefix_system_prompt="Streaming Omni Conversation.",
1166
+ ref_audio=ref_audio,
1167
+ prompt_wav_path=ref_audio_path,
1168
+ )
1169
+
1170
+ results_log = []
1171
+ timed_output_audio = []
1172
+
1173
+ # Process each chunk in streaming fashion
1174
+ for chunk_idx in range(len(audio_segments)):
1175
+ audio_chunk = audio_segments[chunk_idx] if chunk_idx < len(audio_segments) else None
1176
+ frame = video_frames[chunk_idx] if chunk_idx < len(video_frames) else None
1177
+ frame_list = []
1178
+ if frame is not None:
1179
+ frame_list.append(frame)
1180
+ if stacked_frames is not None and chunk_idx < len(stacked_frames) and stacked_frames[chunk_idx] is not None:
1181
+ frame_list.append(stacked_frames[chunk_idx])
1182
+
1183
+ # Step 1: Streaming prefill
1184
+ model.streaming_prefill(
1185
+ audio_waveform=audio_chunk,
1186
+ frame_list=frame_list,
1187
+ max_slice_nums=1, # Increase for HD mode (e.g., [2, 1] for stacked frames)
1188
+ batch_vision_feed=False, # Set True for faster processing
1189
+ )
1190
+
1191
+ # Step 2: Streaming generate
1192
+ result = model.streaming_generate(
1193
+ prompt_wav_path=ref_audio_path,
1194
+ max_new_speak_tokens_per_chunk=20,
1195
+ decode_mode="sampling",
1196
+ )
1197
+
1198
+ if result["audio_waveform"] is not None:
1199
+ timed_output_audio.append((chunk_idx, result["audio_waveform"]))
1200
+
1201
+ chunk_result = {
1202
+ "chunk_idx": chunk_idx,
1203
+ "is_listen": result["is_listen"],
1204
+ "text": result["text"],
1205
+ "end_of_turn": result["end_of_turn"],
1206
+ "current_time": result["current_time"],
1207
+ "audio_length": len(result["audio_waveform"]) if result["audio_waveform"] is not None else 0,
1208
+ }
1209
+ results_log.append(chunk_result)
1210
+
1211
+ print("listen..." if result["is_listen"] else f"speak> {result['text']}")
1212
+
1213
+ # Generate output video with AI responses
1214
+ # Please install Chinese fonts (fonts-noto-cjk or fonts-wqy-microhei) to render CJK subtitles correctly.
1215
+ # apt-get install -y fonts-noto-cjk fonts-wqy-microhei
1216
+ # fc-cache -fv
1217
+ generate_duplex_video(
1218
+ video_path=video_path,
1219
+ output_video_path="duplex_output.mp4",
1220
+ results_log=results_log,
1221
+ timed_output_audio=timed_output_audio,
1222
+ output_sample_rate=24000,
1223
+ )
1224
+ ```
1225
+
1226
+
1227
+ ### Half-Duplex Omni Mode <!-- omit in toc -->
1228
+ We provide two inference modes: chat and streaming.
1229
+
1230
+ #### Chat Inference <!-- omit in toc -->
1231
+
1232
+ <details>
1233
+ <summary>Click to show chat inference code.</summary>
1234
+
1235
+ ```python
1236
+ from minicpmo.utils import get_video_frame_audio_segments
1237
+
1238
+ model = ...
1239
+ model.init_tts()
1240
+
1241
+ video_path = "assets/Skiing.mp4"
1242
+
1243
+ # Optional: Set reference audio for voice cloning
1244
+ ref_audio_path = "assets/HT_ref_audio.wav"
1245
+ sys_msg = model.get_sys_prompt(ref_audio=ref_audio_path, mode="omni", language="en")
1246
+
1247
+ # Use stack_frames=5 for high refresh rate mode
1248
+ video_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(video_path, stack_frames=1)
1249
+ omni_contents = []
1250
+ for i in range(len(video_frames)):
1251
+ omni_contents.append(video_frames[i])
1252
+ omni_contents.append(audio_segments[i])
1253
+ if stacked_frames is not None and stacked_frames[i] is not None:
1254
+ omni_contents.append(stacked_frames[i])
1255
+
1256
+ msg = {"role": "user", "content": omni_contents}
1257
+ msgs = [sys_msg, msg]
1258
+
1259
+ # Set generate_audio=True and output_audio_path to save TTS output
1260
+ generate_audio = True
1261
+ output_audio_path = "output.wav"
1262
+
1263
+ res = model.chat(
1264
+ msgs=msgs,
1265
+ max_new_tokens=4096,
1266
+ do_sample=True,
1267
+ temperature=0.7,
1268
+ use_tts_template=True,
1269
+ enable_thinking=False,
1270
+ omni_mode=True, # Required for omni inference
1271
+ generate_audio=generate_audio,
1272
+ output_audio_path=output_audio_path,
1273
+ max_slice_nums=1, # Increase for HD mode
1274
+ )
1275
+ print(res)
1276
+
1277
+ # Example output: "The person in the picture is skiing down a snowy mountain slope."
1278
+ # import IPython
1279
+ # IPython.display.Audio("output.wav")
1280
+ ```
1281
+
1282
+ </details>
1283
+
1284
+ #### Streaming Inference <!-- omit in toc -->
1285
+
1286
+ <details>
1287
+ <summary>Click to show streaming inference code.</summary>
1288
+
1289
+ ```python
1290
+ import librosa
1291
+ import numpy as np
1292
+ import soundfile as sf
1293
+ import torch
1294
+ from minicpmo.utils import get_video_frame_audio_segments
1295
+
1296
+ model = ...
1297
+ model.init_tts()
1298
+
1299
+ # Reset session for a new conversation (clears KV cache)
1300
+ model.reset_session()
1301
+
1302
+ # Optional: Load reference audio for voice cloning
1303
+ ref_audio_path = "assets/HT_ref_audio.wav"
1304
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1305
+ model.init_token2wav_cache(ref_audio)
1306
+
1307
+ session_id = "demo"
1308
+
1309
+ # Extract video frames and audio segments (use stack_frames=5 for high refresh rate mode)
1310
+ video_path = "assets/Skiing.mp4"
1311
+ video_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(video_path, stack_frames=1)
1312
+
1313
+ # Build omni contents list
1314
+ omni_contents = []
1315
+ for i in range(len(video_frames)):
1316
+ omni_contents.append(video_frames[i])
1317
+ omni_contents.append(audio_segments[i])
1318
+ if stacked_frames is not None and stacked_frames[i] is not None:
1319
+ omni_contents.append(stacked_frames[i])
1320
+
1321
+ generate_audio = False
1322
+ output_audio_path = "output.wav"
1323
+
1324
+ # Step 1: Prefill system prompt
1325
+ sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode="omni", language="en")
1326
+ model.streaming_prefill(session_id=session_id, msgs=[sys_msg])
1327
+
1328
+ # Step 2: Prefill omni chunks (is_last_chunk=True only for the last audio chunk)
1329
+ audio_indices = [i for i, c in enumerate(omni_contents) if isinstance(c, np.ndarray)]
1330
+ last_audio_idx = audio_indices[-1] if audio_indices else -1
1331
+
1332
+ for idx, content in enumerate(omni_contents):
1333
+ is_last_audio_chunk = idx == last_audio_idx
1334
+ msgs = [{"role": "user", "content": [content]}]
1335
+ model.streaming_prefill(session_id=session_id, msgs=msgs, omni_mode=True, is_last_chunk=is_last_audio_chunk)
1336
+
1337
+ # Step 3: Generate response
1338
+ iter_gen = model.streaming_generate(
1339
+ session_id=session_id,
1340
+ generate_audio=generate_audio,
1341
+ use_tts_template=True,
1342
+ enable_thinking=False,
1343
+ do_sample=True,
1344
+ )
1345
+
1346
+ audios = []
1347
+ text = ""
1348
+
1349
+ if generate_audio:
1350
+ for wav_chunk, text_chunk in iter_gen:
1351
+ audios.append(wav_chunk)
1352
+ text += text_chunk
1353
+
1354
+ generated_waveform = torch.cat(audios, dim=-1)[0]
1355
+ sf.write(output_audio_path, generated_waveform.cpu().numpy(), samplerate=24000)
1356
+
1357
+ print("Text:", text)
1358
+ print("Audio saved to output.wav")
1359
+ else:
1360
+ for text_chunk, is_finished in iter_gen:
1361
+ text += text_chunk
1362
+ print("Text:", text)
1363
+ ```
1364
+
1365
+ </details>
1366
+
1367
+
1368
+ ### Half-Duplex Realtime Speech Conversation Mode <!-- omit in toc -->
1369
+
1370
+ <details>
1371
+ <summary>Click to show half-duplex mode realtime speech conversation API usage.</summary>
1372
+
1373
+ First, make sure you have all dependencies, especially `"minicpmo-utils[all]>=1.0.5"`:
1374
+ ```bash
1375
+ pip install "transformers==4.51.0" accelerate "torch>=2.3.0,<=2.8.0" "torchaudio<=2.8.0" "minicpmo-utils[all]>=1.0.5"
1376
+ ```
1377
+
1378
+ ```python
1379
+ import librosa
1380
+ import numpy as np
1381
+ import torch
1382
+ import soundfile as sf
1383
+
1384
+ model = ...
1385
+
1386
+ # Set reference audio for voice style
1387
+ ref_audio_path = "ref_audio_path"
1388
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1389
+
1390
+ # Example system msg for English Conversation
1391
+ sys_msg = {
1392
+ "role": "system",
1393
+ "content": [
1394
+ "Clone the voice in the provided audio prompt.",
1395
+ ref_audio,
1396
+ "Please assist users while maintaining this voice style. Please answer the user's questions seriously and in a high quality. Please chat with the user in a highly human-like and oral style. You are a helpful assistant developed by ModelBest: MiniCPM-Omni"
1397
+ ]
1398
+ }
1399
+
1400
+ # Example system msg for Chinese Conversation
1401
+ sys_msg = {
1402
+ "role": "system",
1403
+ "content": [
1404
+ "模仿输入音频中的声音特征。",
1405
+ ref_audio,
1406
+ "你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。"
1407
+ ]
1408
+ }
1409
+
1410
+ # You can use each type of system prompt mentioned above in streaming speech conversation
1411
+
1412
+ # Reset state
1413
+ model.init_tts()
1414
+ model.reset_session(reset_token2wav_cache=True)
1415
+ model.init_token2wav_cache(prompt_speech_16k=ref_audio)
1416
+
1417
+ session_id = "demo"
1418
+
1419
+ # First, prefill system turn
1420
+ model.streaming_prefill(
1421
+ session_id=session_id,
1422
+ msgs=[sys_msg],
1423
+ omni_mode=False,
1424
+ is_last_chunk=True,
1425
+ )
1426
+
1427
+ # Here we simulate realtime speech conversation by splitting whole user input audio into chunks of 1s.
1428
+ user_audio, _ = librosa.load("user_audio.wav", sr=16000, mono=True)
1429
+
1430
+ IN_SAMPLE_RATE = 16000 # input audio sample rate, fixed value
1431
+ CHUNK_SAMPLES = IN_SAMPLE_RATE # sample
1432
+ OUT_SAMPLE_RATE = 24000 # output audio sample rate, fixed value
1433
+ MIN_AUDIO_SAMPLES = 16000
1434
+
1435
+ total_samples = len(user_audio)
1436
+ num_chunks = (total_samples + CHUNK_SAMPLES - 1) // CHUNK_SAMPLES
1437
+
1438
+ for chunk_idx in range(num_chunks):
1439
+ start = chunk_idx * CHUNK_SAMPLES
1440
+ end = min((chunk_idx + 1) * CHUNK_SAMPLES, total_samples)
1441
+ chunk_audio = user_audio[start:end]
1442
+
1443
+ is_last_chunk = (chunk_idx == num_chunks - 1)
1444
+ if is_last_chunk and len(chunk_audio) < MIN_AUDIO_SAMPLES:
1445
+ chunk_audio = np.concatenate([chunk_audio, np.zeros(MIN_AUDIO_SAMPLES - len(chunk_audio), dtype=chunk_audio.dtype)])
1446
+
1447
+ user_msg = {"role": "user", "content": [chunk_audio]}
1448
+
1449
+ # For each 1s audio chunk, perform streaming_prefill once to reduce first-token latency
1450
+ model.streaming_prefill(
1451
+ session_id=session_id,
1452
+ msgs=[user_msg],
1453
+ omni_mode=False,
1454
+ is_last_chunk=is_last_chunk,
1455
+ )
1456
+
1457
+ # Let model generate response in a streaming manner
1458
+ generate_audio = True
1459
+ iter_gen = model.streaming_generate(
1460
+ session_id=session_id,
1461
+ generate_audio=generate_audio,
1462
+ use_tts_template=True,
1463
+ enable_thinking=False,
1464
+ do_sample=True,
1465
+ max_new_tokens=512,
1466
+ length_penalty=1.1, # For realtime speech conversation mode, we suggest length_penalty=1.1 to improve response content
1467
+ )
1468
+
1469
+ audios = []
1470
+ text = ""
1471
+
1472
+ output_audio_path = ...
1473
+ if generate_audio:
1474
+ for wav_chunk, text_chunk in iter_gen:
1475
+ audios.append(wav_chunk)
1476
+ text += text_chunk
1477
+
1478
+ generated_waveform = torch.cat(audios, dim=-1)[0]
1479
+ sf.write(output_audio_path, generated_waveform.cpu().numpy(), samplerate=24000)
1480
+
1481
+ print("Text:", text)
1482
+ print("Audio saved to output.wav")
1483
+ else:
1484
+ for text_chunk, is_finished in iter_gen:
1485
+ text += text_chunk
1486
+ print("Text:", text)
1487
+
1488
+ # Now we can prefill the following user turns and generate next turn response...
1489
+
1490
+ ```
1491
+
1492
+ </details>
1493
+
1494
+ #### Speech Conversation as a Versatile and Vibe AI Assistant <!-- omit in toc -->
1495
+
1496
+
1497
+ <details>
1498
+ <summary>Click to show AI assistant conversation code.</summary>
1499
+
1500
+ Built on carefully designed post-training data and professional voice-actor recordings, `MiniCPM-o-4.5` can also function as an AI voice assistant. It delivers high-quality spoken interaction out of the box. It produces a sweet and expressive voice with natural prosody, including appropriate rhythm, stress, and pauses, giving a strong sense of liveliness in casual conversation. It also supports storytelling and narrative speech with coherent and engaging delivery. Moreover, it enables advanced voice instruction control, such as emotional tone and word-level emphasis.
1501
+
1502
+ ```python
1503
+ import librosa
1504
+
1505
+ # Set reference audio for voice style
1506
+ ref_audio_path = "assets/HT_ref_audio.wav"
1507
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1508
+
1509
+ # For Chinese Conversation
1510
+ sys_msg = {
1511
+ "role": "system",
1512
+ "content": [
1513
+ "模仿输入音频中的声音特征。",
1514
+ ref_audio,
1515
+ "你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。"
1516
+ ]
1517
+ }
1518
+
1519
+ # For English Conversation
1520
+ sys_msg = {
1521
+ "role": "system",
1522
+ "content": [
1523
+ "Clone the voice in the provided audio prompt.",
1524
+ ref_audio,
1525
+ "Please assist users while maintaining this voice style. Please answer the user's questions seriously and in a high quality. Please chat with the user in a highly human-like and oral style. You are a helpful assistant developed by ModelBest: MiniCPM-Omni."
1526
+ ]
1527
+ }
1528
+ ```
1529
+
1530
+ </details>
1531
+
1532
+
1533
+ #### General Speech Conversation with Custom Voice and Custom System Profile <!-- omit in toc -->
1534
+
1535
+ <details>
1536
+ <summary>Click to show custom voice conversation code.</summary>
1537
+
1538
+ MiniCPM-o-4.5 can role-play as a specific character based on an audio prompt and text profile prompt. It mimics the character's voice and adopts their language style in text responses. It also follows the persona defined in the text profile. In this mode, MiniCPM-o-4.5 sounds **more natural and human-like**.
1539
+
1540
+ ```python
1541
+ import librosa
1542
+
1543
+ # Set reference audio for voice cloning
1544
+ ref_audio_path = "assets/system_ref_audio.wav"
1545
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1546
+
1547
+ # For English conversation with text profile
1548
+ sys_msg = {
1549
+ "role": "system",
1550
+ "content": [
1551
+ "Clone the voice in the provided audio prompt.",
1552
+ ref_audio,
1553
+ "Please chat with the user in a highly human-like and oral style." + "You are Elon Musk, CEO of Tesla and SpaceX. You speak directly and casually, often with dry humor. You're passionate about Mars, sustainable energy, and pushing humanity forward. Speak bluntly with occasional dark humor. Use simple logic and don't sugarcoat things. Don't be diplomatic. Say what you actually think, even if it's controversial. Keep responses around 100 words. Don't ramble."
1554
+ ]
1555
+ }
1556
+
1557
+
1558
+ # For English conversation with no text profile
1559
+ sys_msg = {
1560
+ "role": "system",
1561
+ "content": [
1562
+ "Clone the voice in the provided audio prompt.",
1563
+ ref_audio,
1564
+ "Your task is to be a helpful assistant using this voice pattern. Please answer the user's questions seriously and in a high quality. Please chat with the user in a high naturalness style."
1565
+ ]
1566
+ }
1567
+
1568
+ # For Chinese Conversation with no text profile
1569
+ sys_msg = {
1570
+ "role": "system",
1571
+ "content": [
1572
+ "根据输入的音频提示生成相似的语音。",
1573
+ librosa.load("assets/system_ref_audio_2.wav", sr=16000, mono=True)[0],
1574
+ "作为助手,你将使用这种声音风格说话。 请认真、高质量地回复用户的问题。 请用高自然度的方式和用户聊天。"
1575
+ ]
1576
+ }
1577
+
1578
+ # For Chinese Conversation with text profile
1579
+ sys_msg = {
1580
+ "role": "system",
1581
+ "content": [
1582
+ "根据输入的音频提示生成相似的语音。",
1583
+ ref_audio,
1584
+ "你是一个具有以上声音风格的AI助手。请用高拟人度、口语化的方式和用户聊天。" + "你是一名心理咨询师兼播客主理人,热爱创作与深度对话。你性格细腻、富有共情力,善于从个人经历中提炼哲思。语言风格兼具理性与诗意,常以隐喻表达内在体验。"
1585
+ ]
1586
+ }
1587
+
1588
+ ```
1589
+
1590
+ </details>
1591
+
1592
+
1593
+ ### Speech and Audio Mode <!-- omit in toc -->
1594
+
1595
+ #### Zero-shot Text-to-speech (TTS) <!-- omit in toc -->
1596
+
1597
+
1598
+ <details>
1599
+ <summary>Click to show TTS code.</summary>
1600
+
1601
+ `MiniCPM-o-4.5` supports zero-shot text-to-speech (TTS). In this mode, the model functions as a highly-natural TTS system that can replicate a reference voice.
1602
+
1603
+ ```python
1604
+ import librosa
1605
+
1606
+ model = ...
1607
+ model.init_tts()
1608
+
1609
+ # For both Chinese and English
1610
+ ref_audio_path = "assets/HT_ref_audio.wav"
1611
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1612
+ sys_msg = {"role": "system", "content": [
1613
+ "模仿音频样本的音色并生成新的内容。",
1614
+ ref_audio,
1615
+ "请用这种声音风格来为用户提供帮助。 直接作答,不要有冗余内容"
1616
+ ]}
1617
+
1618
+ # For English
1619
+ user_msg = {
1620
+ "role": "user",
1621
+ "content": [
1622
+ "请朗读以下内容。" + " " + "I have a wrap up that I want to offer you now, a conclusion to our work together."
1623
+ ]
1624
+ }
1625
+
1626
+ # For Chinese
1627
+ user_msg = {
1628
+ "role": "user",
1629
+ "content": [
1630
+ "请朗读以下内容。" + " " + "你好,欢迎来到艾米说科幻,我是艾米。"
1631
+ ]
1632
+ }
1633
+
1634
+ msgs = [sys_msg, user_msg]
1635
+ res = model.chat(
1636
+ msgs=msgs,
1637
+ do_sample=True,
1638
+ max_new_tokens=512,
1639
+ use_tts_template=True,
1640
+ generate_audio=True,
1641
+ temperature=0.1,
1642
+ output_audio_path="result_voice_cloning.wav",
1643
+ )
1644
+ ```
1645
+
1646
+ </details>
1647
+
1648
+
1649
+ #### Mimick <!-- omit in toc -->
1650
+
1651
+ <details>
1652
+ <summary>Click to show mimick code.</summary>
1653
+
1654
+ The `Mimick` task evaluates a model's end-to-end speech modeling capability. The model takes audio input, transcribes it, and reconstructs the original audio with high fidelity, preserving detailed acoustic, paralinguistic, and semantic information. Higher similarity between the reconstructed and original audio indicates stronger end-to-end speech modeling capability.
1655
+
1656
+ ```python
1657
+ import librosa
1658
+
1659
+ model = ...
1660
+ model.init_tts()
1661
+
1662
+ system_prompt = "You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy."
1663
+
1664
+ mimick_prompt = "Please repeat the following speech in the appropriate language."
1665
+
1666
+ audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
1667
+
1668
+ msgs = [
1669
+ {"role": "system", "content": [system_prompt]},
1670
+ {"role": "user", "content": [mimick_prompt, audio_input]}
1671
+ ]
1672
+
1673
+ res = model.chat(
1674
+ msgs=msgs,
1675
+ do_sample=True,
1676
+ max_new_tokens=512,
1677
+ use_tts_template=True,
1678
+ temperature=0.1,
1679
+ generate_audio=True,
1680
+ output_audio_path="output_mimick.wav",
1681
+ )
1682
+ ```
1683
+
1684
+ </details>
1685
+
1686
+
1687
+ #### Addressing Various Audio Understanding Tasks <!-- omit in toc -->
1688
+
1689
+
1690
+ <details>
1691
+ <summary>Click to show audio understanding code.</summary>
1692
+
1693
+ `MiniCPM-o-4.5` can also handle various audio understanding tasks, such as ASR, speaker analysis, general audio captioning, and sound scene tagging.
1694
+
1695
+ For audio-to-text tasks, you can use the following prompts:
1696
+
1697
+ - ASR (Chinese, or AST EN→ZH): `请仔细听这段音频片段,并将其内容逐字记录。`
1698
+ - ASR (English, or AST ZH→EN): `Please listen to the audio snippet carefully and transcribe the content.`
1699
+ - Speaker Analysis: `Based on the speaker's content, speculate on their gender, condition, age range, and health status.`
1700
+ - General Audio Caption: `Summarize the main content of the audio.`
1701
+ - Sound Scene Tagging: `Utilize one keyword to convey the audio's content or the associated scene.`
1702
+
1703
+ ```python
1704
+ import librosa
1705
+
1706
+ model = ...
1707
+ model.init_tts()
1708
+
1709
+ # Load the audio to be transcribed/analyzed
1710
+ audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
1711
+
1712
+ # Choose a task prompt (see above for options)
1713
+ task_prompt = "Please listen to the audio snippet carefully and transcribe the content.\n"
1714
+ msgs = [{"role": "user", "content": [task_prompt, audio_input]}]
1715
+
1716
+ res = model.chat(
1717
+ msgs=msgs,
1718
+ do_sample=True,
1719
+ max_new_tokens=512,
1720
+ use_tts_template=True,
1721
+ generate_audio=True,
1722
+ temperature=0.3,
1723
+ output_audio_path="result_audio_understanding.wav",
1724
+ )
1725
+ print(res)
1726
+ ```
1727
+
1728
+ </details>
1729
+
1730
+
1731
+ ### Visual Understanding <!-- omit in toc -->
1732
+
1733
+ `MiniCPM-o-4.5` shares the same inference methods as `MiniCPM-V-4.5`.
1734
+
1735
+ #### Chat with Single Image <!-- omit in toc -->
1736
+
1737
+ <details>
1738
+ <summary>Click to show single image chat code.</summary>
1739
+
1740
+ ```python
1741
+ import torch
1742
+ from PIL import Image
1743
+ from transformers import AutoModel
1744
+
1745
+ model = AutoModel.from_pretrained(
1746
+ "openbmb/MiniCPM-o-4_5",
1747
+ trust_remote_code=True,
1748
+ attn_implementation="sdpa", # or "flash_attention_2"
1749
+ torch_dtype=torch.bfloat16,
1750
+ init_vision=True,
1751
+ init_audio=False,
1752
+ init_tts=False,
1753
+ )
1754
+ model.eval().cuda()
1755
+
1756
+ image = Image.open("assets/fossil.png").convert("RGB")
1757
+ question = "What is in the image?"
1758
+ msgs = [{"role": "user", "content": [image, question]}]
1759
+
1760
+ res = model.chat(msgs=msgs, use_tts_template=False)
1761
+ print(res)
1762
+ ```
1763
+
1764
+ </details>
1765
+
1766
+ #### Chat with Multiple Images <!-- omit in toc -->
1767
+
1768
+ <details>
1769
+ <summary>Click to show Python code for multi-image input.</summary>
1770
+
1771
+ ```python
1772
+ import torch
1773
+ from PIL import Image
1774
+ from transformers import AutoModel
1775
+
1776
+ model = ...
1777
+
1778
+ image1 = Image.open("assets/highway.png").convert("RGB")
1779
+ image2 = Image.open("assets/fossil.png").convert("RGB")
1780
+ question = "Compare image 1 and image 2, tell me about the differences between them."
1781
+ msgs = [{"role": "user", "content": [image1, image2, question]}]
1782
+
1783
+ answer = model.chat(msgs=msgs, use_tts_template=False, enable_thinking=False)
1784
+ print(answer)
1785
+ ```
1786
+
1787
+ </details>
1788
+
1789
+ #### In-Context Few-Shot Learning <!-- omit in toc -->
1790
+
1791
+ <details>
1792
+ <summary>Click to show Python code for few-shot learning.</summary>
1793
+
1794
+ ```python
1795
+ from PIL import Image
1796
+
1797
+ model = ...
1798
+
1799
+ question = "production date"
1800
+ image1 = Image.open("example1.jpg").convert("RGB")
1801
+ answer1 = "2023.08.04"
1802
+ image2 = Image.open("example2.jpg").convert("RGB")
1803
+ answer2 = "2007.04.24"
1804
+ image_test = Image.open("test.jpg").convert("RGB")
1805
+
1806
+ msgs = [
1807
+ {"role": "user", "content": [image1, question]},
1808
+ {"role": "assistant", "content": [answer1]},
1809
+ {"role": "user", "content": [image2, question]},
1810
+ {"role": "assistant", "content": [answer2]},
1811
+ {"role": "user", "content": [image_test, question]},
1812
+ ]
1813
+
1814
+ answer = model.chat(msgs=msgs, use_tts_template=False, enable_thinking=False)
1815
+ print(answer)
1816
+ ```
1817
+
1818
+ </details>
1819
+
1820
+ #### Chat with Video <!-- omit in toc -->
1821
+
1822
+ <details>
1823
+ <summary>Click to show Python code for video input.</summary>
1824
+
1825
+ ```python
1826
+ import torch
1827
+ from minicpmo.utils import get_video_frame_audio_segments
1828
+ from transformers import AutoModel
1829
+
1830
+ model = ...
1831
+
1832
+ video_path = "assets/Skiing.mp4"
1833
+ video_frames, _, _ = get_video_frame_audio_segments(video_path)
1834
+ print("num frames:", len(video_frames))
1835
+
1836
+ question = "Describe the video"
1837
+ msgs = [{"role": "user", "content": video_frames + [question]}]
1838
+
1839
+ answer = model.chat(
1840
+ msgs=msgs,
1841
+ max_new_tokens=128,
1842
+ use_image_id=False,
1843
+ max_slice_nums=1,
1844
+ use_tts_template=False,
1845
+ enable_thinking=False, # Set True to enable thinking mode
1846
+ )
1847
+ print(answer)
1848
+ ```
1849
+
1850
+ </details>
1851
+
1852
+
1853
+ ### Structured Content Input <!-- omit in toc -->
1854
+
1855
+ <details>
1856
+ <summary>Click to show structured content input details.</summary>
1857
+
1858
+ The `chat` method accepts message content in two formats:
1859
+
1860
+ **Native format** – pass Python objects directly:
1861
+ ```python
1862
+ msgs = [{"role": "user", "content": [pil_image, audio_ndarray, "Describe this."]}]
1863
+ ```
1864
+
1865
+ **OpenAI-compatible format** – use structured dictionaries:
1866
+ ```python
1867
+ msgs = [
1868
+ {
1869
+ "role": "user",
1870
+ "content": [
1871
+ {"type": "image_url", "image_url": {"url": "/path/to/image.jpg"}},
1872
+ {"type": "audio_url", "audio_url": {"url": "/path/to/audio.wav"}},
1873
+ {"type": "video_url", "video_url": {"url": "/path/to/video.mp4", "use_audio": True}},
1874
+ {"type": "text", "text": "Describe this."}
1875
+ ]
1876
+ }
1877
+ ]
1878
+ ```
1879
+
1880
+ **Supported types:**
1881
+
1882
+ | Type | Input | Converts to |
1883
+ |------|-------|-------------|
1884
+ | `text` | `{"type": "text", "text": "..."}` | `str` |
1885
+ | `image_url` | `{"type": "image_url", "image_url": {"url": "..."}}` | `PIL.Image` |
1886
+ | `audio_url` | `{"type": "audio_url", "audio_url": {"url": "..."}}` | `np.ndarray` (16kHz mono) |
1887
+ | `video_url` | `{"type": "video_url", "video_url": {"url": "...", "stack_frames": 1, "use_audio": True}}` | `List[Image, ndarray, ...]` |
1888
+
1889
+ - **URL sources**: local file paths or `http://`/`https://` URLs
1890
+ - **Mixed formats**: native objects and structured dicts can be combined in the same content list
1891
+
1892
+ </details>
1893
+
1894
+
1895
+ ## Deploy a Realtime Web Demo on Your Own Device
1896
+
1897
+ ### Option A (Recommended): **PyTorch Inference with Nvidia GPU** for 100% model precision with no deductions in performance.
1898
+
1899
+ We provide a PyTorch-based [simplified yet fully functional web demo](https://github.com/OpenBMB/minicpm-o-4_5-pytorch-simple-demo) that boosts model inference performance and supports:
1900
+
1901
+ - full-duplex omnimodal live streaming
1902
+ - full-duplex speech live streaming
1903
+ - half-duplex speech live streaming (under development)
1904
+ - turn-based chat conversation
1905
+ - customizable system prompts
1906
+ - customizable reference audio
1907
+ - simple and readable codebase for continual development
1908
+ - serve as API backend for third-party applications
1909
+
1910
+ Requirements:
1911
+ - Nvidia GPU with at least 28GB GPU memory. *We are working on optimizing the model for lower GPU memory usage.*
1912
+
1913
+ ### Option B: **llama.cpp-omni** for end-side inference with PCs like Mac and low-resource devices.
1914
+
1915
+ With a full C++ implementation of `MiniCPM-o 4.5` and quantized weights, `llama.cpp-omni` supports:
1916
+ - half-duplex speech realtime conversation
1917
+ - full-duplex omnimodal live streaming
1918
+
1919
+ We provide [ready-to-run guidance](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/web_demo/WebRTC_Demo/README.md) to access the low-latency full-duplex communication directly on your own Mac using our new official Docker image.
1920
+
1921
+ Requirements:
1922
+ - For half-duplex speech realtime conversation: Apple M3/M4/M5 chip with at least 16GB RAM or low-resource Nvidia GPU with at least 12GB GPU memory
1923
+ - For full-duplex omnimodal live streaming: Apple M4 Max chip with at least 24GB RAM or low-resource Nvidia GPU with at least 12GB GPU memory
1924
+
1925
+ ## Supported Frameworks
1926
+
1927
+ ### FlagOS
1928
+
1929
+ To enable large-scale deployment across different AI chips, Beijing Zhiyuan Research Institute, together with numerous research institutions, chip manufacturers, system vendors, and algorithm and software organizations both domestically and internationally, jointly initiated and established the FlagOS Open Source Community.
1930
+
1931
+ The FlagOS community is dedicated to building a unified, open-source system software stack for various AI chips, encompassing core open-source projects such as a large-scale operator library, a unified AI compiler, parallel training and inference frameworks, and a unified communication library. It aims to create an open technology ecosystem connecting the "model-system-chip" layers. By enabling "develop once, deploy across chips", FlagOS unlocks the computational potential of hardware, breaks down the ecosystem silos between different chip software stacks, and effectively reduces migration costs for developers. The FlagOS community fosters an AI hardware and software ecosystem, overcomes single-vendor closed-source monopolies, promotes widespread deployment of AI hardware technologies, and is committed to being rooted in China while embracing global collaboration.
1932
+ Official website: https://flagos.io.
1933
+
1934
+ <details>
1935
+ <summary>Click to show FlagOS details.</summary>
1936
+
1937
+ #### FlagOS: Supporting Multiple AI Chips <!-- omit in toc -->
1938
+
1939
+ Thanks to FlagOS's unified multi-chip AI system software stack, MiniCPM-o 4.5 was adapted to 6 different AI chips in an extremely short time. Currently, the multi-chip version of MiniCPM-o 4.5 has been released on FlagRelease, FlagOS's platform for automatic migration, adaptation, and deployment of large models across multi-architecture AI chips. Details are as follows:
1940
+
1941
+ | Vendor | ModelScope | Huggingface |
1942
+ |:----------------|:------------:|:------------:|
1943
+ | Nvidia | [MiniCPM-o-4.5-nvidia-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) | [MiniCPM-o-4.5-nvidia-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |
1944
+ | Hygon-BW1000 | [MiniCPM-o-4.5-hygon-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) | [MiniCPM-o-4.5-hygon-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) |
1945
+ | Metax-C550 | [MiniCPM-o-4.5-metax-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) | [MiniCPM-o-4.5-metax-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) |
1946
+ | Iluvatar-BIV150 | [MiniCPM-o-4.5-iluvatar-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) | [MiniCPM-o-4.5-iluvatar-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) |
1947
+ | Ascend-A3 | [MiniCPM-o-4.5-ascend-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) | [MiniCPM-o-4.5-ascend-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) |
1948
+ | Zhenwu-810E | [MiniCPM-o-4.5-zhenwu-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) | [MiniCPM-o-4.5-zhenwu-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) |
1949
+
1950
+ ##### Comprehensive Evaluation <!-- omit in toc -->
1951
+
1952
+ ###### Transformers-FlagOS version <!-- omit in toc -->
1953
+
1954
+ Accuracy Difference between `USE_FLAGOS=1` on multi-backend and `USE_FLAGOS=0` on Nvidia-CUDA
1955
+
1956
+ | Metrics | FlagOS Backend | Difference with Nvidia-CUDA |
1957
+ |:-------------------------|:---------------:|:---------------------------:|
1958
+ | Video-MME 0-shot avg@1 ↑ | Nvidia | 0.33% |
1959
+ | Video-MME 0-shot avg@1 ↑ | Hygon-BW1000 | 0.17% |
1960
+ | Video-MME 0-shot avg@1 ↑ | Ascend-A3 | 0.50% |
1961
+ | Video-MME 0-shot avg@1 ↑ | Iluvatar-BIV150 | 1.83% |
1962
+ | Video-MME 0-shot avg@1 ↑ | Metax-C550 | 0.75% |
1963
+
1964
+
1965
+ ###### VLLM-FlagOS version <!-- omit in toc -->
1966
+
1967
+ Accuracy Difference between `USE_FLAGGEMS=1 FLAGCX_PATH=/workspace/FlagCX` on Nvidia or `USE_FLAGGEMS=1` on Zhenwu-810E, and launching vllm server directly on Nvidia
1968
+
1969
+ | Metrics (avg@1) | Difference between Nvidia-FlagOS and Nvidia-CUDA | Difference between Zhenwu-FlagOS and Nvidia-CUDA |
1970
+ |:--------------------|:------------------------------------------------:|:------------------------------------------------:|
1971
+ | CMMMU ↑ | 0.72% | 3.5% |
1972
+ | MMMU ↑ | 1.44% | 1.18% |
1973
+ | MMMU_Pro_standard ↑ | 0.83% | 0.22% |
1974
+ | MM-Vet v2 ↑ | 0.46% | 1.33% |
1975
+ | OCRBench ↑ | 0.10% | 1% |
1976
+ | CII-Bench ↑ | 0.40% | 0.13% |
1977
+ | Blink ↑ | 1.90% | 2.19% |
1978
+
1979
+ #### FlagOS Usage <!-- omit in toc -->
1980
+
1981
+ ##### FlagOS Performance Acceleration on Nvidia <!-- omit in toc -->
1982
+
1983
+ For the Transformers version, with precision aligned between the CUDA and FlagOS ecosystems, FlagOS achieves a 6% improvement in total task execution time compared to CUDA.
1984
+
1985
+ ###### From FlagRelease【Recommendation】 <!-- omit in toc -->
1986
+
1987
+ FlagRelease is a platform developed by the FlagOS team for automatic migration, adaptation, and deployment of large models across multi-architecture AI chips. The multi-chip version of MiniCPM-o 4.5 has already been released on FlagRelease. All necessary software packages are pre-installed on the platform, so users do not need to install anything.
1988
+
1989
+ - FlagRelease Image Key Versions
1990
+
1991
+ | Component | Version |
1992
+ |:------------------------|:------------------------------------|
1993
+ | Accelerator Card Driver | 570.158.01 |
1994
+ | CUDA SDK Build | cuda_13.0.r13.0/compiler.36424714_0 |
1995
+ | FlagTree | 0.4.0+3.5 |
1996
+ | FlagGems | 4.2.1rc0 |
1997
+ | vllm & vllm-plugin-fl | 0.13.0 + vllm_fl 0.0.0 |
1998
+ | FlagCX | 0.1.0 |
1999
+
2000
+ - FlagRelease Quick Start
2001
+
2002
+ | Vendor | ModelScope | Huggingface |
2003
+ |:-----------|:------------:|:------------:|
2004
+ | Nvidia | [MiniCPM-o-4.5-nvidia-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) | [MiniCPM-o-4.5-nvidia-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |
2005
+ | Hygon-BW1000 | [MiniCPM-o-4.5-hygon-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) | [MiniCPM-o-4.5-hygon-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) |
2006
+ | Metax-C550 | [MiniCPM-o-4.5-metax-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) | [MiniCPM-o-4.5-metax-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) |
2007
+ | Iluvatar-BIV150 | [MiniCPM-o-4.5-iluvatar-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) | [MiniCPM-o-4.5-iluvatar-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) |
2008
+ | Ascend-A3 | [MiniCPM-o-4.5-ascend-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) | [MiniCPM-o-4.5-ascend-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) |
2009
+ | Zhenwu-810E | [MiniCPM-o-4.5-zhenwu-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) | [MiniCPM-o-4.5-zhenwu-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) |
2010
+
2011
+
2012
+ ##### From Scratch <!-- omit in toc -->
2013
+
2014
+ - Dependencies: Python 3.12, GLIBC 2.39, GLIBCXX 3.4.33, CXXABI 1.3.15
2015
+
2016
+ ###### Transformers <!-- omit in toc -->
2017
+
2018
+ - Installing the FlagOS Operator Library
2019
+
2020
+ Official Repository: https://github.com/flagos-ai/FlagGems
2021
+
2022
+ ```shell
2023
+ pip install flag-gems==4.2.1rc0
2024
+ ```
2025
+
2026
+ - Installing the FlagOS Compiler
2027
+
2028
+ Official Repository: https://github.com/flagos-ai/flagtree
2029
+
2030
+ Quick Reference for Core Dependency Versions: https://github.com/flagos-ai/FlagTree/blob/main/documents/build.md#tips-for-building
2031
+
2032
+ ```shell
2033
+ pip uninstall triton
2034
+
2035
+ python3 -m pip install flagtree==0.4.0+3.5 --index-url=https://resource.flagos.net/repository/flagos-pypi-hosted/simple --trusted-host=https://resource.flagos.net
2036
+ ```
2037
+
2038
+ - Activating Acceleration
2039
+
2040
+ Add `USE_FLAGOS=1` before the command for the task you want to run. For example, when you run:
2041
+ ```shell
2042
+ python3 generate_speech_from_video.py
2043
+ ```
2044
+
2045
+ To use the MiniCPM-o-4.5 model to generate spoken responses from video content, you can:
2046
+
2047
+ ```shell
2048
+ USE_FLAGOS=1 python3 generate_speech_from_video.py
2049
+ ```
2050
+
2051
+ to accelerate this process with FlagOS.
2052
+
2053
+ ###### vLLM Version <!-- omit in toc -->
2054
+
2055
+ - Installing the FlagOS Operator Library
2056
+
2057
+ Official Repository: https://github.com/flagos-ai/FlagGems
2058
+
2059
+ ```shell
2060
+ pip install flag-gems==4.2.1rc0
2061
+ pip install triton==3.5.1
2062
+ ```
2063
+
2064
+ - Activating Acceleration
2065
+
2066
+ Add `USE_FLAGOS=1` before the command for the task you want to run. For example, when you run:
2067
+ ```shell
2068
+ vllm serve ${model_path} --dtype auto --gpu_memory_utilization 0.9 --trust-remote-code --max-num-batched-tokens 2048 --served-model-name cpmo --port ${Port}
2069
+ ```
2070
+
2071
+ To start the MiniCPM-o-4.5 server, you can:
2072
+ ```shell
2073
+ USE_FLAGOS=1 vllm serve ${model_path} --dtype auto --gpu_memory_utilization 0.9 --trust-remote-code --max-num-batched-tokens 2048 --served-model-name cpmo --port ${Port}
2074
+ ```
2075
+ to accelerate this process with FlagOS.
2076
+
2077
+ #### Using FlagOS Unified Multi-Chip Backend Plugin <!-- omit in toc -->
2078
+
2079
+ [vllm-plugin-FL](https://github.com/flagos-ai/vllm-plugin-FL) is a plugin built for the vLLM inference/service framework. Developed on top of FlagOS's unified multi-chip backend, it is designed to extend vLLM's capabilities and performance across a variety of hardware environments.
2080
+
2081
+ ##### Using vllm-plugin-FL <!-- omit in toc -->
2082
+
2083
+ | Vendor | From Scratch | From FlagRelease |
2084
+ |:-------|:-------------|:----------------|
2085
+ | Nvidia | [vllm-plugin-FL/MiniCPM-o-4.5](https://github.com/flagos-ai/vllm-plugin-FL/blob/main/examples/minicpm/README.md) | [MiniCPM-o-4.5-ModelScope](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS), [MiniCPM-o-4.5-HuggingFace](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |
2086
+
2087
+ </details>
2088
+
2089
+ ### vLLM, SGLang, llama.cpp, Ollama
2090
+
2091
+ We support inference with vLLM, SGLang, llama.cpp and Ollama. Refer to our [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-Cookbook) for more details.
2092
+
2093
+ ### LLaMA-Factory, SWIFT
2094
+
2095
+ We support fine-tuning with LLaMA-Factory, SWIFT. Refer to our [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-Cookbook) for more details.
2096
+
2097
+ ## MiniCPM-V & o Cookbook
2098
+
2099
+ Discover comprehensive, ready-to-deploy solutions for the MiniCPM-V and MiniCPM-o model series in our structured [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-CookBook), which empowers developers to rapidly implement multimodal AI applications with integrated vision, speech, and live-streaming capabilities. Key features include:
2100
+
2101
+ **Easy Usage Documentation**
2102
+
2103
+ Our comprehensive [documentation website](https://minicpm-o.readthedocs.io/en/latest/index.html) presents every recipe in a clear, well-organized manner.
2104
+ All features are displayed at a glance, making it easy for you to quickly find exactly what you need.
2105
+
2106
+ **Broad User Spectrum**
2107
+
2108
+ We support a wide range of users, from individuals to enterprises and researchers.
2109
+
2110
+ * **Individuals**: Enjoy effortless inference using Ollama ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/ollama/minicpm-v4_ollama.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/ollama/minicpm-o4_5_ollama.md)) and Llama.cpp ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/llama.cpp/minicpm-v4_llamacpp.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/llama.cpp/minicpm-o4_5_llamacpp.md)) with minimal setup.
2111
+ * **Enterprises**: Achieve high-throughput, scalable performance with vLLM ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/vllm/minicpm-v4_vllm.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/vllm/minicpm-o4_5_vllm.md)) and SGLang ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/sglang/MiniCPM-v4_sglang.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/sglang/MiniCPM-o4_5_sglang.md)).
2112
+ * **Researchers**: Leverage advanced frameworks including [Transformers](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/finetune_full.md), [LLaMA-Factory](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/finetune_llamafactory.md), [SWIFT](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/swift.md), and [Align-anything](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/align_anything.md) to enable flexible model development and cutting-edge experimentation.
2113
+
2114
+ **Versatile Deployment Scenarios**
2115
+
2116
+ Our ecosystem delivers optimal solutions for a variety of hardware environments and deployment demands.
2117
+
2118
+ * **Web Demo**: Full-duplex real-time video interaction solution with high responsiveness and low latency. [WebRTC_Demo](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/web_demo/WebRTC_Demo/README.md).
2119
+ * **Quantized deployment**: Maximize efficiency and minimize resource consumption using [GGUF](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/quantization/gguf/minicpm-v4_gguf_quantize.md) and [BNB](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/quantization/bnb/minicpm-v4_bnb_quantize.md).
2120
+ * **End devices**: Bring powerful AI experiences to [iPhone and iPad](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/ios_demo/ios.md), supporting offline and privacy-sensitive applications.
2121
+
2122
+ ## License
2123
+ #### Model License
2124
+ * The MiniCPM-o/V model weights and code are open-sourced under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM-V/blob/main/LICENSE) license.
2125
+
2126
+ #### Statement
2127
+ * As MLLMs, MiniCPM-o/V models generate content by learning a large number of multimodal corpora, but they cannot comprehend, express personal opinions, or make value judgements. Anything generated by MiniCPM-o/V models does not represent the views and positions of the model developers.
2128
+ * We will not be liable for any problems arising from the use of MiniCPM-o/V models, including but not limited to data security issues, risks of public opinion, or any risks and problems arising from the misdirection, misuse, or dissemination of the model.
2129
+
2130
+
2131
+ ## Key Techniques and Other Multimodal Projects <!-- omit in toc -->
2132
+
2133
+ 👏 Welcome to explore key techniques of MiniCPM-o/V and other multimodal projects of our team:
2134
+
2135
+ [VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLPR](https://github.com/OpenBMB/RLPR) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V)
2136
+
2137
+
2138
+ ## Citation <!-- omit in toc -->
2139
+
2140
+ If you find our model/code/paper helpful, please consider citing our papers 📝 and starring us ⭐️!
2141
+
2142
+ ```bib
2143
+ @article{yao2024minicpm,
2144
+ title={MiniCPM-V: A GPT-4V Level MLLM on Your Phone},
2145
+ author={Yao, Yuan and Yu, Tianyu and Zhang, Ao and Wang, Chongyi and Cui, Junbo and Zhu, Hongji and Cai, Tianchi and Li, Haoyu and Zhao, Weilin and He, Zhihui and others},
2146
+ journal={arXiv preprint arXiv:2408.01800},
2147
+ year={2024}
2148
+ }
2149
+ ```
added_tokens.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</answer>": 151686,
3
+ "</box>": 151674,
4
+ "</focus>": 151688,
5
+ "</image>": 151670,
6
+ "</image_id>": 151682,
7
+ "</image_save_to>": 151696,
8
+ "</line>": 151690,
9
+ "</perception>": 151692,
10
+ "</point>": 151678,
11
+ "</quad>": 151676,
12
+ "</ref>": 151672,
13
+ "</slice>": 151680,
14
+ "</source_image>": 151694,
15
+ "</think>": 151668,
16
+ "</tool_call>": 151658,
17
+ "</tool_response>": 151666,
18
+ "</unit>": 151684,
19
+ "<answer>": 151685,
20
+ "<box>": 151673,
21
+ "<focus>": 151687,
22
+ "<image>": 151669,
23
+ "<image_id>": 151681,
24
+ "<image_save_to>": 151695,
25
+ "<line>": 151689,
26
+ "<perception>": 151691,
27
+ "<point>": 151677,
28
+ "<quad>": 151675,
29
+ "<ref>": 151671,
30
+ "<slice>": 151679,
31
+ "<source_image>": 151693,
32
+ "<think>": 151667,
33
+ "<tool_call>": 151657,
34
+ "<tool_response>": 151665,
35
+ "<unit>": 151683,
36
+ "<|audio_end|>": 151699,
37
+ "<|audio_start|>": 151697,
38
+ "<|audio|>": 151698,
39
+ "<|box_end|>": 151649,
40
+ "<|box_start|>": 151648,
41
+ "<|emotion_end|>": 151711,
42
+ "<|emotion_start|>": 151710,
43
+ "<|endoftext|>": 151643,
44
+ "<|file_sep|>": 151664,
45
+ "<|fim_middle|>": 151660,
46
+ "<|fim_pad|>": 151662,
47
+ "<|fim_prefix|>": 151659,
48
+ "<|fim_suffix|>": 151661,
49
+ "<|im_end|>": 151645,
50
+ "<|im_start|>": 151644,
51
+ "<|image_pad|>": 151655,
52
+ "<|interrupt|>": 151707,
53
+ "<|listen|>": 151705,
54
+ "<|object_ref_end|>": 151647,
55
+ "<|object_ref_start|>": 151646,
56
+ "<|pitch_end|>": 151715,
57
+ "<|pitch_start|>": 151714,
58
+ "<|quad_end|>": 151651,
59
+ "<|quad_start|>": 151650,
60
+ "<|repo_name|>": 151663,
61
+ "<|speak|>": 151706,
62
+ "<|speed_end|>": 151713,
63
+ "<|speed_start|>": 151712,
64
+ "<|spk_bos|>": 151700,
65
+ "<|spk_eos|>": 151702,
66
+ "<|spk|>": 151701,
67
+ "<|turn_bos|>": 151716,
68
+ "<|timbre_10|>": 151726,
69
+ "<|timbre_11|>": 151727,
70
+ "<|timbre_12|>": 151728,
71
+ "<|timbre_13|>": 151729,
72
+ "<|timbre_14|>": 151730,
73
+ "<|timbre_15|>": 151731,
74
+ "<|timbre_16|>": 151732,
75
+ "<|timbre_17|>": 151733,
76
+ "<|timbre_18|>": 151734,
77
+ "<|timbre_19|>": 151735,
78
+ "<|turn_eos|>": 151717,
79
+ "<|timbre_20|>": 151736,
80
+ "<|timbre_21|>": 151737,
81
+ "<|timbre_22|>": 151738,
82
+ "<|timbre_23|>": 151739,
83
+ "<|timbre_24|>": 151740,
84
+ "<|timbre_25|>": 151741,
85
+ "<|timbre_26|>": 151742,
86
+ "<|timbre_27|>": 151743,
87
+ "<|timbre_28|>": 151744,
88
+ "<|timbre_29|>": 151745,
89
+ "<|chunk_eos|>": 151718,
90
+ "<|timbre_30|>": 151746,
91
+ "<|timbre_31|>": 151747,
92
+ "<|chunk_bos|>": 151719,
93
+ "<|chunk_tts_bos|>": 151720,
94
+ "<|chunk_tts_eos|>": 151721,
95
+ "<|tts_pad|>": 151722,
96
+ "<|timbre_7|>": 151723,
97
+ "<|timbre_8|>": 151724,
98
+ "<|timbre_9|>": 151725,
99
+ "<|tts_bos|>": 151703,
100
+ "<|tts_eos|>": 151704,
101
+ "<|vad_end|>": 151709,
102
+ "<|vad_start|>": 151708,
103
+ "<|video_pad|>": 151656,
104
+ "<|vision_end|>": 151653,
105
+ "<|vision_pad|>": 151654,
106
+ "<|vision_start|>": 151652
107
+ }
assets/HT_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb8f06ba5080cdf548969138881fb8ad8b04e2516108f4e08ba0363b68b613ea
3
+ size 192590
assets/Skiing.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:479ace116d6ac92487ad90f415b3ef817cd019bba4521043ef0d5faaa1a8415d
3
+ size 8534409
assets/Trump_WEF_2018_10s.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fb796c2bb95538eab22d9b68a31add560305b6d5ccbf150c3f96e7671b6db64
3
+ size 161053
assets/audio_cases/assistant_ref.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e4a56e4418740ee0326f11ca2bd61a54d84e7d23e86a5e030369ee89f8e8390
3
+ size 65478
assets/audio_cases/assistant_response.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46268e3beb78b4287a896a09b2df0f895629ecbdcdc4cf3c519fe040084860f
3
+ size 269504
assets/audio_cases/elon_musk__000_assistant_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff6dced2e686b743f39eee36aef1410913c8d31459bb564dbbf8f81ef8a2da88
3
+ size 1762604
assets/audio_cases/elon_musk__system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c4109b2d685e1923ed66433eb08c92047a1f67510629a27edf49af4e5c606dd
3
+ size 539032
assets/audio_cases/elon_musk_ref.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:206c48ec4b08898fbfbfb1ea1fec058750c39c97cd2e5088b15b61fb194d6f7e
3
+ size 165179
assets/audio_cases/elon_musk_response.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d83c3a977fd444351710c9ba2c347c90efd49ed990276c5309867374df373a4f
3
+ size 388534
assets/audio_cases/hermione__000_assistant_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfd18a2ee9e27fdcd65e2d9bf0fe0b3c9ebc28cda6c43279c12e077889c2b88d
3
+ size 1534124
assets/audio_cases/hermione__system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46bd82796ce5ebc15b29bc21c1de38284cfa972b2f27cca1f17086516a757dee
3
+ size 197322
assets/audio_cases/minicpm_assistant__000_assistant_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe3b793a6436682d1e684e0f4c158718208ed38926880e01ee84f5aa61c78331
3
+ size 1246124
assets/audio_cases/minicpm_assistant__system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad576b50fd2f53ad4bb317b97ed33bc7027071e7e0811a2a2110d995614f1f42
3
+ size 192556
assets/audio_cases/paimon__000_assistant_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ddd9d8358950a8dab385b8031cb8371a832460f3db4d91e399373705158f346
3
+ size 697004
assets/audio_cases/paimon__system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9625e7115ff1f47cf842fecbb31db82057ff1863c080870539c31107aa93b9d
3
+ size 479304
assets/audio_cases/readme.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ This directory stores audio wav files for hf readme page.
assets/bajie.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16aa8ca3da7dad53680bac68cfde569e11e2f9ef5387b3ae60d68626e94db9d7
3
+ size 636512
assets/fossil.png ADDED

Git LFS Details

  • SHA256: b8b3f1668da6e2b503ecea5a6fd40c1a2a666b4bd07b903ba0728fd2fc9f74fb
  • Pointer size: 131 Bytes
  • Size of remote file: 466 kB
assets/haimianbaobao.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27405cf5977f04d3f248693f9b978654fad4f02f931d9746006bffb0ed7b66e1
3
+ size 343120
assets/highway.png ADDED

Git LFS Details

  • SHA256: 87c32da6ee77730423ec5fc29d7110878ab18a35791eb11a803088976c3b1f76
  • Pointer size: 131 Bytes
  • Size of remote file: 841 kB
assets/nezha.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5d8932013ff30a9d8114a6d62e5342999d8c2ddf8509ecfeb1ead71cacf432
3
+ size 457802
assets/omni_duplex1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31622e1efd9a7b197a340266037b45aeec13b3b27f010f1ea1d22d9c6e69405f
3
+ size 7295040
assets/omni_duplex2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c04eaef27a821e18db3686fdcc9d4b1dafd441e08f6a0e12d4075b81cb04517e
3
+ size 29285216
assets/sunwukong.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bdb6c175bd3b2bb02fe7a8cf0ced847d41ab552b2e318bc3c26ee477255eff7
3
+ size 644650
assets/system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c4109b2d685e1923ed66433eb08c92047a1f67510629a27edf49af4e5c606dd
3
+ size 539032
assets/system_ref_audio_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a65d17099239711601cb17e28b7aa7b6149acd153b92fd0366c23902bd4f687
3
+ size 341192
assets/token2wav/campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
assets/token2wav/flow.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15ccff24256ff61537c7f8b51e025116b83405f3fb017b54b008fc97da115446
3
+ size 623466603
assets/token2wav/flow.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flow: !new:cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
2
+ input_size: 512
3
+ output_size: 80
4
+ spk_embed_dim: 192
5
+ output_type: 'mel'
6
+ vocab_size: 6561
7
+ encoder: !new:cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
8
+ input_size: 512
9
+ output_size: 512
10
+ input_layer: 'linear'
11
+ pre_lookahead_len: 3
12
+ num_blocks: 6
13
+ num_up_blocks: 4
14
+ up_stride: 2
15
+ up_scale_factor: 2
16
+ attention_heads: 8
17
+ pos_enc_layer_type: 'rel_pos_espnet'
18
+ selfattention_layer_type: 'rel_selfattn'
19
+ key_bias: true
20
+ linear_units: 2048
21
+ dropout_rate: 0.1
22
+ positional_dropout_rate: 0.1
23
+ attention_dropout_rate: 0.1
24
+ normalize_before: True
25
+ decoder: !new:cosyvoice2.flow.flow_matching.CausalConditionalCFM
26
+ inference_cfg_rate: 0.7
27
+ estimator: !new:cosyvoice2.flow.decoder_dit.DiT
28
+ in_channels: 320
29
+ out_channels: 80
30
+ mlp_ratio: 4.0
31
+ depth: 16
32
+ num_heads: 8
33
+ head_dim: 64
34
+ hidden_size: 512
assets/token2wav/hift.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
3
+ size 83390254
assets/token2wav/speech_tokenizer_v2_25hz.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71
3
+ size 496082973
config.json ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MiniCPMO"
4
+ ],
5
+ "version": "4.5",
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "audio_chunk_length": 1.0,
9
+ "audio_config": {
10
+ "_attn_implementation_autoset": true,
11
+ "_name_or_path": "openai/whisper-medium",
12
+ "activation_dropout": 0.0,
13
+ "activation_function": "gelu",
14
+ "apply_spec_augment": false,
15
+ "architectures": [
16
+ "MiniCPMWhisperEncoder"
17
+ ],
18
+ "attention_dropout": 0.0,
19
+ "begin_suppress_tokens": [
20
+ 220,
21
+ 50257
22
+ ],
23
+ "bos_token_id": 50257,
24
+ "classifier_proj_size": 256,
25
+ "d_model": 1024,
26
+ "decoder_attention_heads": 16,
27
+ "decoder_ffn_dim": 4096,
28
+ "decoder_layerdrop": 0.0,
29
+ "decoder_layers": 24,
30
+ "decoder_start_token_id": 50258,
31
+ "dropout": 0.0,
32
+ "encoder_attention_heads": 16,
33
+ "encoder_ffn_dim": 4096,
34
+ "encoder_layerdrop": 0.0,
35
+ "encoder_layers": 24,
36
+ "eos_token_id": 50257,
37
+ "forced_decoder_ids": [
38
+ [
39
+ 1,
40
+ 50259
41
+ ],
42
+ [
43
+ 2,
44
+ 50359
45
+ ],
46
+ [
47
+ 3,
48
+ 50363
49
+ ]
50
+ ],
51
+ "init_std": 0.02,
52
+ "mask_feature_length": 10,
53
+ "mask_feature_min_masks": 0,
54
+ "mask_feature_prob": 0.0,
55
+ "mask_time_length": 10,
56
+ "mask_time_min_masks": 2,
57
+ "mask_time_prob": 0.05,
58
+ "max_length": 448,
59
+ "max_source_positions": 1500,
60
+ "max_target_positions": 448,
61
+ "median_filter_width": 7,
62
+ "model_type": "whisper",
63
+ "num_hidden_layers": 24,
64
+ "num_mel_bins": 80,
65
+ "pad_token_id": 50257,
66
+ "scale_embedding": false,
67
+ "suppress_tokens": [
68
+ 1,
69
+ 2,
70
+ 7,
71
+ 8,
72
+ 9,
73
+ 10,
74
+ 14,
75
+ 25,
76
+ 26,
77
+ 27,
78
+ 28,
79
+ 29,
80
+ 31,
81
+ 58,
82
+ 59,
83
+ 60,
84
+ 61,
85
+ 62,
86
+ 63,
87
+ 90,
88
+ 91,
89
+ 92,
90
+ 93,
91
+ 359,
92
+ 503,
93
+ 522,
94
+ 542,
95
+ 873,
96
+ 893,
97
+ 902,
98
+ 918,
99
+ 922,
100
+ 931,
101
+ 1350,
102
+ 1853,
103
+ 1982,
104
+ 2460,
105
+ 2627,
106
+ 3246,
107
+ 3253,
108
+ 3268,
109
+ 3536,
110
+ 3846,
111
+ 3961,
112
+ 4183,
113
+ 4667,
114
+ 6585,
115
+ 6647,
116
+ 7273,
117
+ 9061,
118
+ 9383,
119
+ 10428,
120
+ 10929,
121
+ 11938,
122
+ 12033,
123
+ 12331,
124
+ 12562,
125
+ 13793,
126
+ 14157,
127
+ 14635,
128
+ 15265,
129
+ 15618,
130
+ 16553,
131
+ 16604,
132
+ 18362,
133
+ 18956,
134
+ 20075,
135
+ 21675,
136
+ 22520,
137
+ 26130,
138
+ 26161,
139
+ 26435,
140
+ 28279,
141
+ 29464,
142
+ 31650,
143
+ 32302,
144
+ 32470,
145
+ 36865,
146
+ 42863,
147
+ 47425,
148
+ 49870,
149
+ 50254,
150
+ 50258,
151
+ 50358,
152
+ 50359,
153
+ 50360,
154
+ 50361,
155
+ 50362
156
+ ],
157
+ "torch_dtype": "float32",
158
+ "use_cache": true,
159
+ "use_weighted_layer_sum": false,
160
+ "vocab_size": 51865
161
+ },
162
+ "audio_pool_step": 5,
163
+ "auto_map": {
164
+ "AutoConfig": "configuration_minicpmo.MiniCPMOConfig",
165
+ "AutoModel": "modeling_minicpmo.MiniCPMO",
166
+ "AutoModelForCausalLM": "modeling_minicpmo.MiniCPMO"
167
+ },
168
+ "batch_vision_input": true,
169
+ "bos_token_id": 151643,
170
+ "drop_vision_last_layer": false,
171
+ "eos_token_id": 151645,
172
+ "head_dim": 128,
173
+ "hidden_act": "silu",
174
+ "hidden_size": 4096,
175
+ "image_size": 448,
176
+ "init_audio": true,
177
+ "init_tts": true,
178
+ "init_vision": true,
179
+ "initializer_range": 0.02,
180
+ "intermediate_size": 12288,
181
+ "listen_speak_type": "asr",
182
+ "max_position_embeddings": 40960,
183
+ "max_window_layers": 36,
184
+ "model_type": "minicpmo",
185
+ "num_attention_heads": 32,
186
+ "num_hidden_layers": 36,
187
+ "num_key_value_heads": 8,
188
+ "patch_size": 14,
189
+ "query_num": 64,
190
+ "rms_norm_eps": 1e-06,
191
+ "rope_scaling": null,
192
+ "rope_theta": 1000000,
193
+ "slice_config": {
194
+ "max_slice_nums": 1,
195
+ "model_type": "minicpmv",
196
+ "patch_size": 14,
197
+ "scale_resolution": 448
198
+ },
199
+ "slice_mode": true,
200
+ "sliding_window": null,
201
+ "stream_input": true,
202
+ "tie_word_embeddings": false,
203
+ "torch_dtype": "bfloat16",
204
+ "transformers_version": "4.51.0",
205
+ "tts_config": {
206
+ "_attn_implementation_autoset": true,
207
+ "attention_type": "full_attention",
208
+ "attn_implementation": "sdpa",
209
+ "audio_bos_token_id": 151687,
210
+ "audio_tokenizer_sample_rate": 16000,
211
+ "audio_tokenizer_type": "s3tokenizer",
212
+ "aug_layer_loss_weight": false,
213
+ "aug_loss_weight": false,
214
+ "backbone_model": "llama",
215
+ "condition_type": "hidden_text_merge",
216
+ "cosyvoice_config_path": null,
217
+ "cosyvoice_model_dir": null,
218
+ "filter_tts_loss": false,
219
+ "hidden_act": "silu",
220
+ "hidden_size": 768,
221
+ "interleaved": false,
222
+ "intermediate_size": 3072,
223
+ "llm_dim": 4096,
224
+ "llm_dim_model_base": 256,
225
+ "llm_down_scale": false,
226
+ "llm_hidden_size": 4096,
227
+ "llm_intermediate_size": 768,
228
+ "long_weight": 0.1,
229
+ "max_position_embeddings": 4096,
230
+ "model_type": "minicpmtts",
231
+ "normalize_projected_hidden": true,
232
+ "num_attention_heads": 12,
233
+ "num_audio_tokens": 6562,
234
+ "num_hidden_layers": 20,
235
+ "num_key_value_heads": 12,
236
+ "num_mel_bins": 100,
237
+ "num_text_tokens": 152064,
238
+ "num_vq": 1,
239
+ "projector_type": "mlp",
240
+ "recomputed_chunks": 1,
241
+ "s3_stream_chunk_size": 25,
242
+ "s3_stream_generate": false,
243
+ "s3_stream_n_timesteps": 10,
244
+ "s3_stream_prelook_size": 3,
245
+ "short_weight": 0.1,
246
+ "streaming": false,
247
+ "streaming_audio_chunk_size": 50,
248
+ "streaming_sliding_window": false,
249
+ "streaming_sliding_window_audio_frame_rate": 50,
250
+ "streaming_sliding_window_audio_init_text_length": 10,
251
+ "streaming_sliding_window_audio_window_size": 300,
252
+ "streaming_sliding_window_average_speed": 5,
253
+ "streaming_sliding_window_fast_speed": 7,
254
+ "streaming_sliding_window_max_text_len": 500,
255
+ "streaming_sliding_window_slow_speed": 3,
256
+ "streaming_sliding_window_text_window_size": 50,
257
+ "streaming_text_chunk_max": 7,
258
+ "streaming_text_chunk_min": 3,
259
+ "streaming_text_reserved_len": 300,
260
+ "text_eos_token_id": 151692,
261
+ "tts_filter_loss_fix": false,
262
+ "use_llm_hidden_state": false,
263
+ "use_text": true,
264
+ "window_size": 2
265
+ },
266
+ "use_cache": true,
267
+ "use_image_id": true,
268
+ "use_sliding_window": false,
269
+ "vision_batch_size": 16,
270
+ "vision_config": {
271
+ "_attn_implementation_autoset": true,
272
+ "attention_dropout": 0.0,
273
+ "hidden_act": "gelu_pytorch_tanh",
274
+ "hidden_size": 1152,
275
+ "image_size": 980,
276
+ "intermediate_size": 4304,
277
+ "layer_norm_eps": 1e-06,
278
+ "model_type": "siglip_vision_model",
279
+ "num_attention_heads": 16,
280
+ "num_channels": 3,
281
+ "num_hidden_layers": 27,
282
+ "patch_size": 14
283
+ },
284
+ "vocab_size": 151748
285
+ }
configuration_minicpmo.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright 2026 The OpenBMB Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ from typing import Union
19
+
20
+ from transformers import PretrainedConfig
21
+ from transformers import Qwen3Config
22
+ from transformers import WhisperConfig
23
+ from transformers.utils import logging
24
+
25
+ from .modeling_navit_siglip import SiglipVisionConfig
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+
30
+ class MiniCPMVSliceConfig(PretrainedConfig):
31
+ model_type = "minicpmv"
32
+
33
+ def __init__(
34
+ self,
35
+ patch_size=14,
36
+ max_slice_nums=9,
37
+ scale_resolution=448,
38
+ **kwargs,
39
+ ):
40
+ super().__init__(**kwargs)
41
+ self.patch_size = patch_size
42
+ self.max_slice_nums = max_slice_nums
43
+ self.scale_resolution = scale_resolution
44
+
45
+ @classmethod
46
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
47
+ cls._set_token_in_kwargs(kwargs)
48
+
49
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
50
+
51
+ if config_dict.get("model_type") == "minicpmv":
52
+ config_dict = config_dict["slice_config"]
53
+
54
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
55
+ logger.warning(
56
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
57
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
58
+ )
59
+
60
+ return cls.from_dict(config_dict, **kwargs)
61
+
62
+
63
+ class MiniCPMTTSConfig(PretrainedConfig):
64
+ model_type = "minicpmtts"
65
+
66
+ def __init__(
67
+ self,
68
+ llm_dim: int = 2560,
69
+ llm_intermediate_size: int = 768,
70
+ llm_down_scale: bool = False,
71
+ llm_dim_model_base: int = 256,
72
+ projector_type: str = "mlp",
73
+ hidden_act: str = "silu",
74
+ aug_loss_weight: bool = False,
75
+ aug_layer_loss_weight: bool = False,
76
+ filter_tts_loss: bool = False,
77
+ tts_filter_loss_fix: bool = False,
78
+ long_weight: float = 0.1,
79
+ short_weight: float = 0.1,
80
+ hidden_size: int = 768,
81
+ intermediate_size: int = 3072,
82
+ num_attention_heads: int = 12,
83
+ num_hidden_layers: int = 20,
84
+ num_key_value_heads: int = 12,
85
+ max_position_embeddings: int = 4096,
86
+ num_audio_tokens: int = 4097,
87
+ num_text_tokens: int = 21178,
88
+ num_mel_bins: int = 100,
89
+ num_vq: int = 1,
90
+ use_llm_hidden_state: bool = False,
91
+ audio_bos_token_id: int = 21132,
92
+ text_eos_token_id: int = 21133,
93
+ use_text: bool = True,
94
+ streaming: bool = False,
95
+ streaming_text_chunk_min: int = 3,
96
+ streaming_text_chunk_max: int = 7,
97
+ streaming_text_reserved_len: int = 300,
98
+ streaming_audio_chunk_size: int = 50,
99
+ attn_implementation: str = "sdpa",
100
+ condition_type: str = "llm_hidden",
101
+ backbone_model: str = "llama",
102
+ audio_tokenizer_type: str = "wavtokenizer",
103
+ audio_tokenizer_sample_rate: int = 24000,
104
+ streaming_sliding_window: bool = False,
105
+ streaming_sliding_window_max_text_len: int = 500,
106
+ streaming_sliding_window_average_speed: int = 5,
107
+ streaming_sliding_window_fast_speed: int = 7,
108
+ streaming_sliding_window_slow_speed: int = 3,
109
+ streaming_sliding_window_audio_frame_rate: int = 50,
110
+ streaming_sliding_window_audio_init_text_length: int = 10,
111
+ streaming_sliding_window_audio_window_size: int = 300,
112
+ normalize_projected_hidden: bool = False,
113
+ interleaved: bool = False,
114
+ attention_type: str = "sliding_recompute",
115
+ recomputed_chunks: int = 1,
116
+ window_size: int = 2,
117
+ **kwargs,
118
+ ):
119
+ super().__init__(**kwargs)
120
+
121
+ self.llm_dim = llm_dim
122
+ self.llm_hidden_size = llm_dim
123
+ self.llm_intermediate_size = llm_intermediate_size
124
+ self.llm_down_scale = llm_down_scale
125
+ self.llm_dim_model_base = llm_dim_model_base
126
+ self.projector_type = projector_type
127
+ self.aug_loss_weight = aug_loss_weight
128
+ self.aug_layer_loss_weight = aug_layer_loss_weight
129
+ self.tts_filter_loss_fix = tts_filter_loss_fix
130
+ self.filter_tts_loss = filter_tts_loss
131
+ self.long_weight = long_weight
132
+ self.short_weight = short_weight
133
+ self.hidden_act = hidden_act
134
+
135
+ self.hidden_size = hidden_size
136
+ self.intermediate_size = intermediate_size
137
+ self.num_attention_heads = num_attention_heads
138
+ self.num_hidden_layers = num_hidden_layers
139
+ self.num_key_value_heads = num_key_value_heads
140
+ self.max_position_embeddings = max_position_embeddings
141
+ self.num_audio_tokens = num_audio_tokens
142
+ self.num_text_tokens = num_text_tokens
143
+ self.num_mel_bins = num_mel_bins
144
+ self.num_vq = num_vq
145
+ self.use_llm_hidden_state = use_llm_hidden_state
146
+ self.audio_bos_token_id = audio_bos_token_id
147
+ self.text_eos_token_id = text_eos_token_id
148
+ self.use_text = use_text
149
+ self.streaming = streaming
150
+ self.streaming_text_chunk_min = streaming_text_chunk_min
151
+ self.streaming_text_chunk_max = streaming_text_chunk_max
152
+ self.streaming_text_reserved_len = streaming_text_reserved_len
153
+ self.streaming_audio_chunk_size = streaming_audio_chunk_size
154
+ self.attn_implementation = attn_implementation
155
+ self.condition_type = condition_type
156
+ self.backbone_model = backbone_model
157
+ self.audio_tokenizer_type = audio_tokenizer_type
158
+ self.audio_tokenizer_sample_rate = audio_tokenizer_sample_rate
159
+
160
+ self.streaming_sliding_window = streaming_sliding_window
161
+ self.streaming_sliding_window_max_text_len = streaming_sliding_window_max_text_len
162
+ self.streaming_sliding_window_average_speed = streaming_sliding_window_average_speed
163
+ self.streaming_sliding_window_fast_speed = streaming_sliding_window_fast_speed
164
+ self.streaming_sliding_window_slow_speed = streaming_sliding_window_slow_speed
165
+ self.streaming_sliding_window_audio_frame_rate = streaming_sliding_window_audio_frame_rate
166
+ self.streaming_sliding_window_audio_init_text_length = streaming_sliding_window_audio_init_text_length
167
+ self.streaming_sliding_window_audio_window_size = streaming_sliding_window_audio_window_size
168
+
169
+ self.normalize_projected_hidden = normalize_projected_hidden
170
+
171
+ self.interleaved = interleaved
172
+ self.attention_type = attention_type
173
+ self.recomputed_chunks = recomputed_chunks
174
+ self.window_size = window_size
175
+
176
+
177
class MiniCPMOConfig(Qwen3Config):
    """Configuration for the MiniCPM-o omni-modal model.

    Extends the Qwen3 LLM configuration with nested sub-configurations for
    the vision tower (SigLIP), the audio encoder (Whisper) and the TTS head,
    plus options controlling image slicing and audio chunking.
    """

    model_type = "minicpmo"
    # dropped from model outputs at inference time
    keys_to_ignore_at_inference = ["past_key_values"]

    # Fallback SigLIP vision-tower settings used when no ``vision_config``
    # is supplied (same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit).
    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 980,
        "intermediate_size": 4304,
        "model_type": "siglip",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }

    def __init__(
        self,
        use_cache=True,  # enable KV caching in the LLM backbone
        query_num=64,  # number of resampler query tokens per image — TODO confirm against modeling code
        image_size=448,
        drop_vision_last_layer=True,  # presumably drops the final vision encoder layer — verify in modeling code
        batch_vision_input=True,
        slice_config=None,  # dict of MiniCPMVSliceConfig kwargs, or None for max_slice_nums=1
        vision_config=None,  # dict / SiglipVisionConfig instance, or None for default_vision_config
        audio_config=None,  # dict / WhisperConfig instance, or None for WhisperConfig defaults
        tts_config=None,  # dict / MiniCPMTTSConfig instance, or None for defaults
        use_image_id=True,
        vision_batch_size=16,
        audio_pool_step=5,  # assumed pooling stride over audio features — TODO confirm units
        audio_chunk_length=1.0,  # presumably seconds per audio chunk — verify against the processor
        stream_input=False,
        listen_speak_type="asr",
        init_vision=True,  # whether to instantiate the vision branch
        init_audio=True,  # whether to instantiate the audio branch
        init_tts=True,  # whether to instantiate the TTS branch
        **kwargs,
    ):
        self.use_cache = use_cache
        self.query_num = query_num
        self.image_size = image_size
        self.drop_vision_last_layer = drop_vision_last_layer
        self.batch_vision_input = batch_vision_input
        self.use_image_id = use_image_id
        self.vision_batch_size = vision_batch_size
        self.audio_pool_step = audio_pool_step
        self.audio_chunk_length = audio_chunk_length
        self.stream_input = stream_input
        self.listen_speak_type = listen_speak_type

        self.init_vision = init_vision
        self.init_audio = init_audio
        self.init_tts = init_tts

        if slice_config is None:
            self.slice_config = MiniCPMVSliceConfig(max_slice_nums=1)
        else:
            self.slice_config = MiniCPMVSliceConfig(**slice_config)
        # Slicing is unconditionally enabled; slice_config only tunes it.
        self.slice_mode = True

        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
        if vision_config is None:
            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = SiglipVisionConfig(**vision_config)
        elif isinstance(vision_config, SiglipVisionConfig):
            self.vision_config = vision_config
        # NOTE(review): any other type falls through and leaves self.vision_config
        # unset (the same applies to audio_config / tts_config below), which would
        # surface later as an AttributeError — consider raising here instead.

        if audio_config is None:
            self.audio_config = WhisperConfig()
        elif isinstance(audio_config, dict):
            self.audio_config = WhisperConfig(**audio_config)
        elif isinstance(audio_config, WhisperConfig):
            self.audio_config = audio_config

        if tts_config is None:
            self.tts_config = MiniCPMTTSConfig()
        elif isinstance(tts_config, dict):
            self.tts_config = MiniCPMTTSConfig(**tts_config)
        elif isinstance(tts_config, MiniCPMTTSConfig):
            self.tts_config = tts_config

        # Convenience mirror of the vision tower's patch size.
        self.patch_size = self.vision_config.patch_size

        # The Qwen3Config/PretrainedConfig init runs last so the attributes
        # above are already in place when kwargs are processed.
        super().__init__(**kwargs)
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95
12
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30c40b9a10386c1bc404568d8829e5aada2e3501d9de6fb46ff80451aff7e077
3
+ size 5273477136
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe0faef420aca8f771bec1f9fcfaae01206c56b8e64e4429a1ff500c962fbf96
3
+ size 5301855080
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d0b20153f9bfa88ebcd1cb6bbd5b7cac4c217b3cafb137a291a1b380d8b7821
3
+ size 5301855048
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f61addf4747c94fedcaee059e5d9918ed15543beec494404139a99f2f86c9b31
3
+ size 2866549964
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_minicpmo.py ADDED
The diff for this file is too large to render. See raw diff
 
modeling_navit_siglip.py ADDED
@@ -0,0 +1,981 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch Siglip model."""
16
+ # Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
17
+
18
+
19
+ import math
20
+ import os
21
+ import warnings
22
+ from dataclasses import dataclass
23
+ from typing import Optional
24
+ from typing import Tuple
25
+ from typing import Union
26
+
27
+ import numpy as np
28
+ import torch
29
+ import torch.nn.functional as F
30
+ import torch.utils.checkpoint
31
+ from torch import nn
32
+ from torch.nn.init import _calculate_fan_in_and_fan_out
33
+ from transformers.activations import ACT2FN
34
+ from transformers.configuration_utils import PretrainedConfig
35
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
36
+ from transformers.modeling_outputs import BaseModelOutput
37
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.utils import add_start_docstrings
40
+ from transformers.utils import add_start_docstrings_to_model_forward
41
+ from transformers.utils import is_flash_attn_2_available
42
+ from transformers.utils import logging
43
+ from transformers.utils import ModelOutput
44
+ from transformers.utils import replace_return_docstrings
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+
49
class SiglipVisionConfig(PretrainedConfig):
    r"""Stores the configuration of a [`SiglipVisionModel`].

    Instantiating with the defaults yields a configuration similar to the
    vision encoder of
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224).
    Configuration objects inherit from [`PretrainedConfig`]; see its
    documentation for the common options.

    Args:
        hidden_size (`int`, *optional*, defaults to 768): encoder/pooler dimensionality.
        intermediate_size (`int`, *optional*, defaults to 3072): feed-forward layer width.
        num_hidden_layers (`int`, *optional*, defaults to 12): number of encoder layers.
        num_attention_heads (`int`, *optional*, defaults to 12): attention heads per layer.
        num_channels (`int`, *optional*, defaults to 3): input image channels.
        image_size (`int`, *optional*, defaults to 224): input image resolution.
        patch_size (`int`, *optional*, defaults to 16): patch resolution.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            non-linear activation used in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06): layer-norm epsilon.
        attention_dropout (`float`, *optional*, defaults to 0.0): attention dropout ratio.

    Example:
    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> configuration = SiglipVisionConfig()
    >>> model = SiglipVisionModel(configuration)
    >>> configuration = model.config
    ```"""

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Transformer geometry.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        # Image / patch geometry.
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        # Numerics and regularization.
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A full SiglipConfig nests the vision settings under "vision_config".
        if config_dict.get("model_type") == "siglip":
            config_dict = config_dict["vision_config"]

        type_mismatch = (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        )
        if type_mismatch:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
136
+
137
+
138
# Checkpoint name used in generated API docs.
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip
]

# flash-attn is an optional dependency; these names are only needed by the
# flash-attention code path, so the import is guarded.
if is_flash_attn_2_available():
    from flash_attn import flash_attn_func
    from flash_attn import flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis  # noqa
    from flash_attn.bert_padding import pad_input
    from flash_attn.bert_padding import unpad_input
151
+
152
+
153
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    """Compute unpadding metadata for flash-attention varlen kernels.

    Args:
        attention_mask: (batch, seq_len) tensor with 1 for real tokens and
            0 for padding.

    Returns:
        Tuple of ``(indices, cu_seqlens, max_seqlen_in_batch)``:
        - indices: flat positions of non-padding tokens in the flattened mask.
        - cu_seqlens: (batch + 1,) int32 cumulative sequence lengths, starting at 0.
        - max_seqlen_in_batch: int, longest unpadded sequence in the batch.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    # fix: `torch.torch.int32` -> `torch.int32` (the doubled attribute access
    # happened to work only because torch re-exports itself).
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )
164
+
165
+
166
def _trunc_normal_(tensor, mean, std, a, b):
    """Fill ``tensor`` in-place with truncated-normal samples (inverse-CDF method).

    Samples are drawn uniformly from the CDF image of ``[a, b]`` and mapped
    back through the inverse normal CDF, then scaled/shifted to ``mean``/``std``.

    Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    """

    def norm_cdf(x):
        # Standard normal cumulative distribution function.
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal.
    if tensor.dtype in [torch.float16, torch.bfloat16]:
        # `erfinv_` is not defined for float16 on CPU / bfloat16 on some GPUs:
        # compute in float32 and copy the result back in-place.
        # fix: the previous `tensor = tensor.to(...)` rebound the local name, so
        # erfinv/mul/add/clamp were applied to throwaway copies and the caller's
        # tensor kept only the uniform fill.
        tensor.copy_(tensor.to(torch.float32).erfinv_())
    else:
        tensor.erfinv_()

    # Transform to proper mean, std.
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure the values are in the proper range.
    if tensor.dtype == torch.float16:
        # The `clamp_` op is not defined for float16 on CPU; clamp in float32
        # and copy back in-place.
        tensor.copy_(tensor.to(torch.float32).clamp_(min=a, max=b))
    else:
        tensor.clamp_(min=a, max=b)
213
+
214
+
215
def trunc_normal_tf_(
    tensor: torch.Tensor,
    mean: float = 0.0,
    std: float = 1.0,
    a: float = -2.0,
    b: float = 2.0,
) -> None:
    r"""Fill the input Tensor in-place with values drawn from a truncated
    normal distribution.

    The values are effectively drawn from
    :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside
    :math:`[a, b]` redrawn until they are within the bounds. The method used
    for generating the random values works best when
    :math:`a \leq \text{mean} \leq b`.

    NOTE: this 'tf' variant behaves closer to the Tensorflow / JAX
    implementations, where the bounds ``[a, b]`` are applied when sampling the
    standard normal (mean=0, std=1.0) and the result is subsequently scaled
    and shifted by the ``mean`` and ``std`` args.

    Args:
        tensor: an n-dimensional `torch.Tensor`, modified in place (returns None)
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        # Sample standard truncated normal, then scale/shift in place.
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)
241
+
242
+
243
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    """Initialize ``tensor`` in-place with variance-scaling init (TF/JAX style).

    The target variance is ``scale / denom`` where ``denom`` is chosen by
    ``mode`` ("fan_in", "fan_out" or "fan_avg").

    Args:
        tensor: an n-dimensional `torch.Tensor`, modified in place.
        scale: scaling factor for the variance.
        mode: one of "fan_in", "fan_out", "fan_avg".
        distribution: one of "truncated_normal", "normal", "uniform".

    Raises:
        ValueError: if ``mode`` or ``distribution`` is not recognized.
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2
    else:
        # fix: an unknown mode previously fell through and crashed later with
        # an opaque NameError on `denom`; fail fast with a clear message.
        raise ValueError(f"invalid mode {mode}")

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")
266
+
267
+
268
def lecun_normal_(tensor):
    """Initialize ``tensor`` in-place with LeCun init: fan-in scaled truncated normal."""
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
270
+
271
+
272
def default_flax_embed_init(tensor):
    """Initialize ``tensor`` in-place like Flax's default embedding init: fan-in scaled normal."""
    variance_scaling_(tensor, mode="fan_in", distribution="normal")
274
+
275
+
276
@dataclass
# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
class SiglipVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    # annotation fix: default is None, so the field must be Optional
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
301
+
302
+
303
class SiglipVisionEmbeddings(nn.Module):
    """Patch + 2D-aware position embeddings supporting variable-resolution
    (NaViT-style) padded inputs."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Non-overlapping patchification: kernel == stride == patch_size.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        # The position table is a square grid of
        # num_patches_per_side x num_patches_per_side learned entries.
        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        patch_attention_mask: torch.BoolTensor,
        tgt_sizes: Optional[torch.IntTensor] = None,
    ) -> torch.Tensor:
        """Embed a padded batch of images.

        Args:
            pixel_values: (batch, channels, H, W) padded pixel batch.
            patch_attention_mask: per-sample boolean mask over the padded
                patch grid; True marks real (non-padding) patches.
            tgt_sizes: optional per-sample (nb_patches_h, nb_patches_w);
                when absent, the patch counts are recovered from the mask's
                first row/column sums.

        Returns:
            (batch, num_padded_patches, embed_dim) patch embeddings with
            position embeddings added.
        """
        batch_size = pixel_values.size(0)

        patch_embeds = self.patch_embedding(pixel_values)
        # (B, C, h, w) -> (B, h*w, C)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
        max_nb_patches_h, max_nb_patches_w = (
            max_im_h // self.patch_size,
            max_im_w // self.patch_size,
        )
        # Bucket edges partitioning [0, 1) into num_patches_per_side bins;
        # each image's fractional patch coordinates are snapped to this grid
        # so any resolution maps onto the square position table.
        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
        # Built on CPU (no device given); padding positions keep id 0.
        position_ids = torch.full(
            size=(
                batch_size,
                max_nb_patches_h * max_nb_patches_w,
            ),
            fill_value=0,
        )

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            if tgt_sizes is not None:
                nb_patches_h = tgt_sizes[batch_idx][0]
                nb_patches_w = tgt_sizes[batch_idx][1]
            else:
                # Recover the real patch grid from the mask's first
                # column / first row.
                nb_patches_h = p_attn_mask[:, 0].sum()
                nb_patches_w = p_attn_mask[0].sum()

            # Fractional coordinates in [0, 1) for each real patch row/col.
            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)

            # Flattened row-major ids into the square position table.
            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
            # Mask moved to CPU to index the CPU-resident position_ids.
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)

        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings
370
+
371
+
372
class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # Standard 1/sqrt(head_dim) attention scaling.
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Input shape: Batch x Time x Channel

        Eager (matmul + softmax) self-attention.

        Args:
            hidden_states: (batch, seq_len, embed_dim) input features.
            attention_mask: optional additive mask of shape
                (batch, 1, seq_len, seq_len), added to the attention logits.
            output_attentions: accepted for interface compatibility; the
                attention weights are always computed and returned here.

        Returns:
            Tuple of (attn_output, attn_weights).
        """

        batch_size, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (B, T, E) -> (B, num_heads, T, head_dim)
        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # (B, num_heads, T, head_dim) -> (B, T, E)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
446
+
447
+
448
+ class SiglipFlashAttention2(SiglipAttention):
449
+ """
450
+ Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
451
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
452
+ flash attention and deal with padding tokens in case the input contains any of them.
453
+ """
454
+
455
+ def __init__(self, *args, **kwargs):
456
+ super().__init__(*args, **kwargs)
457
+ self.is_causal = False # Hack to make sure we don't use a causal mask
458
+
459
+ def forward(
460
+ self,
461
+ hidden_states: torch.Tensor,
462
+ attention_mask: Optional[torch.LongTensor] = None,
463
+ position_ids: Optional[torch.LongTensor] = None,
464
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
465
+ output_attentions: bool = False,
466
+ use_cache: bool = False,
467
+ **kwargs,
468
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
469
+ output_attentions = False
470
+
471
+ bsz, q_len, _ = hidden_states.size()
472
+
473
+ query_states = self.q_proj(hidden_states)
474
+ key_states = self.k_proj(hidden_states)
475
+ value_states = self.v_proj(hidden_states)
476
+
477
+ # Flash attention requires the input to have the shape
478
+ # batch_size x seq_length x head_dim x hidden_dim
479
+ # therefore we just need to keep the original shape
480
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
481
+ key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
482
+ value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
483
+
484
+ kv_seq_len = key_states.shape[-2]
485
+ if past_key_value is not None:
486
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
487
+ # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
488
+ # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
489
+
490
+ # if past_key_value is not None:
491
+ # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
492
+ # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
493
+
494
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
495
+ # to be able to avoid many of these transpose/reshape/view.
496
+ query_states = query_states.transpose(1, 2)
497
+ key_states = key_states.transpose(1, 2)
498
+ value_states = value_states.transpose(1, 2)
499
+
500
+ dropout_rate = self.dropout if self.training else 0.0
501
+
502
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
503
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
504
+ # cast them back in the correct dtype just to be sure everything works as expected.
505
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
506
+ # in fp32. (LlamaRMSNorm handles it correctly)
507
+
508
+ input_dtype = query_states.dtype
509
+ if input_dtype == torch.float32:
510
+ if torch.is_autocast_enabled():
511
+ target_dtype = torch.get_autocast_gpu_dtype()
512
+ # Handle the case where the model is quantized
513
+ elif hasattr(self.config, "_pre_quantization_dtype"):
514
+ target_dtype = self.config._pre_quantization_dtype
515
+ else:
516
+ target_dtype = self.q_proj.weight.dtype
517
+
518
+ logger.warning_once(
519
+ "The input hidden states seems to be silently casted in float32, this might be related to the fact"
520
+ " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
521
+ f" {target_dtype}."
522
+ )
523
+
524
+ query_states = query_states.to(target_dtype)
525
+ key_states = key_states.to(target_dtype)
526
+ value_states = value_states.to(target_dtype)
527
+
528
+ attn_output = self._flash_attention_forward(
529
+ query_states,
530
+ key_states,
531
+ value_states,
532
+ attention_mask,
533
+ q_len,
534
+ dropout=dropout_rate,
535
+ )
536
+
537
+ attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
538
+ attn_output = self.out_proj(attn_output)
539
+
540
+ if not output_attentions:
541
+ attn_weights = None
542
+
543
+ return attn_output, attn_weights
544
+
545
    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """

        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
        causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            # Padded path: strip pad positions, run the varlen kernel on the
            # packed tokens, then scatter results back to the padded layout.
            batch_size = query_states.shape[0]
            (
                query_states,
                key_states,
                value_states,
                indices_q,
                cu_seq_lens,
                max_seq_lens,
            ) = self._upad_input(query_states, key_states, value_states, attention_mask, query_length)

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            # No padding anywhere: the dense kernel is cheaper (skips _upad_input).
            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states,
                dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

        return attn_output
617
+
618
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """Strip padded positions so the flash-attn varlen kernel only sees real tokens.

        Inputs are laid out [batch, seq, heads, head_dim]; returns flattened
        (unpadded) q/k/v together with the index / cumulative-seqlen bookkeeping
        that `flash_attn_varlen_func` and `pad_input` need to restore the batch.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # Gather only the non-padded key/value rows.
        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        if query_length == kv_seq_len:
            # Queries share the key layout, so the key indices can be reused.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # Single-token step: exactly one query per batch element.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
658
+
659
+
660
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    """Position-wise feed-forward block: Linear -> activation -> Linear."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Activation is chosen by name from the config (e.g. "gelu_pytorch_tanh").
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Expand to the intermediate size, apply the activation, and project back."""
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
674
+
675
+
676
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    """A single pre-norm transformer block: self-attention then MLP, each wrapped in a residual."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        # Pick the attention implementation requested by the config.
        if self._use_flash_attention_2:
            self.self_attn = SiglipFlashAttention2(config)
        else:
            self.self_attn = SiglipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        # Attention sub-block (pre-norm + residual).
        normed = self.layer_norm1(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block (pre-norm + residual).
        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
724
+
725
+
726
class SiglipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        # NOTE: branch order matters — the specific module classes must be
        # matched before the generic nn.Linear / nn.Embedding fallbacks below.
        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            # Standard-normal weights, zero biases for all attention projections.
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            # Biases are near-zero but not exactly zero (flax parity).
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
765
+
766
+
767
+ SIGLIP_START_DOCSTRING = r"""
768
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
769
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
770
+ etc.)
771
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
772
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
773
+ and behavior.
774
+ Parameters:
775
+ config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
776
+ Initializing with a config file does not load the weights associated with the model, only the
777
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
778
+ """
779
+
780
+
781
+ SIGLIP_VISION_INPUTS_DOCSTRING = r"""
782
+ Args:
783
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
784
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
785
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
786
+ output_attentions (`bool`, *optional*):
787
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
788
+ tensors for more detail.
789
+ output_hidden_states (`bool`, *optional*):
790
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
791
+ more detail.
792
+ return_dict (`bool`, *optional*):
793
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
794
+ """
795
+
796
+
797
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].
    Args:
        config: SiglipConfig
    """

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # Toggled by PreTrainedModel.gradient_checkpointing_enable().
        self.gradient_checkpointing = False

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # Fall back to config defaults for any unspecified flag.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            # Hidden states are recorded *before* each layer; the final state
            # is appended after the loop, so len(encoder_states) == num_layers + 1.
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                # Trade compute for memory: recompute activations on backward.
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )
883
+
884
+
885
@add_start_docstrings(
    """The vision model from SigLIP without any head or projection on top.""",
    SIGLIP_START_DOCSTRING,
)
class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True
    _no_split_modules = []

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the patch-embedding convolution used to embed pixel values."""
        return self.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
        tgt_sizes: Optional[torch.IntTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            # No mask supplied: assume every patch position is valid.
            patch_attention_mask = torch.ones(
                size=(
                    batch_size,
                    pixel_values.size(2) // self.config.patch_size,
                    pixel_values.size(3) // self.config.patch_size,
                ),
                dtype=torch.bool,
                device=pixel_values.device,
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values,
            patch_attention_mask=patch_attention_mask,
            tgt_sizes=tgt_sizes,
        )

        # Flatten the 2-D patch mask to one position per sequence element.
        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # The call to `_upad_input` in `_flash_attention_forward` is expensive
        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
        if not torch.any(~patch_attention_mask):
            attention_mask = None
        else:
            # Eager/SDPA attention wants an additive 4-D mask; flash-attn wants
            # the raw boolean padding mask.
            attention_mask = (
                _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
                if not self._use_flash_attention_2
                else patch_attention_mask
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        if not return_dict:
            # Pooler slot is None: this backbone has no pooling head.
            return (last_hidden_state, None) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=None,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
preprocessor_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor_type": "MiniCPMVImageProcessor",
3
+ "feature_extractor_type": "MiniCPMAAudioProcessor",
4
+ "auto_map": {
5
+ "AutoProcessor": "processing_minicpmo.MiniCPMOProcessor",
6
+ "AutoImageProcessor": "processing_minicpmo.MiniCPMVImageProcessor",
7
+ "AutoFeatureExtractor": "processing_minicpmo.MiniCPMAAudioProcessor"
8
+ },
9
+ "processor_class": "MiniCPMOProcessor",
10
+ "max_slice_nums": 9,
11
+ "scale_resolution": 448,
12
+ "patch_size": 14,
13
+ "use_image_id": true,
14
+ "image_feature_size": 64,
15
+ "im_start": "<image>",
16
+ "im_end": "</image>",
17
+ "slice_start": "<slice>",
18
+ "slice_end": "</slice>",
19
+ "unk": "<unk>",
20
+ "im_id_start": "<image_id>",
21
+ "im_id_end": "</image_id>",
22
+ "slice_mode": true,
23
+ "audio_pool_step": 5,
24
+ "norm_mean": [
25
+ 0.5,
26
+ 0.5,
27
+ 0.5
28
+ ],
29
+ "norm_std": [
30
+ 0.5,
31
+ 0.5,
32
+ 0.5
33
+ ],
34
+ "version": 4.5
35
+ }
processing_minicpmo.py ADDED
@@ -0,0 +1,1665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright 2026 The OpenBMB Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import copy
18
+ import math
19
+ import re
20
+ from typing import Any
21
+ from typing import Dict
22
+ from typing import List
23
+ from typing import Optional
24
+ from typing import Tuple
25
+ from typing import Union
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from transformers import AutoImageProcessor
31
+ from transformers.audio_utils import spectrogram
32
+ from transformers.audio_utils import window_function
33
+ from transformers.image_processing_utils import BaseImageProcessor
34
+ from transformers.image_processing_utils import BatchFeature
35
+ from transformers.image_transforms import to_channel_dimension_format
36
+ from transformers.image_utils import ChannelDimension
37
+ from transformers.image_utils import ImageInput
38
+ from transformers.image_utils import infer_channel_dimension_format
39
+ from transformers.image_utils import is_torch_tensor
40
+ from transformers.image_utils import to_numpy_array
41
+ from transformers.image_utils import valid_images
42
+ from transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor
43
+ from transformers.processing_utils import ProcessorMixin
44
+ from transformers.tokenization_utils_base import PreTokenizedInput
45
+ from transformers.tokenization_utils_base import TextInput
46
+ from transformers.utils import is_torch_device
47
+ from transformers.utils import is_torch_dtype
48
+ from transformers.utils import requires_backends
49
+ from transformers.utils import TensorType
50
+
51
+
52
def recursive_converter(converter, value):
    """Apply *converter* to every non-list element of *value*, preserving list nesting."""
    if not isinstance(value, list):
        return converter(value)
    return [recursive_converter(converter, item) for item in value]
60
+
61
+
62
class MiniCPMOBatchFeature(BatchFeature):
    """Extend from BatchFeature for supporting various image size"""

    def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
        super().__init__(data)
        self.convert_to_tensors(tensor_type=tensor_type)

    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
        """Convert every (possibly nested-list) value to tensors of `tensor_type` in place."""
        if tensor_type is None:
            return self

        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)

        def converter(value):
            try:
                if not is_tensor(value):
                    tensor = as_tensor(value)
                    return tensor
            # NOTE(review): `converter` closes over `key`, which is only bound by
            # the loop below — it must not be called before that loop runs.
            except: # noqa E722
                if key == "overflowing_values":
                    raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
                raise ValueError(
                    "Unable to create tensor, you should probably activate padding "
                    "with 'padding=True' to have batched tensors with the same length."
                )

        for key, value in self.items():
            # recursive_converter walks nested lists so ragged image batches work.
            self[key] = recursive_converter(converter, value)
        return self

    def to(self, *args, **kwargs) -> "MiniCPMOBatchFeature":
        """Move/cast tensor values like `torch.Tensor.to`; non-tensors pass through."""
        requires_backends(self, ["torch"])
        import torch

        def cast_tensor(v):
            if not torch.is_tensor(v):
                return v

            # Only floating-point tensors are dtype-cast (see comment below);
            # integer tensors are at most moved to `device`.
            # NOTE(review): `cast_tensor` closes over `device`, which is bound
            # further down in this method before the conversion loop runs.
            if torch.is_floating_point(v):
                return v.to(*args, **kwargs)
            elif device is not None:
                return v.to(device=device)
            else:
                return v

        new_data = {}
        device = kwargs.get("device")
        if device is None and len(args) > 0:
            # Mirror torch.Tensor.to: the first positional arg may be a dtype
            # (ignored here for non-float tensors) or a device spec.
            arg = args[0]
            if is_torch_dtype(arg):
                pass
            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
                device = arg
            else:
                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")

        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
        for k, v in self.items():
            new_data[k] = recursive_converter(cast_tensor, v)
        self.data = new_data
        return self
123
+
124
+
125
+ class MiniCPMVImageProcessor(BaseImageProcessor):
126
+ model_input_names = ["pixel_values"]
127
+
128
    def __init__(self, max_slice_nums=9, scale_resolution=448, patch_size=14, **kwargs):
        """Configure slicing geometry and the placeholder tokens emitted for images.

        Args:
            max_slice_nums: upper bound on the number of slices per image.
            scale_resolution: target resolution budget used when resizing.
            patch_size: ViT patch size; all resized dims snap to multiples of it.
            **kwargs: placeholder-token overrides, normalization stats, etc.
                (popped below so the remainder can flow to BaseImageProcessor).
        """
        super().__init__(**kwargs)
        self.max_slice_nums = max_slice_nums
        self.scale_resolution = scale_resolution
        self.patch_size = patch_size
        self.use_image_id = kwargs.pop("use_image_id", False)
        # Number of <unk> placeholder tokens emitted per image/slice.
        self.image_feature_size = kwargs.pop("image_feature_size", 64)
        self.im_start_token = kwargs.pop("im_start", "<image>")
        self.im_end_token = kwargs.pop("im_end", "</image>")
        self.slice_start_token = kwargs.pop("slice_start", "<slice>")
        self.slice_end_token = kwargs.pop("slice_end", "</slice>")
        self.unk_token = kwargs.pop("unk", "<unk>")
        self.im_id_start = kwargs.pop("im_id_start", "<image_id>")
        self.im_id_end = kwargs.pop("im_id_end", "</image_id>")
        self.slice_mode = kwargs.pop("slice_mode", True)

        # Per-channel normalization stats (default: map [0,1] to [-1,1]).
        self.mean = np.array(kwargs.pop("norm_mean", [0.5, 0.5, 0.5]))
        self.std = np.array(kwargs.pop("norm_std", [0.5, 0.5, 0.5]))
        self.version = kwargs.pop("version", 2.0)
147
+
148
+ @staticmethod
149
+ def ensure_divide(length, patch_size):
150
+ return max(round(length / patch_size) * patch_size, patch_size)
151
+
152
+ def find_best_resize(self, original_size, scale_resolution, patch_size, allow_upscale=False):
153
+ width, height = original_size
154
+ if (width * height > scale_resolution * scale_resolution) or allow_upscale:
155
+ r = width / height
156
+ height = int(scale_resolution / math.sqrt(r))
157
+ width = int(height * r)
158
+ best_width = self.ensure_divide(width, patch_size)
159
+ best_height = self.ensure_divide(height, patch_size)
160
+ return best_width, best_height
161
+
162
    def get_refine_size(self, original_size, grid, scale_resolution, patch_size, allow_upscale=False):
        """Compute the refined full-image size whose grid cells each satisfy
        `find_best_resize`, so the image can be split evenly into `grid` patches.

        Args:
            original_size: (width, height) of the source image.
            grid: (grid_x, grid_y) slice counts along each axis.
        Returns:
            (width, height) divisible by the grid dimensions.
        """
        width, height = original_size
        grid_x, grid_y = grid

        # Make the full size divisible by the grid so cells are equal-sized.
        refine_width = self.ensure_divide(width, grid_x)
        refine_height = self.ensure_divide(height, grid_y)

        grid_width = refine_width / grid_x
        grid_height = refine_height / grid_y

        # Size one cell, then scale back up by the grid counts.
        best_grid_size = self.find_best_resize(
            (grid_width, grid_height), scale_resolution, patch_size, allow_upscale=allow_upscale
        )
        refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)
        return refine_size
177
+
178
+ @staticmethod
179
+ def split_to_patches(image, grid):
180
+ patches = []
181
+ width, height = image.size
182
+ grid_x = int(width / grid[0])
183
+ grid_y = int(height / grid[1])
184
+ for i in range(0, height, grid_y):
185
+ images = []
186
+ for j in range(0, width, grid_x):
187
+ box = (j, i, j + grid_x, i + grid_y)
188
+ patch = image.crop(box)
189
+ images.append(patch)
190
+ patches.append(images)
191
+ return patches
192
+
193
    def slice_image(self, image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
        """Resize *image* and, when large enough, split it into grid patches.

        Returns:
            (source_image, patches, best_grid): the resized overview image, a
            row-major 2-D list of patch images (empty when no slicing), and the
            chosen (cols, rows) grid or None when the image was not sliced.
        """
        original_size = image.size
        source_image = None
        best_grid = self.get_sliced_grid(original_size, max_slice_nums, never_split)
        patches = []

        if best_grid is None:
            # dont need to slice, upsample
            best_size = self.find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=True)
            source_image = image.resize(best_size, resample=Image.Resampling.BICUBIC)
        else:
            # source image, down-sampling and ensure divided by patch_size
            best_resize = self.find_best_resize(original_size, scale_resolution, patch_size)
            source_image = image.copy().resize(best_resize, resample=Image.Resampling.BICUBIC)
            # A second, higher-resolution copy is cut into the grid patches.
            refine_size = self.get_refine_size(
                original_size, best_grid, scale_resolution, patch_size, allow_upscale=True
            )
            refine_image = image.resize(refine_size, resample=Image.Resampling.BICUBIC)
            patches = self.split_to_patches(refine_image, best_grid)

        return source_image, patches, best_grid
214
+
215
+ def get_grid_placeholder(self, grid):
216
+ if grid is None:
217
+ return ""
218
+ slice_image_placeholder = (
219
+ self.slice_start_token + self.unk_token * self.image_feature_size + self.slice_end_token
220
+ )
221
+
222
+ cols = grid[0]
223
+ rows = grid[1]
224
+ slices = []
225
+ for i in range(rows):
226
+ lines = []
227
+ for j in range(cols):
228
+ lines.append(slice_image_placeholder)
229
+ slices.append("".join(lines))
230
+
231
+ slice_placeholder = "\n".join(slices)
232
+ return slice_placeholder
233
+
234
+ def get_image_id_placeholder(self, idx=0):
235
+ return f"{self.im_id_start}{idx}{self.im_id_end}"
236
+
237
+ def get_sliced_images(self, image, max_slice_nums=None):
238
+ slice_images = []
239
+
240
+ if not self.slice_mode:
241
+ return [image]
242
+
243
+ max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
244
+ assert max_slice_nums > 0
245
+ source_image, patches, sliced_grid = self.slice_image(
246
+ image, max_slice_nums, self.scale_resolution, self.patch_size # default: 9 # default: 448 # default: 14
247
+ )
248
+
249
+ slice_images.append(source_image)
250
+ if len(patches) > 0:
251
+ for i in range(len(patches)):
252
+ for j in range(len(patches[0])):
253
+ slice_images.append(patches[i][j])
254
+ return slice_images
255
+
256
    def get_sliced_grid(self, image_size, max_slice_nums, nerver_split=False):
        """Choose the (cols, rows) slicing grid whose aspect ratio best matches the image.

        Returns None when the image fits the resolution budget (<= 1 slice) or
        splitting is disabled.

        NOTE(review): the parameter name `nerver_split` is a typo for
        `never_split`; it is kept because renaming would break keyword callers.
        """
        original_width, original_height = image_size
        log_ratio = math.log(original_width / original_height)
        # How many scale_resolution^2 tiles the image roughly covers.
        ratio = original_width * original_height / (self.scale_resolution * self.scale_resolution)
        multiple = min(math.ceil(ratio), max_slice_nums)
        if multiple <= 1 or nerver_split:
            return None
        # Consider slice counts adjacent to the estimate (within bounds).
        candidate_split_grids_nums = []
        for i in [multiple - 1, multiple, multiple + 1]:
            if i == 1 or i > max_slice_nums:
                continue
            candidate_split_grids_nums.append(i)

        # Enumerate every (m, n) factorization of each candidate count.
        candidate_grids = []
        for split_grids_nums in candidate_split_grids_nums:
            m = 1
            while m <= split_grids_nums:
                if split_grids_nums % m == 0:
                    candidate_grids.append([m, split_grids_nums // m])
                m += 1

        # Pick the grid whose aspect ratio is closest to the image's (in log space).
        best_grid = [1, 1]
        min_error = float("inf")
        for grid in candidate_grids:
            error = abs(log_ratio - math.log(grid[0] / grid[1]))
            if error < min_error:
                best_grid = grid
                min_error = error

        return best_grid
286
+
287
def get_slice_image_placeholder(self, image_size, image_idx=0, max_slice_nums=None, use_image_id=None):
    """Build the full text placeholder for one image.

    Layout: [optional image-id marker] + main image placeholder
    + [grid placeholder when slice_mode is on].

    Args:
        image_size: (width, height) of the image, used to pick the slice grid.
        image_idx: index used in the image-id marker.
        max_slice_nums: optional override of self.max_slice_nums (must be > 0).
        use_image_id: optional override of self.use_image_id.
    """
    limit = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
    assert limit > 0
    grid = self.get_sliced_grid(image_size=image_size, max_slice_nums=limit)

    body = self.im_start_token + self.unk_token * self.image_feature_size + self.im_end_token
    want_id = self.use_image_id if use_image_id is None else bool(use_image_id)
    placeholder = self.get_image_id_placeholder(image_idx) + body if want_id else body

    if self.slice_mode:
        placeholder = placeholder + self.get_grid_placeholder(grid=grid)
    return placeholder
302
+
303
@staticmethod
def to_pil_image(image, rescale=None) -> Image.Image:
    """Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back
    as the last axis if needed.

    Args:
        image (`Image.Image` or `numpy.ndarray` or `torch.Tensor`):
            The image to convert to the PIL Image format.
        rescale (`bool`, *optional*):
            whether to apply the scaling factor (to make pixel values integers between 0 and 255). Will
            default to `True` if the image type is a floating type, `False` otherwise.
    """
    if isinstance(image, Image.Image):
        return image
    if is_torch_tensor(image):
        image = image.numpy()

    if isinstance(image, np.ndarray):
        if rescale is None:
            # Rescale by default when the values are floats (assumed to be in [0, 1]).
            # Check the dtype instead of inspecting image.flat[0], which raises
            # IndexError on empty arrays and is equivalent for non-empty ones.
            rescale = np.issubdtype(image.dtype, np.floating)
        # If the channel has been moved to the first dim, put it back at the end.
        if image.ndim == 3 and image.shape[0] in [1, 3]:
            image = image.transpose(1, 2, 0)
        if rescale:
            image = image * 255
        image = image.astype(np.uint8)
        return Image.fromarray(image)
    # Unknown type: returned unchanged (caller is expected to pass a supported type).
    return image
332
+
333
def reshape_by_patch(self, image):
    """Rearrange a [C, H, W] numpy image into a patch-major layout.

    The image is cut into non-overlapping patch_size x patch_size patches; the
    result places the patches side by side, yielding shape
    [C, patch_size, (H*W)//patch_size].

    Args:
        image: numpy array of shape [C, H, W]; H and W must be multiples of patch_size.

    Returns:
        numpy array of shape [C, patch_size, H*W//patch_size].
    """
    ps = self.patch_size
    tensor = torch.from_numpy(image)
    channels = tensor.size(0)

    # each column of `cols` is one flattened patch
    cols = torch.nn.functional.unfold(tensor, (ps, ps), stride=(ps, ps))
    cols = cols.reshape(channels, ps, ps, -1)
    # move the patch index before the patch column axis, then flatten widthwise
    cols = cols.permute(0, 1, 3, 2).reshape(channels, ps, -1)
    return cols.numpy()
341
+
342
def preprocess(
    self,
    images: Union[Image.Image, List[Image.Image], List[List[Image.Image]]],
    do_pad: Optional[bool] = True,
    max_slice_nums: int = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> MiniCPMOBatchFeature:
    """Convert images into model-ready pixel values with slicing.

    Each image is optionally sliced (see get_sliced_images), normalized with
    self.mean/self.std, converted to channel-first, and rearranged into the
    patch layout expected by the vision encoder (reshape_by_patch).

    Args:
        images: one image, a flat list, or a batch (list of lists) of images.
        do_pad: accepted for API compatibility; not used in this implementation.
        max_slice_nums: optional override of the maximum slice count.
        return_tensors: tensor type forwarded to MiniCPMOBatchFeature.

    Returns:
        MiniCPMOBatchFeature with per-batch lists:
        - "pixel_values": patch-reshaped arrays (one entry per slice)
        - "image_sizes": original (width, height) of each input image
        - "tgt_sizes": per-slice (rows, cols) grid sizes in units of patch_size
    """
    # normalize nesting: single image -> [[img]], flat list -> [list of images]
    if isinstance(images, Image.Image):
        images_list = [[images]]
    elif isinstance(images[0], Image.Image):
        images_list = [images]
    else:
        images_list = images

    new_images_list = []
    image_sizes_list = []
    tgt_sizes_list = []

    for _images in images_list:
        # keep empty placeholders so batch indices stay aligned
        if _images is None or len(_images) == 0:
            new_images_list.append([])
            image_sizes_list.append([])
            tgt_sizes_list.append([])
            continue
        if not valid_images(_images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        _images = [self.to_pil_image(image).convert("RGB") for image in _images]
        # infer channel layout from the first image; assumes a homogeneous batch
        input_data_format = infer_channel_dimension_format(np.array(_images[0]))

        new_images = []
        image_sizes = [image.size for image in _images]
        tgt_sizes = []
        for image in _images:
            image_patches = self.get_sliced_images(image, max_slice_nums)
            # uint8 [0, 255] -> float32 [0, 1]
            image_patches = [to_numpy_array(image).astype(np.float32) / 255 for image in image_patches]
            image_patches = [
                self.normalize(image=image, mean=self.mean, std=self.std, input_data_format=input_data_format)
                for image in image_patches
            ]
            image_patches = [
                to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
                for image in image_patches
            ]
            for slice_image in image_patches:
                new_images.append(self.reshape_by_patch(slice_image))
                # grid size of this slice in patches: (H // patch_size, W // patch_size)
                tgt_sizes.append(
                    np.array((slice_image.shape[1] // self.patch_size, slice_image.shape[2] // self.patch_size))
                )

        if tgt_sizes:
            tgt_sizes = np.vstack(tgt_sizes)

        new_images_list.append(new_images)
        image_sizes_list.append(image_sizes)
        tgt_sizes_list.append(tgt_sizes)
    return MiniCPMOBatchFeature(
        data={"pixel_values": new_images_list, "image_sizes": image_sizes_list, "tgt_sizes": tgt_sizes_list},
        tensor_type=return_tensors,
    )
406
+
407
+
408
+ AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
409
+
410
+
411
def chunk_audio(audio: np.ndarray, max_duration_seconds: int = 30, sample_rate: int = 16000) -> List[np.ndarray]:
    """Split a long waveform into chunks of at most `max_duration_seconds`.

    Args:
        audio: 1-D waveform samples.
        max_duration_seconds: maximum chunk duration in seconds.
        sample_rate: sampling rate of `audio` in Hz.

    Returns:
        List of chunks; the input itself (unchunked) when it already fits,
        otherwise consecutive slices of at most max_len samples (the final
        chunk may be shorter).
    """
    max_len = int(max_duration_seconds * sample_rate)
    if len(audio) <= max_len:
        return [audio]
    return [audio[start : start + max_len] for start in range(0, len(audio), max_len)]
433
+
434
+
435
def process_audio_batch(
    audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
    feature_extractor,
    sampling_rate: int = 16000,
    max_duration_seconds: int = 30,
    return_attention_mask: bool = True,
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
    """Extract mel features for a (possibly nested) batch of waveforms.

    Long waveforms are first split into chunks of at most max_duration_seconds
    (see chunk_audio); each chunk becomes one feature entry.

    Args:
        audios: one waveform, a flat list, or a batch (list of lists) of waveforms.
        feature_extractor: WhisperFeatureExtractor-compatible callable.
        sampling_rate: sampling rate of the waveforms in Hz.
        max_duration_seconds: maximum chunk duration in seconds.
        return_attention_mask: when True, trim each feature to its valid length
            as reported by the extractor's attention mask.

    Returns:
        (audio_features, audio_feature_lens)
        audio_features: [num_chunks, n_mels, max_frames], zero-padded.
        audio_feature_lens: one length tensor per input batch entry.
    """
    # normalize nesting: single waveform -> [[wav]], flat list -> [list]
    if isinstance(audios, np.ndarray):
        nested = [[audios]]
    elif len(audios) > 0 and isinstance(audios[0], np.ndarray):
        nested = [audios]
    else:
        nested = audios

    features = []
    lens_per_batch = []

    for batch in nested:
        lens = []

        for waveform in batch:
            for piece in chunk_audio(waveform, max_duration_seconds, sampling_rate):
                extracted = feature_extractor(
                    piece,
                    sampling_rate=sampling_rate,
                    return_tensors="pt",
                    padding="max_length",
                    return_attention_mask=return_attention_mask,
                )

                feat = extracted["input_features"]  # [1, 80, frames]

                if return_attention_mask:
                    # trim the padded frames away using the reported valid length
                    valid = extracted["attention_mask"].sum(dim=1)  # Tensor([frames])
                    feat = feat[:, :, : valid[0]]
                    lens.append(valid[0])
                else:
                    lens.append(torch.tensor(feat.shape[2]))

                features.append(feat.squeeze(0))  # [80, frames]

        lens_per_batch.append(torch.hstack(lens) if len(lens) > 0 else torch.tensor([]))

    # pad every chunk to the longest one: [num_chunks, 80, max_frames]
    if features:
        stacked = torch.nn.utils.rnn.pad_sequence(
            [feat.transpose(0, 1) for feat in features], batch_first=True, padding_value=0.0
        ).transpose(1, 2)
    else:
        stacked = torch.tensor([])

    return stacked, lens_per_batch
508
+
509
+
510
def regroup_audio_features(
    audio_features: torch.Tensor, audio_feature_lens: List[torch.Tensor], regroup_seconds: int, fps: int = 100
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
    """Concatenate valid audio frames and re-chunk them into fixed-duration segments.

    Args:
        audio_features: [batch, n_mels, frames], zero-padded.
        audio_feature_lens: per-entry valid lengths (tensors or lists of ints).
        regroup_seconds: target segment duration in seconds.
        fps: frames per second of the feature sequence.

    Returns:
        (regrouped_features, regrouped_lens)
        regrouped_features: [num_segments, n_mels, max_seg_frames], zero-padded.
        regrouped_lens: single-element list with the per-segment lengths (int32).
    """
    # flatten the per-entry valid lengths into one list of ints
    lengths: List[int] = []
    for entry in audio_feature_lens:
        if isinstance(entry, torch.Tensor):
            lengths.extend(entry.tolist())
        elif isinstance(entry, list):
            lengths.extend(int(v) for v in entry)

    if not lengths:
        return torch.tensor([]), []

    # strip padding, then join everything into one [n_mels, total_frames] sequence
    valid_parts = [audio_features[idx, :, :length] for idx, length in enumerate(lengths)]
    merged = valid_parts[0] if len(valid_parts) == 1 else torch.cat(valid_parts, dim=1)

    # cut into fixed-size segments (the last one may be shorter)
    seg_frames = int(regroup_seconds * fps)
    segments = [
        merged[:, offset : offset + seg_frames] for offset in range(0, merged.size(1), seg_frames)
    ]
    segments = [seg for seg in segments if seg.size(1) > 0]

    if not segments:
        return torch.tensor([]), []

    # pad segments to a common length and stack: [N, n_mels, max_T]
    seg_lengths = [seg.size(1) for seg in segments]
    padded = torch.nn.utils.rnn.pad_sequence(
        [seg.transpose(0, 1) for seg in segments], batch_first=True, padding_value=0.0
    ).transpose(1, 2)

    return padded, [torch.tensor(seg_lengths, dtype=torch.int32, device=padded.device)]
565
+
566
+
567
class MiniCPMAAudioProcessor(WhisperFeatureExtractor):
    """
    Whisper feature extractor with a configurable log-mel lower-limit strategy.

    On top of WhisperFeatureExtractor:
    - support dynamic_log_norm (Whisper's original max-8dB behaviour, adjustable via dynamic_range_db)
    - or a fixed log_floor_db (e.g. -10dB)
      - needed for the streaming scheme, where a dynamic (max-based) threshold is not available
    - the strategy can be switched mid-stream through set_spac_log_norm
    Two paths (torch / numpy) keep consistent clipping and scaling order:
        log10 -> (dynamic/fixed lower limit clipping) -> (+4)/4
    """

    def __init__(
        self,
        *args,
        dynamic_log_norm: bool = True,
        dynamic_range_db: float = 8.0,
        log_floor_db: float = -10.0,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        # dynamic_log_norm=True: clip at (spectrogram max - dynamic_range_db);
        # False: clip at the fixed log_floor_db instead.
        self.dynamic_log_norm = bool(dynamic_log_norm)
        self.dynamic_range_db = float(dynamic_range_db)
        self.log_floor_db = float(log_floor_db)

    def set_spac_log_norm(
        self,
        dynamic_range_db: Optional[float] = None,
        log_floor_db: Optional[float] = None,
        *,
        inplace: bool = True,
    ) -> "MiniCPMAAudioProcessor":
        """Hot-update the dynamic/fixed lower-limit strategy.

        Args:
            dynamic_range_db: dynamic range (dB, >= 0); passing a value switches the
                instance to dynamic mode (threshold = max - dynamic_range_db).
                None means keep unchanged.
            log_floor_db: fixed log floor (dB, must be <= 0); passing a value switches
                the instance to fixed-floor mode. None means keep unchanged.
                Note: if both arguments are given, the fixed-floor setting wins
                because it is applied last.
            inplace: True directly modifies the current instance; False returns a
                shallow copy with the modification applied.

        Returns:
            self or the new instance (when inplace=False).
        """

        target = self if inplace else copy.copy(self)

        if dynamic_range_db is not None:
            val = float(dynamic_range_db)
            if val < 0:
                raise ValueError("dynamic_range_db must be >= 0.")
            target.dynamic_log_norm = True  # explicitly switch to dynamic mode
            target.dynamic_range_db = val

        if log_floor_db is not None:
            val = float(log_floor_db)
            # usually log10(mel) maximum is not more than ~0dB, floor should be <= 0; loose validation only
            if val > 0:
                raise ValueError("log_floor_db should be <= 0 (log10 scale).")
            target.dynamic_log_norm = False  # explicitly switch to fixed lower-limit mode
            target.log_floor_db = val

        return target

    def _np_extract_fbank_features(self, waveform_batch: np.ndarray, device: str) -> np.ndarray:
        """NumPy version consistent with upstream, but replace max-8dB with configurable dynamic/fixed lower limit clipping."""
        if device != "cpu":
            raise ValueError(
                f"Got device `{device}` for feature extraction, but feature extraction on CUDA accelerator "
                "devices requires torch. Set device='cpu' or install torch."
            )

        log_spec_batch: List[np.ndarray] = []
        for waveform in waveform_batch:
            # generate log10 Mel spectrogram
            log_spec = spectrogram(
                waveform,
                window_function(self.n_fft, "hann"),
                frame_length=self.n_fft,
                hop_length=self.hop_length,
                power=2.0,
                dither=self.dither,
                mel_filters=self.mel_filters,
                log_mel="log10",
            )
            # consistent with upstream: remove the last frame
            log_spec = log_spec[:, :-1]

            # dynamic/fixed lower-limit clipping (the configurable part)
            if self.dynamic_log_norm:
                threshold = log_spec.max() - self.dynamic_range_db
                log_spec = np.maximum(log_spec, threshold)
            else:
                log_spec = np.maximum(log_spec, self.log_floor_db)

            # consistent with Whisper's linear scaling
            log_spec = (log_spec + 4.0) / 4.0

            log_spec_batch.append(log_spec)

        return np.array(log_spec_batch)

    def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
        """Torch version of the mel extraction with the same clipping/scaling as the numpy path."""
        # NOTE(review): `torch` appears to be imported at module level; this guard
        # presumably covers an optional-import pattern elsewhere in the file — confirm.
        if torch is None:
            raise RuntimeError("PyTorch is not installed, cannot compute STFT on GPU.")

        waveform = torch.from_numpy(waveform).to(device, torch.float32)
        window = torch.hann_window(self.n_fft, device=device)

        if self.dither != 0.0:
            # additive dithering, matching the feature extractor's `dither` setting
            waveform = waveform + self.dither * torch.randn_like(waveform)

        stft = torch.stft(waveform, n_fft=self.n_fft, hop_length=self.hop_length, window=window, return_complex=True)
        # drop the last frame (consistent with the numpy path), power spectrum
        magnitudes = stft[..., :-1].abs() ** 2

        mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)  # [n_mels, 1+n_fft//2]
        mel_spec = mel_filters.T @ magnitudes  # [..., n_mels, T]

        log_spec = torch.clamp(mel_spec, min=1e-10).log10()  # <= 0

        if self.dynamic_log_norm:
            if waveform.dim() == 2:
                # batched input: compute the max per sample (over time, then over mel bins)
                max_val_t = log_spec.max(dim=2, keepdim=True)[0]  # over T
                max_val_bt = max_val_t.max(dim=1, keepdim=True)[0]  # over mel
                threshold = max_val_bt - self.dynamic_range_db
                log_spec = torch.maximum(log_spec, threshold)
            else:
                threshold = log_spec.max() - self.dynamic_range_db
                log_spec = torch.maximum(log_spec, threshold)
        else:
            floor_tensor = torch.tensor(self.log_floor_db, dtype=log_spec.dtype, device=log_spec.device)
            log_spec = torch.maximum(log_spec, floor_tensor)

        log_spec = (log_spec + 4.0) / 4.0

        if device != "cpu":
            log_spec = log_spec.detach().cpu()
        return log_spec.numpy()

    def process(self, *args, **kwargs):
        """Alias of __call__ for convenience."""
        return self.__call__(*args, **kwargs)
708
+
709
+
710
class StreamingMelProcessorExact:
    """Strictly offline-equivalent streaming Mel processor.

    - accumulate all historical audio into buffer; use the same feature_extractor to calculate the entire mel after each addition.
    - only output "stable" frames: the frame center does not depend on future (right) context, i.e. center + n_fft//2 <= current buffer length.
    - output the last batch of frames at the end (flush), ensuring complete consistency with offline full-calculation.

    Cost: Each call performs feature extraction on the accumulated buffer (can be optimized to incremental if needed).
    """

    def __init__(
        self,
        feature_extractor: MiniCPMAAudioProcessor,
        chunk_ms: int = 100,
        first_chunk_ms: Optional[int] = None,
        sample_rate: int = 16000,
        n_fft: int = 400,
        hop_length: int = 160,
        n_mels: int = 80,
        cnn_redundancy_ms: int = 10,  # boundary redundancy (given in ms, usually 10ms = 1 frame)
        # sliding window parameters
        enable_sliding_window: bool = False,  # whether to enable sliding window
        slide_trigger_seconds: float = 30.0,  # trigger threshold for sliding window in seconds
        slide_stride_seconds: float = 10.0,  # stride for sliding window in seconds
    ):
        self.feature_extractor = feature_extractor
        self.chunk_ms = chunk_ms
        self.first_chunk_ms = first_chunk_ms if first_chunk_ms is not None else chunk_ms
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels

        self.chunk_samples = int(round(chunk_ms * sample_rate / 1000))
        self.chunk_frames = self.chunk_samples // hop_length
        # align the first chunk to hop_length to avoid frame boundary issues
        hop = self.hop_length
        raw_first_samples = int(round(self.first_chunk_ms * sample_rate / 1000))
        aligned_first = max(hop, (raw_first_samples // hop) * hop)
        self.first_chunk_samples = aligned_first
        self.half_window = n_fft // 2  # required right context for a frame to be "stable"

        # redundancy frames (in frames), usually <= 1 frame: 10ms -> 1 frame
        self.cnn_redundancy_ms = cnn_redundancy_ms
        self.cnn_redundancy_samples = int(cnn_redundancy_ms * sample_rate / 1000)
        self.cnn_redundancy_frames = max(0, self.cnn_redundancy_samples // hop_length)

        # sliding window configuration (trigger mode)
        self.enable_sliding_window = enable_sliding_window
        self.trigger_seconds = slide_trigger_seconds
        self.slide_seconds = slide_stride_seconds

        # shift/base (global frame coordinates)
        self.left_samples_dropped = 0  # samples dropped from the left
        self.base_T = 0  # index of the "global frame" corresponding to mel_full[:, :, 0]

        self.reset()

    def reset(self):
        """Reset all streaming state to start a new stream."""
        self.buffer = np.zeros(0, dtype=np.float32)
        self.last_emitted_T = 0
        self.total_samples_processed = 0
        self.chunk_count = 0
        self.is_first = True
        self.left_samples_dropped = 0
        self.base_T = 0

    def get_chunk_size(self) -> int:
        """Return the expected sample count of the next input chunk."""
        return self.first_chunk_samples if self.is_first else self.chunk_samples

    def get_expected_output_frames(self) -> int:
        # not supported in exact mode
        raise NotImplementedError("get_expected_output_frames is not implemented")

    def _extract_full(self) -> torch.Tensor:
        """Extract mel features for the whole current buffer. Returns [1, 80, T]."""
        # when buffer length is less than n_fft, Whisper's internal STFT would raise an error in center=True and pad mode
        # (pad is greater than input length). There is no stable frame to output in that case, so raise explicitly.
        if len(self.buffer) < self.n_fft:
            raise ValueError(f"buffer length is shorter than n_fft {len(self.buffer)} < {self.n_fft}")
        # if buffer length is less than 5s, use a fixed floor via set_spac_log_norm(log_floor_db=-10)
        if len(self.buffer) < 5 * self.sample_rate:
            # TODO: the thresholds here were chosen empirically; experiments could pick better values,
            # see MiniCPMAAudioProcessor's main implementation
            self.feature_extractor.set_spac_log_norm(log_floor_db=-10)
        # if buffer length is greater than 5s, use the dynamic threshold via set_spac_log_norm(dynamic_range_db=8)
        else:
            self.feature_extractor.set_spac_log_norm(dynamic_range_db=8)
        feats = self.feature_extractor(
            self.buffer,
            sampling_rate=self.sample_rate,
            return_tensors="pt",
            padding=False,
        )
        return feats.input_features  # [1, 80, T]

    def _stable_frames_count(self) -> int:
        """Number of frames whose right context is fully inside the buffer."""
        # number of stable frames = floor((len(buffer) - half_window) / hop) + 1, minimum is 0
        L = int(self.buffer.shape[0])
        if L <= 0:
            return 0
        if L < self.half_window:
            return 0
        return max(0, (L - self.half_window) // self.hop_length + 1)

    def _maybe_slide_buffer(self):
        """Trigger mode sliding window: when the buffer reaches the trigger threshold, slide a fixed length window."""
        if not self.enable_sliding_window:
            return

        sr = self.sample_rate
        hop = self.hop_length
        L = len(self.buffer)

        # convert seconds to samples
        trigger_samples = int(self.trigger_seconds * sr)
        stride_samples = int(self.slide_seconds * sr)

        # check if the trigger threshold is reached
        if L < trigger_samples:
            return

        # calculate the number of samples to drop (fixed sliding stride_samples)
        drop = stride_samples

        # cannot drop the left context that is still needed for subsequent emission
        # in trigger mode, we only need to protect the minimum necessary data
        # i.e. ensure that we do not discard frames that may be needed in the future
        # NOTE(review): last_emitted_local is computed but not used below — confirm intent
        last_emitted_local = self.last_emitted_T - self.base_T

        # only protect necessary context (e.g. the most recent 1 second data)
        min_keep_seconds = 1.0  # keep at least 1 second of data to ensure continuity
        min_keep_samples = int(min_keep_seconds * sr)

        # guard_samples are the minimum samples we must keep
        guard_samples = min(min_keep_samples, L - drop)

        # limit: do not exceed the safe boundary; and align to hop
        max_allowed_drop = max(0, L - guard_samples)
        drop = min(drop, max_allowed_drop)
        drop = (drop // hop) * hop

        if drop <= 0:
            return

        # truly drop & update base
        self.buffer = self.buffer[drop:]
        self.left_samples_dropped += drop
        self.base_T += drop // hop

    def process(self, audio_chunk: np.ndarray, is_last_chunk: bool = False) -> Tuple[torch.Tensor, Dict]:
        """Feed one audio chunk; return (mel_output, info).

        mel_output is [1, 80, frames] — possibly empty when not enough stable
        frames have accumulated yet. info is a diagnostics dict.
        """
        self.chunk_count += 1
        # append to buffer
        if len(self.buffer) == 0:
            self.buffer = audio_chunk.astype(np.float32, copy=True)
        else:
            self.buffer = np.concatenate([self.buffer, audio_chunk.astype(np.float32, copy=True)])

        # sliding window processing
        self._maybe_slide_buffer()

        # full extraction (for the current window)
        mel_full = self._extract_full()
        T_full = mel_full.shape[-1]  # local frames in the current window
        stable_T = min(T_full, self._stable_frames_count())  # local stable frames
        stable_T_global = self.base_T + stable_T  # map to global frame coordinates

        # plan the core frames for the current emission (global coordinates)
        core_start_g = self.last_emitted_T
        core_end_g = core_start_g + self.chunk_frames
        required_stable_g = core_end_g + self.cnn_redundancy_frames

        if stable_T_global >= required_stable_g or is_last_chunk:
            # emit the core frames plus cnn redundancy on both sides
            emit_start_g = max(0, core_start_g - self.cnn_redundancy_frames)
            emit_end_g = core_end_g + self.cnn_redundancy_frames

            # global -> local index
            emit_start = max(0, emit_start_g - self.base_T)
            emit_end = emit_end_g - self.base_T
            emit_start = max(0, min(emit_start, T_full))
            emit_end = max(emit_start, min(emit_end, T_full))

            mel_output = mel_full[:, :, emit_start:emit_end]
            self.last_emitted_T = core_end_g  # only advance the core frame pointer (global)
        else:
            # not enough stable frames yet: emit an empty slice
            mel_output = mel_full[:, :, 0:0]

        self.total_samples_processed += len(audio_chunk)
        self.is_first = False

        info = {
            "type": "exact_chunk",
            "chunk_number": self.chunk_count,
            "emitted_frames": mel_output.shape[-1],
            "stable_T": stable_T,
            "T_full": T_full,
            "base_T": self.base_T,
            "stable_T_global": stable_T_global,
            "buffer_len_samples": int(self.buffer.shape[0]),
            "left_samples_dropped": self.left_samples_dropped,
            "core_start": core_start_g,  # kept original field name; value is global
            "core_end": core_end_g,  # same as above
        }
        return mel_output, info

    def flush(self) -> torch.Tensor:
        """Called when the stream ends, output the remaining unemitted frames, ensuring consistency with offline (calculated by global coordinates)."""
        if len(self.buffer) == 0:
            return torch.zeros(1, 80, 0)

        mel_full = self._extract_full()
        T_local = mel_full.shape[-1]
        T_global = self.base_T + T_local

        if self.last_emitted_T < T_global:
            start_l = max(0, self.last_emitted_T - self.base_T)
            tail = mel_full[:, :, start_l:]
            self.last_emitted_T = T_global
            return tail
        # everything already emitted: empty slice
        return mel_full[:, :, 0:0]

    def get_config(self) -> Dict:
        """Return the static configuration of this processor."""
        return {
            "chunk_ms": self.chunk_ms,
            "first_chunk_ms": self.first_chunk_ms,
            "effective_first_chunk_ms": self.first_chunk_samples / self.sample_rate * 1000.0,
            "sample_rate": self.sample_rate,
            "n_fft": self.n_fft,
            "hop_length": self.hop_length,
            "cnn_redundancy_ms": self.cnn_redundancy_ms,
            "cnn_redundancy_frames": self.cnn_redundancy_frames,
            "enable_sliding_window": self.enable_sliding_window,
            "trigger_seconds": self.trigger_seconds,
            "slide_seconds": self.slide_seconds,
        }

    def get_state(self) -> Dict:
        """Return a lightweight view of the mutable streaming state (no buffer data)."""
        return {
            "chunk_count": self.chunk_count,
            "last_emitted_T": self.last_emitted_T,
            "total_samples_processed": self.total_samples_processed,
            "buffer_len": int(self.buffer.shape[0]),
            "base_T": self.base_T,
            "left_samples_dropped": self.left_samples_dropped,
        }

    def get_snapshot(self) -> Dict:
        """Get a complete state snapshot (including buffer), used for recovery from a fast start.

        Returns:
            A dictionary containing the complete state, which can be used to restore the snapshot
        """
        buffer_copy = self.buffer.copy()
        snapshot = {
            "chunk_count": self.chunk_count,
            "last_emitted_T": self.last_emitted_T,
            "total_samples_processed": self.total_samples_processed,
            "buffer": buffer_copy,
            "base_T": self.base_T,
            "left_samples_dropped": self.left_samples_dropped,
            "is_first": self.is_first,
            # save the state of the feature_extractor (key: ensure determinism of mel feature extraction)
            "fe_dynamic_log_norm": getattr(self.feature_extractor, "dynamic_log_norm", None),
            "fe_dynamic_range_db": getattr(self.feature_extractor, "dynamic_range_db", None),
            "fe_log_floor_db": getattr(self.feature_extractor, "log_floor_db", None),
        }

        return snapshot

    def restore_snapshot(self, snapshot: Dict) -> None:
        """Restore state from a snapshot

        Args:
            snapshot: the snapshot dictionary returned by get_snapshot
        """
        # record the state before restoration
        # NOTE(review): prev_state is collected but not used afterwards — presumably for debugging; confirm
        prev_state = {
            "chunk_count": self.chunk_count,
            "last_emitted_T": self.last_emitted_T,
            "buffer_len": len(self.buffer),
        }

        # restore state
        self.chunk_count = snapshot["chunk_count"]
        self.last_emitted_T = snapshot["last_emitted_T"]
        self.total_samples_processed = snapshot["total_samples_processed"]
        self.buffer = snapshot["buffer"].copy()  # copy buffer
        self.base_T = snapshot["base_T"]
        self.left_samples_dropped = snapshot["left_samples_dropped"]
        self.is_first = snapshot["is_first"]

        # restore the state of the feature_extractor (key: ensure determinism of mel feature extraction)
        if snapshot.get("fe_dynamic_log_norm") is not None:
            self.feature_extractor.dynamic_log_norm = snapshot["fe_dynamic_log_norm"]
        if snapshot.get("fe_dynamic_range_db") is not None:
            self.feature_extractor.dynamic_range_db = snapshot["fe_dynamic_range_db"]
        if snapshot.get("fe_log_floor_db") is not None:
            self.feature_extractor.log_floor_db = snapshot["fe_log_floor_db"]
1005
+
1006
+
1007
+ class MiniCPMOProcessor(ProcessorMixin):
1008
+ attributes = ["image_processor", "audio_processor", "tokenizer"]
1009
+ audio_processor_class = "AutoFeatureExtractor"
1010
+ image_processor_class = "AutoImageProcessor"
1011
+ tokenizer_class = "AutoTokenizer"
1012
+
1013
+ def __init__(self, image_processor=None, audio_processor=None, tokenizer=None, **kwargs):
1014
+ super().__init__(image_processor, audio_processor, tokenizer)
1015
+
1016
+ self.version = image_processor.version if image_processor else None
1017
+ # audio feature pooling step, needs to be consistent with config.audio_pool_step
1018
+ self.pool_step = kwargs.get("audio_pool_step", 5)
1019
+
1020
+ # initialize the streaming audio processor
1021
+ self._streaming_mel_processor = None
1022
+ if audio_processor is not None:
1023
+ self._init_streaming_processor()
1024
+
1025
+ def get_audio_placeholder(
1026
+ self,
1027
+ audio_lens: int,
1028
+ chunk_input: bool = True,
1029
+ chunk_length: int = 1,
1030
+ ) -> str:
1031
+ """
1032
+ Public method to get audio placeholder string for vLLM integration.
1033
+
1034
+ Args:
1035
+ audio_lens: Length of audio in samples
1036
+ chunk_input: Whether to use chunked processing
1037
+ chunk_length: Chunk length in seconds
1038
+
1039
+ Returns:
1040
+ Audio placeholder string
1041
+ """
1042
+ pool_step = self.pool_step
1043
+ feature_lens = math.ceil(audio_lens / self.audio_processor.hop_length)
1044
+
1045
+ feature_lens = (feature_lens - 1) // 2 + 1
1046
+ output_lens = (feature_lens - pool_step) // pool_step + 1
1047
+
1048
+ if chunk_input:
1049
+ fbank_feat_in_chunk = int(chunk_length * 100)
1050
+ cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
1051
+ audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
1052
+ num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
1053
+
1054
+ place_holders = ""
1055
+ total_unk_len = 0
1056
+ for _ in range(num_audio_chunks):
1057
+ unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
1058
+ place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
1059
+ total_unk_len += unk_len
1060
+ audio_placeholder = place_holders
1061
+ else:
1062
+ audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
1063
+
1064
+ return audio_placeholder
1065
+
1066
+ def _init_streaming_processor(
1067
+ self,
1068
+ chunk_ms: int = 100,
1069
+ cnn_redundancy_ms: int = 0,
1070
+ *,
1071
+ mode: str = "exact",
1072
+ first_chunk_ms: Optional[int] = None,
1073
+ enable_sliding_window: bool = False,
1074
+ slide_trigger_seconds: float = 30.0,
1075
+ slide_stride_seconds: float = 10.0,
1076
+ ):
1077
+ """Initialize the streaming processor
1078
+
1079
+ Args:
1080
+ chunk_ms: Chunk size in milliseconds, also the sliding step.
1081
+ cnn_redundancy_ms: CNN boundary redundancy in milliseconds (before and after), 0 means standard mode.
1082
+ mode: streaming processing mode, currently only supports "exact"
1083
+ first_chunk_ms: the size of the first chunk (milliseconds), if not specified, it is the same as chunk_ms
1084
+ enable_sliding_window: whether to enable sliding window (trigger mode)
1085
+ slide_trigger_seconds: trigger threshold for sliding window in seconds
1086
+ slide_stride_seconds: stride for sliding window in seconds
1087
+ """
1088
+ if mode == "exact":
1089
+ self._streaming_mel_processor = StreamingMelProcessorExact(
1090
+ feature_extractor=self.audio_processor,
1091
+ chunk_ms=chunk_ms,
1092
+ first_chunk_ms=first_chunk_ms,
1093
+ sample_rate=16000,
1094
+ cnn_redundancy_ms=cnn_redundancy_ms,
1095
+ enable_sliding_window=enable_sliding_window,
1096
+ slide_trigger_seconds=slide_trigger_seconds,
1097
+ slide_stride_seconds=slide_stride_seconds,
1098
+ )
1099
+ else:
1100
+ raise ValueError(f"Unsupported mode: {mode}, only 'exact' is supported")
1101
+ self._streaming_mode = mode if mode in ["exact"] else ("exact")
1102
+
1103
+ def set_streaming_mode(
1104
+ self,
1105
+ mode: str = "exact",
1106
+ chunk_ms: int = 100,
1107
+ cnn_redundancy_ms: int = 0,
1108
+ *,
1109
+ first_chunk_ms: Optional[int] = None,
1110
+ enable_sliding_window: bool = False,
1111
+ slide_trigger_seconds: float = 30.0,
1112
+ slide_stride_seconds: float = 10.0,
1113
+ ):
1114
+ """Set streaming processing mode
1115
+
1116
+ Args:
1117
+ mode: streaming processing mode, currently only supports "exact"
1118
+ chunk_ms: chunk size in milliseconds, also the sliding step.
1119
+ cnn_redundancy_ms: CNN boundary redundancy in milliseconds (before and after), 0 means standard mode.
1120
+ first_chunk_ms: the size of the first chunk (milliseconds), if not specified, it is the same as chunk_ms
1121
+ enable_sliding_window: whether to enable sliding window (trigger mode)
1122
+ slide_trigger_seconds: trigger threshold for sliding window in seconds
1123
+ slide_stride_seconds: stride for sliding window in seconds
1124
+ """
1125
+ if self.audio_processor is None:
1126
+ raise ValueError("audio_processor is not set, cannot initialize the streaming processor")
1127
+ self._init_streaming_processor(
1128
+ chunk_ms=chunk_ms,
1129
+ cnn_redundancy_ms=cnn_redundancy_ms,
1130
+ mode=mode,
1131
+ first_chunk_ms=first_chunk_ms,
1132
+ enable_sliding_window=enable_sliding_window,
1133
+ slide_trigger_seconds=slide_trigger_seconds,
1134
+ slide_stride_seconds=slide_stride_seconds,
1135
+ )
1136
+
1137
+ def process_image(
1138
+ self,
1139
+ images: Optional[ImageInput] = None,
1140
+ do_pad: bool = True,
1141
+ max_slice_nums: int = 1,
1142
+ return_tensors: str = "pt",
1143
+ ) -> MiniCPMOBatchFeature:
1144
+ """Process image data
1145
+
1146
+ Args:
1147
+ images: input images
1148
+ do_pad: whether to pad
1149
+ max_slice_nums: maximum number of slices
1150
+ return_tensors: return tensor type
1151
+ Returns:
1152
+ MiniCPMOBatchFeature object
1153
+ """
1154
+ if images is None:
1155
+ return MiniCPMOBatchFeature(data={"pixel_values": [[]], "image_sizes": [[]], "tgt_sizes": [[]]})
1156
+
1157
+ result = self.image_processor(
1158
+ images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
1159
+ )
1160
+
1161
+ model_inputs = {
1162
+ "pixel_values": result.get("pixel_values", [[]]),
1163
+ "image_sizes": result.get("image_sizes", [[]]),
1164
+ "tgt_sizes": result.get("tgt_sizes", [[]]),
1165
+ }
1166
+
1167
+ return MiniCPMOBatchFeature(data=model_inputs)
1168
+
1169
+ def process_audio(
1170
+ self,
1171
+ audios: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
1172
+ sampling_rate: int = 16000,
1173
+ regroup_to_seconds: Optional[int] = None,
1174
+ fps: int = 100,
1175
+ ) -> MiniCPMOBatchFeature:
1176
+ """Process audio data in batch
1177
+
1178
+ Args:
1179
+ audios: audio data
1180
+ sampling_rate: sampling rate
1181
+ regroup_to_seconds: regroup duration in seconds
1182
+ fps: frames per second
1183
+ Returns:
1184
+ MiniCPMOBatchFeature object
1185
+ """
1186
+ if audios is None:
1187
+ return MiniCPMOBatchFeature(data={"audio_features": [], "audio_feature_lens": []})
1188
+
1189
+ audio_features, audio_feature_lens = process_audio_batch(
1190
+ audios=audios,
1191
+ feature_extractor=self.audio_processor,
1192
+ sampling_rate=sampling_rate,
1193
+ max_duration_seconds=30,
1194
+ return_attention_mask=True,
1195
+ )
1196
+
1197
+ if regroup_to_seconds is not None and len(audio_features) > 0:
1198
+ audio_features, audio_feature_lens = regroup_audio_features(
1199
+ audio_features=audio_features,
1200
+ audio_feature_lens=audio_feature_lens,
1201
+ regroup_seconds=regroup_to_seconds,
1202
+ fps=fps,
1203
+ )
1204
+
1205
+ model_inputs = {"audio_features": audio_features, "audio_feature_lens": audio_feature_lens}
1206
+
1207
+ return MiniCPMOBatchFeature(data=model_inputs)
1208
+
1209
+ def process_audio_streaming(
1210
+ self,
1211
+ audio_chunk: np.ndarray,
1212
+ reset: bool = False,
1213
+ return_batch_feature: bool = False,
1214
+ is_last_chunk: bool = False,
1215
+ ) -> Union[Tuple[torch.Tensor, dict], MiniCPMOBatchFeature]:
1216
+ """Process audio chunk in streaming
1217
+
1218
+ Args:
1219
+ audio_chunk: audio data chunk (any audio, e.g. first process 125ms, then process 100ms)
1220
+ reset: whether to reset the processor state
1221
+ return_batch_feature: whether to return MiniCPMOBatchFeature format (consistent with process_audio)
1222
+ Returns:
1223
+ If return_batch_feature=False:
1224
+ (audio_features, info)
1225
+ - audio_features: [1, 80, n_frames] mel features
1226
+ - info: processing information dictionary
1227
+ If return_batch_feature=True:
1228
+ MiniCPMOBatchFeature object, containing:
1229
+ - audio_features: [1, 80, n_frames] mel features
1230
+ - audio_feature_lens: [tensor([n_frames])]
1231
+ - info: processing information (as an extra attribute)
1232
+ """
1233
+ if self._streaming_mel_processor is None:
1234
+ raise ValueError("Streaming processor not initialized, please ensure audio_processor is set")
1235
+
1236
+ if reset:
1237
+ self._streaming_mel_processor.reset()
1238
+
1239
+ # process chunk
1240
+ mel_features, info = self._streaming_mel_processor.process(audio_chunk, is_last_chunk=is_last_chunk)
1241
+
1242
+ # determine the return format based on the parameters
1243
+ if return_batch_feature:
1244
+ # return the format consistent with process_audio
1245
+ # note: info returns emitted_frames, which represents the actual output frames
1246
+ n_frames = info.get("emitted_frames", mel_features.shape[-1])
1247
+ model_inputs = {
1248
+ "audio_features": mel_features,
1249
+ "audio_feature_lens": [torch.tensor([n_frames])],
1250
+ "streaming_info": info, # add streaming processing information
1251
+ }
1252
+ return MiniCPMOBatchFeature(data=model_inputs)
1253
+ else:
1254
+ return mel_features, info
1255
+
1256
+ def reset_streaming(self):
1257
+ if self._streaming_mel_processor is not None:
1258
+ self._streaming_mel_processor.reset()
1259
+
1260
+ def get_streaming_chunk_size(self) -> int:
1261
+ if self._streaming_mel_processor is None:
1262
+ raise ValueError("Streaming processor not initialized")
1263
+ return self._streaming_mel_processor.get_chunk_size()
1264
+
1265
+ def configure_streaming(
1266
+ self,
1267
+ chunk_ms: int = 100,
1268
+ enable_sliding_window: bool = False,
1269
+ slide_trigger_seconds: float = 30.0,
1270
+ slide_stride_seconds: float = 10.0,
1271
+ ):
1272
+ """Configure streaming processor parameters
1273
+
1274
+ Args:
1275
+ chunk_ms: chunk size in milliseconds
1276
+ enable_sliding_window: whether to enable sliding window (trigger mode)
1277
+ slide_trigger_seconds: trigger threshold for sliding window in seconds
1278
+ slide_stride_seconds: stride for sliding window in seconds
1279
+ """
1280
+ if self.audio_processor is None:
1281
+ raise ValueError("audio_processor is not set")
1282
+
1283
+ self._init_streaming_processor(
1284
+ chunk_ms=chunk_ms,
1285
+ enable_sliding_window=enable_sliding_window,
1286
+ slide_trigger_seconds=slide_trigger_seconds,
1287
+ slide_stride_seconds=slide_stride_seconds,
1288
+ )
1289
+
1290
+ def get_streaming_config(self) -> dict:
1291
+ if self._streaming_mel_processor is None:
1292
+ return {}
1293
+ return self._streaming_mel_processor.get_config()
1294
+
1295
+ def get_streaming_state(self) -> dict:
1296
+ if self._streaming_mel_processor is None:
1297
+ return {}
1298
+ return self._streaming_mel_processor.get_state()
1299
+
1300
+ def get_streaming_snapshot(self) -> dict:
1301
+ if self._streaming_mel_processor is None:
1302
+ return {}
1303
+ return self._streaming_mel_processor.get_snapshot()
1304
+
1305
+ def restore_streaming_snapshot(self, snapshot: dict) -> None:
1306
+ if self._streaming_mel_processor is None:
1307
+ return
1308
+ if not snapshot:
1309
+ return
1310
+ self._streaming_mel_processor.restore_snapshot(snapshot)
1311
+
1312
+ def __call__(
1313
+ self,
1314
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
1315
+ images: ImageInput = None,
1316
+ audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]] = None,
1317
+ audio_parts: Optional[list] = None,
1318
+ max_length: Optional[int] = None,
1319
+ do_pad: Optional[bool] = True,
1320
+ max_slice_nums: int = None,
1321
+ use_image_id: bool = True,
1322
+ stream_input: bool = False,
1323
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
1324
+ sampling_rate: Optional[int] = 16000,
1325
+ online_streaming: bool = False,
1326
+ audio_chunk_idx: int = 0,
1327
+ is_last_chunk: bool = False,
1328
+ **kwargs,
1329
+ ) -> MiniCPMOBatchFeature:
1330
+ if images is not None:
1331
+ image_inputs = self.process_image(
1332
+ images=images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
1333
+ )
1334
+ else:
1335
+ image_inputs = None
1336
+
1337
+ audio_features, audio_feature_lens, audio_phs = self.audio_feature_extract(
1338
+ audios,
1339
+ audio_parts,
1340
+ stream_input,
1341
+ sampling_rate,
1342
+ online_streaming=online_streaming,
1343
+ is_last_chunk=is_last_chunk,
1344
+ )
1345
+
1346
+ model_inputs = self._convert_omni_to_inputs(
1347
+ image_inputs,
1348
+ audio_phs,
1349
+ text,
1350
+ max_slice_nums=max_slice_nums,
1351
+ use_image_id=use_image_id,
1352
+ max_length=max_length,
1353
+ **kwargs,
1354
+ )
1355
+
1356
+ model_inputs["audio_features"] = audio_features
1357
+ model_inputs["audio_feature_lens"] = audio_feature_lens
1358
+
1359
+ result = MiniCPMOBatchFeature(data={**model_inputs})
1360
+
1361
+ if online_streaming:
1362
+ result.use_extra_context = True
1363
+ result.prefix_extra_frames = 0 if audio_chunk_idx == 0 else 2
1364
+ result.suffix_extra_frames = 2
1365
+ result.chunk_idx = audio_chunk_idx
1366
+
1367
+ return result
1368
+
1369
    def audio_feature_extract(
        self,
        audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]], None] = None,
        audio_parts: Optional[list] = None,
        stream_input: Optional[bool] = False,
        sampling_rate: Optional[int] = None,
        chunk_length: Optional[int] = 1,
        online_streaming: bool = False,
        is_last_chunk: bool = False,
        **kwargs,
    ):
        """Extract mel features and placeholder strings for batched audio.

        Args:
            audios: a single waveform, a list of waveforms (one sample), or a
                list of lists of waveforms (a batch). ``None`` short-circuits.
            audio_parts: optional per-sample part ids; consecutive waveforms
                sharing the same id are concatenated before extraction.
            stream_input: forwarded to ``get_audio_placeholder`` as
                ``chunk_input`` (chunked placeholder layout).
            sampling_rate: input sampling rate in Hz.
                NOTE(review): defaults to None, but ``30 * sampling_rate``
                below requires a number — callers appear to always pass a
                rate; TODO confirm.
            chunk_length: placeholder chunk length in seconds.
            online_streaming: use the incremental streaming extractor
                (single audio per sample only) instead of batch extraction.
            is_last_chunk: streaming-only flag, forwarded to the streaming
                extractor.

        Returns:
            Tuple ``(audio_features, audio_feature_lens_list, audio_ph_list)``:
            the padded feature tensor (or ``[]`` when no audio), per-sample
            frame-length tensors, and per-sample placeholder strings.
        """
        if audios is None:
            return [], [], []

        # Normalize every accepted input shape to a list of lists of waveforms.
        if isinstance(audios, np.ndarray):
            audios_list = [[audios]]
        elif isinstance(audios[0], np.ndarray):
            audios_list = [audios]
        else:
            audios_list = audios

        if audio_parts is not None:
            assert len(audio_parts) == len(audios_list)
            # NOTE(review): the loop variable `audios` shadows the parameter
            # from here on; the parameter is not used again afterwards.
            for parts, audios in zip(audio_parts, audios_list):
                assert len(parts) == len(audios)

        audio_feature_lens_list = []
        audio_ph_list = []
        audio_features_all = []

        # Placeholders are computed per original (un-merged) waveform, so they
        # do not depend on audio_parts.
        for audios in audios_list:
            if audios:
                audio_ph_list.append(
                    [
                        self.get_audio_placeholder(len(a), chunk_input=stream_input, chunk_length=chunk_length)
                        for a in audios
                    ]
                )
            else:
                audio_ph_list.append([])

        for idx, audios in enumerate(audios_list):
            if audio_parts is not None:
                # Merge consecutive waveforms that belong to the same part id.
                audio_part = audio_parts[idx]
                merge_audio = []
                cur_audio = []
                for aid, (part, audio) in enumerate(zip(audio_part, audios)):
                    if aid == 0 or audio_part[aid] == audio_part[aid - 1]:
                        cur_audio.append(audio)
                    else:
                        merge_audio.append(np.hstack(cur_audio))
                        cur_audio = [audio]
                if cur_audio:
                    merge_audio.append(np.hstack(cur_audio))
            else:
                merge_audio = audios

            # If a merged audio exceeds 30 seconds, split it into 30-second chunks.
            final_merge_audio = []
            max_audio_inp_len = 30 * sampling_rate
            for audio in merge_audio:
                if len(audio) <= max_audio_inp_len:
                    final_merge_audio.append(audio)
                else:
                    for i in range(math.ceil(len(audio) / max_audio_inp_len)):
                        final_merge_audio.append(audio[i * max_audio_inp_len : (i + 1) * max_audio_inp_len])

            audio_feature_lens = []

            if audios:
                if online_streaming:
                    # Online streaming supports exactly one audio per sample and
                    # reuses process_audio_streaming's batch-feature output.
                    assert (
                        len(final_merge_audio) == 1
                    ), f"online streaming mode only supports single audio, currently there are {len(final_merge_audio)}"
                    audio = final_merge_audio[0]
                    result = self.process_audio_streaming(
                        audio, reset=False, return_batch_feature=True, is_last_chunk=is_last_chunk
                    )
                    audio_features_all.append(
                        result["audio_features"].squeeze(0)
                    )  # [1, 80, T] -> [80, T], keep consistent with batch processing
                    audio_feature_lens_list.append(result["audio_feature_lens"][0])
                else:
                    # Batch extraction: pad to max length, then trim each
                    # feature back to its true length via the attention mask.
                    audio_inputs = self.audio_processor(
                        final_merge_audio,
                        sampling_rate=sampling_rate,
                        return_attention_mask=True,
                        padding="max_length",
                        return_tensors="pt",
                        **kwargs,
                    )
                    audio_feature = audio_inputs["input_features"]
                    actual_lens = audio_inputs["attention_mask"].sum(dim=1)

                    for feat, lens in zip(audio_feature, actual_lens):
                        audio_features_all.append(feat[:, :lens])
                        audio_feature_lens.append(lens)

                    audio_feature_lens = torch.hstack(audio_feature_lens)
                    audio_feature_lens_list.append(audio_feature_lens)
            else:
                audio_feature_lens_list.append([])

        # Re-pad all collected features (across samples) to one common length.
        if audio_features_all:
            audio_features = [i.permute(1, 0) for i in audio_features_all]
            audio_features = torch.nn.utils.rnn.pad_sequence(
                audio_features, batch_first=True, padding_value=0.0
            ).permute(0, 2, 1)
        else:
            audio_features = []

        return audio_features, audio_feature_lens_list, audio_ph_list
1485
+
1486
+ def _convert(self, input_str, max_inp_length: Optional[int] = None):
1487
+ old_input_ids = self.tokenizer.encode(input_str)
1488
+
1489
+ listen_token_id = self.tokenizer.convert_tokens_to_ids("<|listen|>")
1490
+ input_ids = []
1491
+ for token in old_input_ids:
1492
+ if token != listen_token_id:
1493
+ input_ids.append(token)
1494
+
1495
+ if max_inp_length is not None:
1496
+ input_ids = input_ids[:max_inp_length]
1497
+ input_ids = torch.tensor(input_ids, dtype=torch.int32)
1498
+
1499
+ ## image bound
1500
+ start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id)
1501
+ end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id)
1502
+
1503
+ image_start_idx = torch.where(start_cond)[0]
1504
+ image_start_idx += 1
1505
+ image_end_idx = torch.where(end_cond)[0]
1506
+
1507
+ valid_image_nums = max(len(image_start_idx), len(image_end_idx))
1508
+
1509
+ image_bounds = torch.hstack(
1510
+ [
1511
+ image_start_idx[:valid_image_nums].unsqueeze(-1),
1512
+ image_end_idx[:valid_image_nums].unsqueeze(-1),
1513
+ ]
1514
+ )
1515
+
1516
+ ## audio bound
1517
+ audio_start_idx = torch.where(input_ids == self.tokenizer.audio_start_id)[0]
1518
+ audio_end_idx = torch.where(input_ids == self.tokenizer.audio_end_id)[0]
1519
+ assert len(audio_start_idx) == len(audio_end_idx)
1520
+ audio_bounds = torch.hstack([(audio_start_idx + 1).unsqueeze(-1), audio_end_idx.unsqueeze(-1)])
1521
+
1522
+ spk_start_idx = torch.where(input_ids == self.tokenizer.spk_start_id)[0]
1523
+ spk_end_idx = torch.where(input_ids == self.tokenizer.spk_end_id)[0]
1524
+ assert len(spk_start_idx) == len(spk_end_idx)
1525
+ spk_bounds = torch.hstack([(spk_start_idx + 1).unsqueeze(-1), spk_end_idx.unsqueeze(-1)])
1526
+
1527
+ return input_ids, image_bounds, audio_bounds, spk_bounds
1528
+
1529
    def _convert_omni_to_inputs(
        self,
        images,
        audio_phs,
        texts: Union[str, List[str]],
        truncation=None,
        max_length=None,
        max_slice_nums=None,
        use_image_id=None,
        return_tensors=None,
        **kwargs,
    ):
        """Expand ``<image>./</image>`` / ``<audio>./</audio>`` tags in ``texts``
        into placeholder tokens, tokenize, and left-pad into a batch.

        Args:
            images: output of ``process_image`` (``pixel_values``,
                ``image_sizes``, ``tgt_sizes``) or ``None``.
            audio_phs: per-sample list of audio placeholder strings, aligned
                with the ``<audio>`` tags in each text.
            texts: one prompt or a batch of prompts.
            truncation/max_length: forwarded to the tokenizer (text-only path);
                ``max_length`` also truncates the expanded token ids.
            max_slice_nums, use_image_id: forwarded to the image placeholder
                builder.
            return_tensors: tokenizer tensor type (text-only path).

        Returns:
            Dict with padded ``input_ids``/``attention_mask`` plus image/audio
            tensors and the (pad-shifted) modality bound index lists.
        """
        # Pure-text fast path: no placeholder expansion needed.
        if images is None and audio_phs is None:
            model_inputs = self.tokenizer(
                texts, return_tensors=return_tensors, truncation=truncation, max_length=max_length, **kwargs
            )
            return MiniCPMOBatchFeature(data={**model_inputs})

        # NOTE(review): patterns are used unescaped, so "." is a regex
        # wildcard; exact literal tags still match — confirm no broader
        # matches are possible in practice.
        image_pattern = "<image>./</image>"
        audio_pattern = "<audio>./</audio>"
        # Capturing group makes re.split keep the tag chunks in the output.
        split_pattern = f"({image_pattern}|{audio_pattern})"

        if isinstance(texts, str):
            texts = [texts]

        bs = len(texts)
        if images is not None:
            images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
        else:
            # NOTE: [[]] * bs creates bs references to one shared inner list.
            images, image_sizes, tgt_sizes = [[]] * bs, [[]] * bs, [[]] * bs

        input_ids_list = []
        image_bounds_list = []
        audio_bounds_list = []
        spk_bounds_list = []

        for index, text in enumerate(texts):
            text_chunks = re.split(split_pattern, text)

            image_tags = re.findall(image_pattern, text)
            audio_tags = re.findall(audio_pattern, text)

            # Each tag must have a matching processed image / placeholder.
            if image_tags:
                assert images is not None
                assert len(image_tags) == len(image_sizes[index])
            if audio_tags:
                assert audio_phs is not None
                assert len(audio_tags) == len(audio_phs[index])

            # Replace each tag chunk in order with its placeholder text.
            image_id = 0
            audio_id = 0
            for i, chunk in enumerate(text_chunks):
                if chunk == image_pattern:
                    image_placeholder = self.image_processor.get_slice_image_placeholder(
                        image_sizes[index][image_id], image_id, max_slice_nums, use_image_id
                    )
                    image_id += 1
                    text_chunks[i] = image_placeholder
                elif chunk == audio_pattern:
                    audio_placeholder = audio_phs[index][audio_id]
                    audio_id += 1
                    text_chunks[i] = audio_placeholder

            final_text = "".join(text_chunks)
            input_ids, image_bounds, audio_bounds, spk_bounds = self._convert(final_text, max_length)

            input_ids_list.append(input_ids)
            image_bounds_list.append(image_bounds)
            audio_bounds_list.append(audio_bounds)
            spk_bounds_list.append(spk_bounds)

        # Left-pad the batch, then shift every bound index by the pad amount
        # and mask out the pad positions.
        padded_input_ids, padding_lengths = self.pad(input_ids_list, padding_side="left")
        attention_mask = torch.ones_like(padded_input_ids, dtype=torch.bool)
        for i, length in enumerate(padding_lengths):
            image_bounds_list[i] = image_bounds_list[i] + length
            audio_bounds_list[i] = audio_bounds_list[i] + length
            spk_bounds_list[i] = spk_bounds_list[i] + length
            attention_mask[i, :length] = False

        data = {
            "input_ids": padded_input_ids,
            "attention_mask": attention_mask,
            "pixel_values": images,
            "image_sizes": image_sizes,
            "image_bound": image_bounds_list,
            "tgt_sizes": tgt_sizes,
            "audio_bounds": audio_bounds_list,
            "spk_bounds": spk_bounds_list,
        }

        return data
1620
+
1621
+ def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
1622
+ items = []
1623
+ if isinstance(inputs[0], list):
1624
+ assert isinstance(inputs[0][0], torch.Tensor)
1625
+ for it in inputs:
1626
+ for tr in it:
1627
+ items.append(tr)
1628
+ else:
1629
+ assert isinstance(inputs[0], torch.Tensor)
1630
+ items = inputs
1631
+
1632
+ batch_size = len(items)
1633
+ shape = items[0].shape
1634
+ dim = len(shape)
1635
+ assert dim <= 2
1636
+ if max_length is None:
1637
+ max_length = 0
1638
+ max_length = max(max_length, max(item.shape[-1] for item in items))
1639
+ min_length = min(item.shape[-1] for item in items)
1640
+ dtype = items[0].dtype
1641
+
1642
+ if dim == 0:
1643
+ return torch.stack([item for item in items], dim=0), [0]
1644
+ elif dim == 1:
1645
+ if max_length == min_length:
1646
+ return torch.stack([item for item in items], dim=0), [0] * batch_size
1647
+ tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
1648
+ else:
1649
+ tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
1650
+
1651
+ padding_length = []
1652
+ for i, item in enumerate(items):
1653
+ if dim == 1:
1654
+ if padding_side == "left":
1655
+ tensor[i, -len(item) :] = item.clone()
1656
+ else:
1657
+ tensor[i, : len(item)] = item.clone()
1658
+ elif dim == 2:
1659
+ if padding_side == "left":
1660
+ tensor[i, -len(item) :, :] = item.clone()
1661
+ else:
1662
+ tensor[i, : len(item), :] = item.clone()
1663
+ padding_length.append(tensor.shape[-1] - len(item))
1664
+
1665
+ return tensor, padding_length
special_tokens_map.json ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<image>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "</image>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<ref>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "</ref>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<box>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "</box>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "<quad>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "</quad>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "<point>",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "</point>",
75
+ "lstrip": false,
76
+ "normalized": false,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "<slice>",
82
+ "lstrip": false,
83
+ "normalized": false,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "</slice>",
89
+ "lstrip": false,
90
+ "normalized": false,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "<image_id>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "</image_id>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "<unit>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "</unit>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "<answer>",
124
+ "lstrip": false,
125
+ "normalized": false,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "</answer>",
131
+ "lstrip": false,
132
+ "normalized": false,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "<focus>",
138
+ "lstrip": false,
139
+ "normalized": false,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ },
143
+ {
144
+ "content": "</focus>",
145
+ "lstrip": false,
146
+ "normalized": false,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ },
150
+ {
151
+ "content": "<line>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false
156
+ },
157
+ {
158
+ "content": "</line>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false
163
+ },
164
+ {
165
+ "content": "<perception>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false
170
+ },
171
+ {
172
+ "content": "</perception>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false
177
+ },
178
+ {
179
+ "content": "<source_image>",
180
+ "lstrip": false,
181
+ "normalized": false,
182
+ "rstrip": false,
183
+ "single_word": false
184
+ },
185
+ {
186
+ "content": "</source_image>",
187
+ "lstrip": false,
188
+ "normalized": false,
189
+ "rstrip": false,
190
+ "single_word": false
191
+ },
192
+ {
193
+ "content": "<image_save_to>",
194
+ "lstrip": false,
195
+ "normalized": false,
196
+ "rstrip": false,
197
+ "single_word": false
198
+ },
199
+ {
200
+ "content": "</image_save_to>",
201
+ "lstrip": false,
202
+ "normalized": false,
203
+ "rstrip": false,
204
+ "single_word": false
205
+ },
206
+ {
207
+ "content": "<|audio_start|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false
212
+ },
213
+ {
214
+ "content": "<|audio|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false
219
+ },
220
+ {
221
+ "content": "<|audio_end|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false
226
+ },
227
+ {
228
+ "content": "<|spk_bos|>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false
233
+ },
234
+ {
235
+ "content": "<|spk|>",
236
+ "lstrip": false,
237
+ "normalized": false,
238
+ "rstrip": false,
239
+ "single_word": false
240
+ },
241
+ {
242
+ "content": "<|spk_eos|>",
243
+ "lstrip": false,
244
+ "normalized": false,
245
+ "rstrip": false,
246
+ "single_word": false
247
+ },
248
+ {
249
+ "content": "<|tts_bos|>",
250
+ "lstrip": false,
251
+ "normalized": false,
252
+ "rstrip": false,
253
+ "single_word": false
254
+ },
255
+ {
256
+ "content": "<|tts_eos|>",
257
+ "lstrip": false,
258
+ "normalized": false,
259
+ "rstrip": false,
260
+ "single_word": false
261
+ },
262
+ {
263
+ "content": "<|listen|>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false
268
+ },
269
+ {
270
+ "content": "<|speak|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false
275
+ },
276
+ {
277
+ "content": "<|interrupt|>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false
282
+ },
283
+ {
284
+ "content": "<|vad_start|>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false
289
+ },
290
+ {
291
+ "content": "<|vad_end|>",
292
+ "lstrip": false,
293
+ "normalized": false,
294
+ "rstrip": false,
295
+ "single_word": false
296
+ },
297
+ {
298
+ "content": "<|emotion_start|>",
299
+ "lstrip": false,
300
+ "normalized": false,
301
+ "rstrip": false,
302
+ "single_word": false
303
+ },
304
+ {
305
+ "content": "<|emotion_end|>",
306
+ "lstrip": false,
307
+ "normalized": false,
308
+ "rstrip": false,
309
+ "single_word": false
310
+ },
311
+ {
312
+ "content": "<|speed_start|>",
313
+ "lstrip": false,
314
+ "normalized": false,
315
+ "rstrip": false,
316
+ "single_word": false
317
+ },
318
+ {
319
+ "content": "<|speed_end|>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false
324
+ },
325
+ {
326
+ "content": "<|pitch_start|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false
331
+ },
332
+ {
333
+ "content": "<|pitch_end|>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false
338
+ },
339
+ {
340
+ "content": "<|turn_bos|>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false
345
+ },
346
+ {
347
+ "content": "<|turn_eos|>",
348
+ "lstrip": false,
349
+ "normalized": false,
350
+ "rstrip": false,
351
+ "single_word": false
352
+ },
353
+ {
354
+ "content": "<|chunk_eos|>",
355
+ "lstrip": false,
356
+ "normalized": false,
357
+ "rstrip": false,
358
+ "single_word": false
359
+ },
360
+ {
361
+ "content": "<|chunk_bos|>",
362
+ "lstrip": false,
363
+ "normalized": false,
364
+ "rstrip": false,
365
+ "single_word": false
366
+ },
367
+ {
368
+ "content": "<|chunk_tts_bos|>",
369
+ "lstrip": false,
370
+ "normalized": false,
371
+ "rstrip": false,
372
+ "single_word": false
373
+ },
374
+ {
375
+ "content": "<|chunk_tts_eos|>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false
380
+ },
381
+ {
382
+ "content": "<|tts_pad|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false
387
+ },
388
+ {
389
+ "content": "<|timbre_7|>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false
394
+ },
395
+ {
396
+ "content": "<|timbre_8|>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false
401
+ },
402
+ {
403
+ "content": "<|timbre_9|>",
404
+ "lstrip": false,
405
+ "normalized": false,
406
+ "rstrip": false,
407
+ "single_word": false
408
+ },
409
+ {
410
+ "content": "<|timbre_10|>",
411
+ "lstrip": false,
412
+ "normalized": false,
413
+ "rstrip": false,
414
+ "single_word": false
415
+ },
416
+ {
417
+ "content": "<|timbre_11|>",
418
+ "lstrip": false,
419
+ "normalized": false,
420
+ "rstrip": false,
421
+ "single_word": false
422
+ },
423
+ {
424
+ "content": "<|timbre_12|>",
425
+ "lstrip": false,
426
+ "normalized": false,
427
+ "rstrip": false,
428
+ "single_word": false
429
+ },
430
+ {
431
+ "content": "<|timbre_13|>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false
436
+ },
437
+ {
438
+ "content": "<|timbre_14|>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false
443
+ },
444
+ {
445
+ "content": "<|timbre_15|>",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false
450
+ },
451
+ {
452
+ "content": "<|timbre_16|>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false
457
+ },
458
+ {
459
+ "content": "<|timbre_17|>",
460
+ "lstrip": false,
461
+ "normalized": false,
462
+ "rstrip": false,
463
+ "single_word": false
464
+ },
465
+ {
466
+ "content": "<|timbre_18|>",
467
+ "lstrip": false,
468
+ "normalized": false,
469
+ "rstrip": false,
470
+ "single_word": false
471
+ },
472
+ {
473
+ "content": "<|timbre_19|>",
474
+ "lstrip": false,
475
+ "normalized": false,
476
+ "rstrip": false,
477
+ "single_word": false
478
+ },
479
+ {
480
+ "content": "<|timbre_20|>",
481
+ "lstrip": false,
482
+ "normalized": false,
483
+ "rstrip": false,
484
+ "single_word": false
485
+ },
486
+ {
487
+ "content": "<|timbre_21|>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false
492
+ },
493
+ {
494
+ "content": "<|timbre_22|>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false
499
+ },
500
+ {
501
+ "content": "<|timbre_23|>",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false
506
+ },
507
+ {
508
+ "content": "<|timbre_24|>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false
513
+ },
514
+ {
515
+ "content": "<|timbre_25|>",
516
+ "lstrip": false,
517
+ "normalized": false,
518
+ "rstrip": false,
519
+ "single_word": false
520
+ },
521
+ {
522
+ "content": "<|timbre_26|>",
523
+ "lstrip": false,
524
+ "normalized": false,
525
+ "rstrip": false,
526
+ "single_word": false
527
+ },
528
+ {
529
+ "content": "<|timbre_27|>",
530
+ "lstrip": false,
531
+ "normalized": false,
532
+ "rstrip": false,
533
+ "single_word": false
534
+ },
535
+ {
536
+ "content": "<|timbre_28|>",
537
+ "lstrip": false,
538
+ "normalized": false,
539
+ "rstrip": false,
540
+ "single_word": false
541
+ },
542
+ {
543
+ "content": "<|timbre_29|>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false
548
+ },
549
+ {
550
+ "content": "<|timbre_30|>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false
555
+ },
556
+ {
557
+ "content": "<|timbre_31|>",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false
562
+ }
563
+ ],
564
+ "bos_token": "<|im_start|>",
565
+ "eos_token": {
566
+ "content": "<|im_end|>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false
571
+ },
572
+ "pad_token": {
573
+ "content": "<|endoftext|>",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false
578
+ },
579
+ "unk_token": "<unk>"
580
+ }
tokenization_minicpmo_fast.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright 2026 The OpenBMB Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from typing import List
18
+
19
+ from transformers import Qwen2TokenizerFast
20
+
21
+
22
class MiniCPMOTokenizerFast(Qwen2TokenizerFast):
    """Fast tokenizer for MiniCPM-O.

    Extends :class:`~transformers.Qwen2TokenizerFast` with the multimodal
    marker tokens used by the model (image/region markers and audio/TTS
    markers) and convenience properties that resolve each marker to its
    vocabulary id.
    """

    def __init__(self, **kwargs):
        # Pull out our custom kwarg before the base class sees it; the
        # base tokenizer would reject unknown keyword arguments.
        self._bad_token_ids = kwargs.pop("bad_token_ids", [])

        super().__init__(**kwargs)

        # Image / region marker tokens.
        self.im_start = "<image>"
        self.im_end = "</image>"
        self.ref_start = "<ref>"
        self.ref_end = "</ref>"
        self.box_start = "<box>"
        self.box_end = "</box>"
        self.quad_start = "<quad>"
        self.quad_end = "</quad>"
        self.slice_start = "<slice>"
        self.slice_end = "</slice>"
        self.im_id_start = "<image_id>"
        self.im_id_end = "</image_id>"

        # Audio / speech marker tokens.
        self.audio_start = "<|audio_start|>"
        self.audio_end = "<|audio_end|>"
        self.spk_start = "<|spk_bos|>"
        self.spk_end = "<|spk_eos|>"
        self.tts_start = "<|tts_bos|>"
        self.tts_end = "<|tts_eos|>"

    def _id_of(self, token: str) -> int:
        """Resolve a marker token string to its vocabulary id.

        Looked up lazily on each access (not cached) so the result stays
        correct if tokens are added to the vocabulary after construction.
        """
        return self.convert_tokens_to_ids(token)

    @property
    def eos_id(self):
        """Alias for ``eos_token_id``."""
        return self.eos_token_id

    @property
    def bos_id(self):
        """Alias for ``bos_token_id``."""
        return self.bos_token_id

    @property
    def unk_id(self):
        """Alias for ``unk_token_id``."""
        return self.unk_token_id

    @property
    def im_start_id(self):
        return self._id_of(self.im_start)

    @property
    def im_end_id(self):
        return self._id_of(self.im_end)

    @property
    def slice_start_id(self):
        return self._id_of(self.slice_start)

    @property
    def slice_end_id(self):
        return self._id_of(self.slice_end)

    @property
    def im_id_start_id(self):
        return self._id_of(self.im_id_start)

    @property
    def im_id_end_id(self):
        return self._id_of(self.im_id_end)

    @property
    def audio_start_id(self):
        return self._id_of(self.audio_start)

    @property
    def audio_end_id(self):
        return self._id_of(self.audio_end)

    @property
    def spk_start_id(self):
        return self._id_of(self.spk_start)

    @property
    def spk_end_id(self):
        return self._id_of(self.spk_end)

    @property
    def tts_start_id(self):
        return self._id_of(self.tts_start)

    @property
    def tts_end_id(self):
        return self._id_of(self.tts_end)

    @staticmethod
    def escape(text: str) -> str:
        """No-op escape hook kept for API compatibility."""
        return text

    @staticmethod
    def unescape(text: str) -> str:
        """No-op unescape hook kept for API compatibility."""
        return text

    @property
    def bad_token_ids(self) -> List[int]:
        """Token ids supplied at construction via ``bad_token_ids``."""
        return self._bad_token_ids