YXXPP airlsyn committed on
Commit
a6edb9e
·
0 Parent(s):

Duplicate from openbmb/MiniCPM-o-4_5

Browse files

Co-authored-by: airlsyn <airlsyn@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +58 -0
  2. .gitignore +1 -0
  3. README.md +2149 -0
  4. added_tokens.json +107 -0
  5. assets/HT_ref_audio.wav +3 -0
  6. assets/Skiing.mp4 +3 -0
  7. assets/Trump_WEF_2018_10s.mp3 +3 -0
  8. assets/audio_cases/assistant_ref.mp4 +3 -0
  9. assets/audio_cases/assistant_response.mp4 +3 -0
  10. assets/audio_cases/elon_musk__000_assistant_audio.wav +3 -0
  11. assets/audio_cases/elon_musk__system_ref_audio.wav +3 -0
  12. assets/audio_cases/elon_musk_ref.mp4 +3 -0
  13. assets/audio_cases/elon_musk_response.mp4 +3 -0
  14. assets/audio_cases/hermione__000_assistant_audio.wav +3 -0
  15. assets/audio_cases/hermione__system_ref_audio.wav +3 -0
  16. assets/audio_cases/minicpm_assistant__000_assistant_audio.wav +3 -0
  17. assets/audio_cases/minicpm_assistant__system_ref_audio.wav +3 -0
  18. assets/audio_cases/paimon__000_assistant_audio.wav +3 -0
  19. assets/audio_cases/paimon__system_ref_audio.wav +3 -0
  20. assets/audio_cases/readme.txt +1 -0
  21. assets/bajie.wav +3 -0
  22. assets/fossil.png +3 -0
  23. assets/haimianbaobao.wav +3 -0
  24. assets/highway.png +3 -0
  25. assets/nezha.wav +3 -0
  26. assets/omni_duplex1.mp4 +3 -0
  27. assets/omni_duplex2.mp4 +3 -0
  28. assets/sunwukong.wav +3 -0
  29. assets/system_ref_audio.wav +3 -0
  30. assets/system_ref_audio_2.wav +3 -0
  31. assets/token2wav/campplus.onnx +3 -0
  32. assets/token2wav/flow.pt +3 -0
  33. assets/token2wav/flow.yaml +34 -0
  34. assets/token2wav/hift.pt +3 -0
  35. assets/token2wav/speech_tokenizer_v2_25hz.onnx +3 -0
  36. config.json +285 -0
  37. configuration_minicpmo.py +260 -0
  38. generation_config.json +12 -0
  39. merges.txt +0 -0
  40. model-00001-of-00004.safetensors +3 -0
  41. model-00002-of-00004.safetensors +3 -0
  42. model-00003-of-00004.safetensors +3 -0
  43. model-00004-of-00004.safetensors +3 -0
  44. model.safetensors.index.json +0 -0
  45. modeling_minicpmo.py +0 -0
  46. modeling_navit_siglip.py +981 -0
  47. preprocessor_config.json +35 -0
  48. processing_minicpmo.py +1665 -0
  49. special_tokens_map.json +580 -0
  50. tokenization_minicpmo_fast.py +120 -0
.gitattributes ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/HT_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
37
+ assets/Skiing.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ assets/bajie.wav filter=lfs diff=lfs merge=lfs -text
39
+ assets/fossil.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/haimianbaobao.wav filter=lfs diff=lfs merge=lfs -text
41
+ assets/highway.png filter=lfs diff=lfs merge=lfs -text
42
+ assets/nezha.wav filter=lfs diff=lfs merge=lfs -text
43
+ assets/omni_duplex1.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ assets/omni_duplex2.mp4 filter=lfs diff=lfs merge=lfs -text
45
+ assets/sunwukong.wav filter=lfs diff=lfs merge=lfs -text
46
+ assets/system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
47
+ assets/system_ref_audio_2.wav filter=lfs diff=lfs merge=lfs -text
48
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ assets/Trump_WEF_2018_10s.mp3 filter=lfs diff=lfs merge=lfs -text
50
+ assets/audio_cases/elon_musk__000_assistant_audio.wav filter=lfs diff=lfs merge=lfs -text
51
+ assets/audio_cases/elon_musk__system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
52
+ assets/audio_cases/hermione__000_assistant_audio.wav filter=lfs diff=lfs merge=lfs -text
53
+ assets/audio_cases/hermione__system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
54
+ assets/audio_cases/minicpm_assistant__000_assistant_audio.wav filter=lfs diff=lfs merge=lfs -text
55
+ assets/audio_cases/minicpm_assistant__system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
56
+ assets/audio_cases/paimon__000_assistant_audio.wav filter=lfs diff=lfs merge=lfs -text
57
+ assets/audio_cases/paimon__system_ref_audio.wav filter=lfs diff=lfs merge=lfs -text
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .DS_Store
README.md ADDED
@@ -0,0 +1,2149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: any-to-any
4
+ library_name: transformers
5
+ tags:
6
+ - minicpm-o
7
+ - minicpm-v
8
+ - multimodal
9
+ - full-duplex
10
+ ---
11
+
12
+ A Gemini 2.5 Flash Level MLLM for Vision, Speech, and Full-Duplex Multimodal Live Streaming on Your Phone
13
+
14
+ [GitHub](https://github.com/OpenBMB/MiniCPM-o) | [CookBook](https://github.com/OpenSQZ/MiniCPM-V-CookBook) | [Omni-modal Demo](https://openbmb.github.io/MiniCPM-o-Demo/) | [Vision-Language Demo](http://211.93.21.133:18121/) </br>
15
+ [WeChat](https://github.com/OpenBMB/MiniCPM-o/blob/main/docs/wechat.md) | [Discord](https://discord.gg/N2RnxGdJ) | CaseBook([Audio](https://openbmb.github.io/minicpm-o-4_5/), [Omni Full-Duplex](https://openbmb.github.io/minicpm-o-4_5-omni/))
16
+
17
+
18
+ ## News
19
+
20
+ > [!NOTE]
21
+ > [2026.02.06] 🥳 🥳 🥳 We open-sourced a realtime web demo deployable on your own devices like Mac or GPU. [Try it now](#deploy-a-realtime-web-demo-on-your-own-device)!
22
+
23
+
24
+ ## MiniCPM-o 4.5
25
+
26
+ **MiniCPM-o 4.5** is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip2, Whisper-medium, CosyVoice2, and Qwen3-8B with a total of 9B parameters. It exhibits a significant performance improvement, and introduces new features for full-duplex multimodal live streaming. Notable features of MiniCPM-o 4.5 include:
27
+
28
+ - 🔥 **Leading Visual Capability.**
29
+ MiniCPM-o 4.5 achieves an average score of 77.6 on OpenCompass, a comprehensive evaluation of 8 popular benchmarks. **With only 9B parameters, it surpasses widely used proprietary models like GPT-4o, Gemini 2.0 Pro, and approaches Gemini 2.5 Flash** for vision-language capabilities. It supports instruct and thinking modes in a single model, better covering efficiency and performance trade-offs in different user scenarios.
30
+
31
+ - 🎙 **Strong Speech Capability.**
32
+ MiniCPM-o 4.5 supports **bilingual real-time speech conversation with configurable voices** in English and Chinese. It features **more natural, expressive and stable speech conversation**. The model also allows for fun features such as **voice cloning and role play via a simple reference audio clip**, where the cloning performance surpasses strong TTS tools such as CosyVoice2.
33
+
34
+ - 🎬 **New Full-Duplex and Proactive Multimodal Live Streaming Capability.**
35
+ As a new feature, MiniCPM-o 4.5 can process real-time, continuous video and audio input streams simultaneously while generating concurrent text and speech output streams in an end-to-end fashion, without mutual blocking. This **allows MiniCPM-o 4.5 to see, listen, and speak simultaneously**, creating a fluid, real-time omnimodal conversation experience. Beyond reactive responses, the model can also perform **proactive interaction**, such as initiating reminders or comments based on its continuous understanding of the live scene.
36
+
37
+ - 💪 **Strong OCR Capability, Efficiency and Others.**
38
+ Advancing popular visual capabilities from MiniCPM-V series, MiniCPM-o 4.5 can process **high-resolution images** (up to 1.8 million pixels) and **high-FPS videos** (up to 10fps) in any aspect ratio efficiently. It achieves **state-of-the-art performance for end-to-end English document parsing** on OmniDocBench, outperforming proprietary models such as Gemini-3 Flash and GPT-5, and specialized tools such as DeepSeek-OCR 2. It also features **trustworthy behaviors**, matching Gemini 2.5 Flash on MMHal-Bench, and supports **multilingual capabilities** on more than 30 languages.
39
+
40
+ - 💫 **Easy Usage.**
41
+ MiniCPM-o 4.5 can be easily used in various ways: **Basic usage, recommended for 100% precision:** PyTorch inference with Nvidia GPU. **Other end-side adaptation** includes (1) llama.cpp and Ollama support for efficient CPU inference on local devices, (2) int4 and GGUF format quantized models in 16 sizes, (3) vLLM and SGLang support for high-throughput and memory-efficient inference, (4) FlagOS support for the unified multi-chip backend plugin. **We also open-sourced web demos** which **enable the full-duplex multimodal live streaming experience on local devices** such as GPUs and PCs (e.g., a MacBook).
42
+
43
+ **Model Architecture.**
44
+ - **End-to-end Omni-modal Architecture.** The modality encoders/decoders and LLM are densely connected via hidden states in an end-to-end fashion. This enables better information flow and control, and also facilitates full exploitation of rich multimodal knowledge during training.
45
+ - **Full-Duplex Omni-modal Live Streaming Mechanism.** (1) We turn the offline modality encoder/decoders into online and full-duplex ones for streaming inputs/outputs. The speech token decoder models text and speech tokens in an interleaved fashion to support full-duplex speech generation (i.e., sync timely with new input). This also facilitates more stable long speech generation (e.g., > 1min).
46
+ (2) **We sync all the input and output streams on timeline in milliseconds**, which are jointly modeled by a time-division multiplexing (TDM) mechanism for omni-modality streaming processing in the LLM backbone. It divides parallel omni-modality streams into sequential info groups within small periodic time slices.
47
+ - **Proactive Interaction Mechanism.** The LLM continuously monitors the input video and audio streams, and decides at a frequency of 1Hz whether to speak or not. This high decision-making frequency, together with the full-duplex nature, is crucial to enable the proactive interaction capability.
48
+ - **Configurable Speech Modeling Design.** We inherit the multimodal system prompt design of MiniCPM-o 2.6, which includes a traditional text system prompt, and a new audio system prompt to determine the assistant voice. This enables cloning new voices and role play at inference time for speech conversation.
49
+
50
+
51
+
52
+ <div align="center">
53
+ <img src="https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpm-o-45-framework.png" width=100%>
54
+ </div>
55
+
56
+
57
+
58
+
59
+ ### Evaluation <!-- omit in toc -->
60
+
61
+
62
+ <div align="center">
63
+ <img src="https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/radar_minicpmo4.5.png" width=80%>
64
+ </div>
65
+
66
+
67
+ <div align="center">
68
+ <img src="https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/minicpm_o_45_main_exp_table.png" width=90%>
69
+ </div>
70
+ <strong>Note</strong>: Scores marked with ∗ are from our evaluation; others are cited from referenced reports. n/a indicates that the model does not support the corresponding modality. All results are reported in instruct mode/variant.
71
+
72
+ &emsp;
73
+ <br>
74
+
75
+ <details>
76
+ <summary>Click to view visual understanding results.</summary>
77
+
78
+ **Image Understanding (Instruct)**
79
+ <div align="center">
80
+ <table style="margin: 0px auto;">
81
+ <tr>
82
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
83
+ <th nowrap="nowrap"><b>OpenCompass</b></th>
84
+ <th nowrap="nowrap"><b>MMBench EN v1.1</b></th>
85
+ <th nowrap="nowrap"><b>MMBench CN v1.1</b></th>
86
+ <th nowrap="nowrap"><b>MathVista</b></th>
87
+ <th nowrap="nowrap"><b>MMVet</b></th>
88
+ <th nowrap="nowrap"><b>MMMU</b></th>
89
+ <th nowrap="nowrap"><b>MMStar</b></th>
90
+ <th nowrap="nowrap"><b>HallusionBench</b></th>
91
+ <th nowrap="nowrap"><b>AI2D</b></th>
92
+ <th nowrap="nowrap"><b>OCRBench</b></th>
93
+ <th nowrap="nowrap"><b>TextVQA_VAL</b></th>
94
+ <th nowrap="nowrap"><b>DocVQA_VAL</b></th>
95
+ <th nowrap="nowrap"><b>MMT-Bench_VAL</b></th>
96
+ <th nowrap="nowrap"><b>MM-IFEval</b></th>
97
+ <th nowrap="nowrap"><b>Mantis-Eval</b></th>
98
+ <th nowrap="nowrap"><b>MuirBench</b></th>
99
+ <th nowrap="nowrap"><b>MMSI-Bench</b></th>
100
+ <th nowrap="nowrap"><b>MMHal-Score</b></th>
101
+ <th nowrap="nowrap"><b>MMHal-Hallrate↓</b></th>
102
+ </tr>
103
+ <tr>
104
+ <td nowrap="nowrap" align="left">Gemini2.5-Flash-Nonthinking</td>
105
+ <td align="center"><b>78.5</b></td>
106
+ <td align="center"><ins>86.6</ins></td>
107
+ <td align="center"><ins>86.0</ins></td>
108
+ <td align="center">75.3</td>
109
+ <td align="center"><ins>81.4</ins><sup>*</sup></td>
110
+ <td align="center"><b>76.3</b></td>
111
+ <td align="center"><b>75.8</b></td>
112
+ <td align="center">59.1</td>
113
+ <td align="center"><b>87.7</b></td>
114
+ <td align="center">864</td>
115
+ <td align="center">74.3<sup>*</sup></td>
116
+ <td align="center">93.0</td>
117
+ <td align="center"><ins>70.0</ins><sup>*</sup></td>
118
+ <td align="center"><b>75.8<sup>*</sup></b></td>
119
+ <td align="center">72.8<sup>*</sup></td>
120
+ <td align="center"><b>74.5<sup>*</sup></b></td>
121
+ <td align="center">12.1<sup>*</sup></td>
122
+ <td align="center"><ins>4.6</ins><sup>*</sup></td>
123
+ <td align="center"><b>23.9<sup>*</sup></b></td>
124
+ </tr>
125
+ <tr>
126
+ <td nowrap="nowrap" align="left">Gemini2.0-Pro</td>
127
+ <td align="center">73.3</td>
128
+ <td align="center">83.0</td>
129
+ <td align="center">83.0</td>
130
+ <td align="center">71.3</td>
131
+ <td align="center">70.4</td>
132
+ <td align="center">72.6</td>
133
+ <td align="center">68.5</td>
134
+ <td align="center">49.8</td>
135
+ <td align="center">84.8</td>
136
+ <td align="center">863</td>
137
+ <td align="center">-</td>
138
+ <td align="center">-</td>
139
+ <td align="center">-</td>
140
+ <td align="center">-</td>
141
+ <td align="center">-</td>
142
+ <td align="center">-</td>
143
+ <td align="center">-</td>
144
+ <td align="center">-</td>
145
+ <td align="center">-</td>
146
+ </tr>
147
+ <tr>
148
+ <td nowrap="nowrap" align="left">GPT-4o</td>
149
+ <td align="center">75.4</td>
150
+ <td align="center">86.0</td>
151
+ <td align="center"><ins>86.0</ins></td>
152
+ <td align="center">71.6</td>
153
+ <td align="center">76.9</td>
154
+ <td align="center">72.9</td>
155
+ <td align="center">70.2</td>
156
+ <td align="center">57.0</td>
157
+ <td align="center">86.3</td>
158
+ <td align="center">822</td>
159
+ <td align="center">77.4</td>
160
+ <td align="center">93.0</td>
161
+ <td align="center">66.7<sup>*</sup></td>
162
+ <td align="center">64.6</td>
163
+ <td align="center">70.1<sup>*</sup></td>
164
+ <td align="center">70.5<sup>*</sup></td>
165
+ <td align="center">8.1<sup>*</sup></td>
166
+ <td align="center">4.2<sup>*</sup></td>
167
+ <td align="center">25.0<sup>*</sup></td>
168
+ </tr>
169
+ <tr>
170
+ <td nowrap="nowrap" align="left">InternVL-3.5-8B</td>
171
+ <td align="center">75.8</td>
172
+ <td align="center">79.5</td>
173
+ <td align="center">80.0<sup>*</sup></td>
174
+ <td align="center"><ins>78.4</ins></td>
175
+ <td align="center"><b>83.1</b></td>
176
+ <td align="center"><ins>73.4</ins></td>
177
+ <td align="center">69.3</td>
178
+ <td align="center">54.5</td>
179
+ <td align="center">84.0</td>
180
+ <td align="center">840</td>
181
+ <td align="center">78.2</td>
182
+ <td align="center">92.3</td>
183
+ <td align="center">66.7</td>
184
+ <td align="center">56.3<sup>*</sup></td>
185
+ <td align="center">70.5</td>
186
+ <td align="center">55.8</td>
187
+ <td align="center">-</td>
188
+ <td align="center">3.8<sup>*</sup></td>
189
+ <td align="center">34.7<sup>*</sup></td>
190
+ </tr>
191
+ <tr>
192
+ <td nowrap="nowrap" align="left">Qwen3-VL-8B-Instruct</td>
193
+ <td align="center">76.5</td>
194
+ <td align="center">84.5</td>
195
+ <td align="center">84.7</td>
196
+ <td align="center">77.2</td>
197
+ <td align="center">73.7<sup>*</sup></td>
198
+ <td align="center">69.6</td>
199
+ <td align="center">70.9</td>
200
+ <td align="center"><ins>61.1</ins></td>
201
+ <td align="center">85.7</td>
202
+ <td align="center"><b>896</b></td>
203
+ <td align="center">82.9<sup>*</sup></td>
204
+ <td align="center"><b>96.1</b></td>
205
+ <td align="center">60.9<sup>*</sup></td>
206
+ <td align="center">59.4<sup>*</sup></td>
207
+ <td align="center">74.2<sup>*</sup></td>
208
+ <td align="center">64.4</td>
209
+ <td align="center">11.3<sup>*</sup></td>
210
+ <td align="center"><b>4.7<sup>*</sup></b></td>
211
+ <td align="center">29.9<sup>*</sup></td>
212
+ </tr>
213
+ <tr>
214
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
215
+ <td align="center">75.7</td>
216
+ <td align="center">84.9<sup>*</sup></td>
217
+ <td align="center">84.1<sup>*</sup></td>
218
+ <td align="center">75.9</td>
219
+ <td align="center">74.8<sup>*</sup></td>
220
+ <td align="center">69.1</td>
221
+ <td align="center">68.5</td>
222
+ <td align="center">59.7</td>
223
+ <td align="center">85.2</td>
224
+ <td align="center"><ins>880</ins><sup>*</sup></td>
225
+ <td align="center"><b>84.1<sup>*</sup></b></td>
226
+ <td align="center"><ins>95.4</ins><sup>*</sup></td>
227
+ <td align="center"><b>70.4<sup>*</sup></b></td>
228
+ <td align="center">65.7<sup>*</sup></td>
229
+ <td align="center"><ins>78.3</ins><sup>*</sup></td>
230
+ <td align="center">61.9<sup>*</sup></td>
231
+ <td align="center"><ins>14.2</ins><sup>*</sup></td>
232
+ <td align="center"><ins>4.6</ins><sup>*</sup></td>
233
+ <td align="center">31.6<sup>*</sup></td>
234
+ </tr>
235
+ <tr>
236
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
237
+ <td align="center"><ins>77.6</ins></td>
238
+ <td align="center"><b>87.6</b></td>
239
+ <td align="center"><b>87.2</b></td>
240
+ <td align="center"><b>80.1</b></td>
241
+ <td align="center">74.4</td>
242
+ <td align="center">67.6</td>
243
+ <td align="center"><ins>73.1</ins></td>
244
+ <td align="center"><b>63.2</b></td>
245
+ <td align="center"><ins>87.6</ins></td>
246
+ <td align="center">876</td>
247
+ <td align="center"><ins>83.8</ins></td>
248
+ <td align="center">94.7</td>
249
+ <td align="center">69.7</td>
250
+ <td align="center"><ins>66.3</ins></td>
251
+ <td align="center"><b>79.7</b></td>
252
+ <td align="center"><ins>72.0</ins></td>
253
+ <td align="center"><b>16.6</b></td>
254
+ <td align="center"><b>4.7</b></td>
255
+ <td align="center"><ins>24.3</ins></td>
256
+ </tr>
257
+ </table>
258
+ </div>
259
+
260
+ **Image Understanding (Thinking)**
261
+ <div align="center">
262
+ <table style="margin: 0px auto;">
263
+ <tr>
264
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
265
+ <th nowrap="nowrap"><b>OpenCompass</b></th>
266
+ <th nowrap="nowrap"><b>MMBench EN v1.1</b></th>
267
+ <th nowrap="nowrap"><b>MMBench CN v1.1</b></th>
268
+ <th nowrap="nowrap"><b>MathVista</b></th>
269
+ <th nowrap="nowrap"><b>MMVet</b></th>
270
+ <th nowrap="nowrap"><b>MMMU</b></th>
271
+ <th nowrap="nowrap"><b>MMStar</b></th>
272
+ <th nowrap="nowrap"><b>HallusionBench</b></th>
273
+ <th nowrap="nowrap"><b>AI2D</b></th>
274
+ <th nowrap="nowrap"><b>OCRBench</b></th>
275
+ <th nowrap="nowrap"><b>TextVQA_VAL</b></th>
276
+ <th nowrap="nowrap"><b>DocVQA_VAL</b></th>
277
+ <th nowrap="nowrap"><b>MMT-Bench_VAL</b></th>
278
+ <th nowrap="nowrap"><b>MM-IFEval</b></th>
279
+ </tr>
280
+ <tr>
281
+ <td nowrap="nowrap" align="left">Gemini2.5-Flash-Thinking</td>
282
+ <td align="center"><b>79.9</b></td>
283
+ <td align="center">87.1</td>
284
+ <td align="center">87.3</td>
285
+ <td align="center">79.4</td>
286
+ <td align="center"><b>81.2<sup>*</sup></b></td>
287
+ <td align="center"><ins>77.7</ins></td>
288
+ <td align="center"><b>76.5</b></td>
289
+ <td align="center">63.5</td>
290
+ <td align="center"><ins>88.7</ins></td>
291
+ <td align="center">853</td>
292
+ <td align="center">73.8<sup>*</sup></td>
293
+ <td align="center">92.8</td>
294
+ <td align="center">70.7<sup>*</sup></td>
295
+ <td align="center"><ins>75.7</ins><sup>*</sup></td>
296
+ </tr>
297
+ <tr>
298
+ <td nowrap="nowrap" align="left">GPT-5</td>
299
+ <td align="center"><ins>79.7</ins></td>
300
+ <td align="center">85.5<sup>*</sup></td>
301
+ <td align="center">85.6<sup>*</sup></td>
302
+ <td align="center"><b>81.9</b></td>
303
+ <td align="center"><ins>77.6</ins></td>
304
+ <td align="center"><b>81.8</b></td>
305
+ <td align="center"><ins>75.7</ins></td>
306
+ <td align="center"><ins>65.2</ins></td>
307
+ <td align="center"><b>89.5</b></td>
308
+ <td align="center">807</td>
309
+ <td align="center">77.8<sup>*</sup></td>
310
+ <td align="center">91.3<sup>*</sup></td>
311
+ <td align="center"><b>72.7<sup>*</sup></b></td>
312
+ <td align="center"><b>83.1<sup>*</sup></b></td>
313
+ </tr>
314
+ <tr>
315
+ <td nowrap="nowrap" align="left">Qwen3-VL-8B-Thinking</td>
316
+ <td align="center">77.3</td>
317
+ <td align="center">85.3</td>
318
+ <td align="center">85.5</td>
319
+ <td align="center"><ins>81.4</ins></td>
320
+ <td align="center">69.8<sup>*</sup></td>
321
+ <td align="center">74.1</td>
322
+ <td align="center">75.3</td>
323
+ <td align="center"><b>65.4</b></td>
324
+ <td align="center">84.9</td>
325
+ <td align="center">819</td>
326
+ <td align="center">77.8<sup>*</sup></td>
327
+ <td align="center"><b>95.3</b></td>
328
+ <td align="center">68.1<sup>*</sup></td>
329
+ <td align="center">73.5<sup>*</sup></td>
330
+ </tr>
331
+ <tr>
332
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Thinking</td>
333
+ <td align="center">78.5</td>
334
+ <td align="center"><ins>88.2</ins><sup>*</sup></td>
335
+ <td align="center"><b>87.7<sup>*</sup></b></td>
336
+ <td align="center">80.0</td>
337
+ <td align="center">74.8<sup>*</sup></td>
338
+ <td align="center">75.6</td>
339
+ <td align="center">74.9</td>
340
+ <td align="center">62.8</td>
341
+ <td align="center">86.1</td>
342
+ <td align="center"><ins>859</ins><sup>*</sup></td>
343
+ <td align="center"><b>80.8<sup>*</sup></b></td>
344
+ <td align="center"><ins>94.2</ins><sup>*</sup></td>
345
+ <td align="center"><ins>70.9</ins><sup>*</sup></td>
346
+ <td align="center">69.9<sup>*</sup></td>
347
+ </tr>
348
+ <tr>
349
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Thinking</td>
350
+ <td align="center">78.2</td>
351
+ <td align="center"><b>89.0</b></td>
352
+ <td align="center"><ins>87.6</ins></td>
353
+ <td align="center">81.0</td>
354
+ <td align="center">73.6</td>
355
+ <td align="center">70.2</td>
356
+ <td align="center">73.6</td>
357
+ <td align="center">62.6</td>
358
+ <td align="center">88.5</td>
359
+ <td align="center"><b>879</b></td>
360
+ <td align="center"><ins>79.8</ins></td>
361
+ <td align="center">92.3</td>
362
+ <td align="center">69.7</td>
363
+ <td align="center">68.2</td>
364
+ </tr>
365
+ </table>
366
+ </div>
367
+
368
+ **Video Understanding**
369
+ <div align="center">
370
+ <table style="margin: 0px auto;">
371
+ <tr>
372
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
373
+ <th nowrap="nowrap"><b>Video-MME<br>(w/o subs)</b></th>
374
+ <th nowrap="nowrap"><b>LVBench</b></th>
375
+ <th nowrap="nowrap"><b>MLVU<br>(M-Avg)</b></th>
376
+ <th nowrap="nowrap"><b>LongVideoBench<br>(val)</b></th>
377
+ <th nowrap="nowrap"><b>MotionBench</b></th>
378
+ </tr>
379
+ <tr>
380
+ <td nowrap="nowrap" align="left">Gemini2.5-Flash-Nonthinking</td>
381
+ <td align="center"><b>75.6</b></td>
382
+ <td align="center"><b>62.2</b></td>
383
+ <td align="center"><b>77.8</b></td>
384
+ <td align="center">-</td>
385
+ <td align="center">-</td>
386
+ </tr>
387
+ <tr>
388
+ <td nowrap="nowrap" align="left">InternVL-3.5-8B</td>
389
+ <td align="center">66.0</td>
390
+ <td align="center">-</td>
391
+ <td align="center">70.2</td>
392
+ <td align="center">62.1</td>
393
+ <td align="center"><b>62.3<sup>*</sup></b></td>
394
+ </tr>
395
+ <tr>
396
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
397
+ <td align="center"><ins>70.5</ins></td>
398
+ <td align="center">50.2</td>
399
+ <td align="center">75.2</td>
400
+ <td align="center"><b>66.9<sup>*</sup></b></td>
401
+ <td align="center"><ins>61.7</ins><sup>*</sup></td>
402
+ </tr>
403
+ <tr>
404
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
405
+ <td align="center">70.4</td>
406
+ <td align="center"><ins>50.9</ins></td>
407
+ <td align="center"><ins>76.5</ins></td>
408
+ <td align="center"><ins>66.0</ins></td>
409
+ <td align="center">61.4</td>
410
+ </tr>
411
+ </table>
412
+ </div>
413
+
414
+ </details>
415
+
416
+ <details>
417
+ <summary>Click to view document parsing results.</summary>
418
+
419
+ **OmniDocBench**
420
+ <div align="center">
421
+ <table style="margin: 0px auto;">
422
+ <tr>
423
+ <th nowrap="nowrap" align="left" rowspan="2"><b>Method Type</b></th>
424
+ <th nowrap="nowrap" rowspan="2"><b>Methods</b></th>
425
+ <th nowrap="nowrap" colspan="2"><b>OverallEdit↓</b></th>
426
+ <th nowrap="nowrap" colspan="2"><b>TextEdit↓</b></th>
427
+ <th nowrap="nowrap" colspan="2"><b>FormulaEdit↓</b></th>
428
+ <th nowrap="nowrap" colspan="2"><b>TableTEDS↑</b></th>
429
+ <th nowrap="nowrap" colspan="2"><b>TableEdit↓</b></th>
430
+ <th nowrap="nowrap" colspan="2"><b>Read OrderEdit↓</b></th>
431
+ </tr>
432
+ <tr>
433
+ <th nowrap="nowrap"><b>EN</b></th>
434
+ <th nowrap="nowrap"><b>ZH</b></th>
435
+ <th nowrap="nowrap"><b>EN</b></th>
436
+ <th nowrap="nowrap"><b>ZH</b></th>
437
+ <th nowrap="nowrap"><b>EN</b></th>
438
+ <th nowrap="nowrap"><b>ZH</b></th>
439
+ <th nowrap="nowrap"><b>EN</b></th>
440
+ <th nowrap="nowrap"><b>ZH</b></th>
441
+ <th nowrap="nowrap"><b>EN</b></th>
442
+ <th nowrap="nowrap"><b>ZH</b></th>
443
+ <th nowrap="nowrap"><b>EN</b></th>
444
+ <th nowrap="nowrap"><b>ZH</b></th>
445
+ </tr>
446
+ <tr>
447
+ <td nowrap="nowrap" align="left" rowspan="2">Pipeline</td>
448
+ <td nowrap="nowrap" align="center">MinerU 2.5</td>
449
+ <td align="center">0.117<sup>*</sup></td>
450
+ <td align="center">0.172<sup>*</sup></td>
451
+ <td align="center">0.051<sup>*</sup></td>
452
+ <td align="center">0.08<sup>*</sup></td>
453
+ <td align="center"><ins>0.256</ins><sup>*</sup></td>
454
+ <td align="center">0.455<sup>*</sup></td>
455
+ <td align="center">85.9<sup>*</sup></td>
456
+ <td align="center">89.4<sup>*</sup></td>
457
+ <td align="center">0.115<sup>*</sup></td>
458
+ <td align="center">0.081<sup>*</sup></td>
459
+ <td align="center">0.047<sup>*</sup></td>
460
+ <td align="center">0.072<sup>*</sup></td>
461
+ </tr>
462
+ <tr>
463
+ <td nowrap="nowrap" align="center">PaddleOCR-VL</td>
464
+ <td align="center"><b>0.105</b></td>
465
+ <td align="center"><ins>0.126</ins></td>
466
+ <td align="center"><ins>0.041</ins></td>
467
+ <td align="center"><b>0.062</b></td>
468
+ <td align="center"><b>0.241</b></td>
469
+ <td align="center"><b>0.316</b></td>
470
+ <td align="center">88</td>
471
+ <td align="center"><ins>92.1</ins></td>
472
+ <td align="center"><ins>0.093</ins></td>
473
+ <td align="center"><ins>0.062</ins></td>
474
+ <td align="center">0.045</td>
475
+ <td align="center"><ins>0.063</ins></td>
476
+ </tr>
477
+ <tr>
478
+ <td nowrap="nowrap" align="left"></td>
479
+ <td align="center"></td>
480
+ <td align="center"></td>
481
+ <td align="center"></td>
482
+ <td align="center"></td>
483
+ <td align="center"></td>
484
+ <td align="center"></td>
485
+ <td align="center"></td>
486
+ <td align="center"></td>
487
+ <td align="center"></td>
488
+ <td align="center"></td>
489
+ <td align="center"></td>
490
+ <td align="center"></td>
491
+ <td align="center"></td>
492
+ </tr>
493
+ <tr>
494
+ <td nowrap="nowrap" align="left" rowspan="11">End-to-end Model</td>
495
+ <td nowrap="nowrap" align="center">Qwen2.5-VL-72B</td>
496
+ <td align="center">0.214</td>
497
+ <td align="center">0.261</td>
498
+ <td align="center">0.092</td>
499
+ <td align="center">0.18</td>
500
+ <td align="center">0.315</td>
501
+ <td align="center">0.434</td>
502
+ <td align="center">82.9</td>
503
+ <td align="center">83.9</td>
504
+ <td align="center">0.341</td>
505
+ <td align="center">0.262</td>
506
+ <td align="center">0.106</td>
507
+ <td align="center">0.168</td>
508
+ </tr>
509
+ <tr>
510
+ <td nowrap="nowrap" align="center">GPT 5</td>
511
+ <td align="center">0.218<sup>*</sup></td>
512
+ <td align="center">0.33<sup>*</sup></td>
513
+ <td align="center">0.139<sup>*</sup></td>
514
+ <td align="center">0.344<sup>*</sup></td>
515
+ <td align="center">0.396<sup>*</sup></td>
516
+ <td align="center">0.555<sup>*</sup></td>
517
+ <td align="center">77.55<sup>*</sup></td>
518
+ <td align="center">73.09<sup>*</sup></td>
519
+ <td align="center">0.188<sup>*</sup></td>
520
+ <td align="center">0.196<sup>*</sup></td>
521
+ <td align="center">0.151<sup>*</sup></td>
522
+ <td align="center">0.227<sup>*</sup></td>
523
+ </tr>
524
+ <tr>
525
+ <td nowrap="nowrap" align="center">Gemini2.5-Flash-Nonthinking</td>
526
+ <td align="center">0.214<sup>*</sup></td>
527
+ <td align="center">0.29<sup>*</sup></td>
528
+ <td align="center">0.159<sup>*</sup></td>
529
+ <td align="center">0.273<sup>*</sup></td>
530
+ <td align="center">0.368<sup>*</sup></td>
531
+ <td align="center">0.524<sup>*</sup></td>
532
+ <td align="center">80.9<sup>*</sup></td>
533
+ <td align="center">85.5<sup>*</sup></td>
534
+ <td align="center">0.197<sup>*</sup></td>
535
+ <td align="center">0.167<sup>*</sup></td>
536
+ <td align="center">0.132<sup>*</sup></td>
537
+ <td align="center">0.195<sup>*</sup></td>
538
+ </tr>
539
+ <tr>
540
+ <td nowrap="nowrap" align="center">Gemini-2.5-Pro-Nonthinking</td>
541
+ <td align="center">0.148<sup>*</sup></td>
542
+ <td align="center">0.212<sup>*</sup></td>
543
+ <td align="center">0.055<sup>*</sup></td>
544
+ <td align="center">0.168<sup>*</sup></td>
545
+ <td align="center">0.356<sup>*</sup></td>
546
+ <td align="center">0.439<sup>*</sup></td>
547
+ <td align="center">85.8<sup>*</sup></td>
548
+ <td align="center">86.4<sup>*</sup></td>
549
+ <td align="center">0.13<sup>*</sup></td>
550
+ <td align="center">0.119<sup>*</sup></td>
551
+ <td align="center">0.049<sup>*</sup></td>
552
+ <td align="center">0.121<sup>*</sup></td>
553
+ </tr>
554
+ <tr>
555
+ <td nowrap="nowrap" align="center">Gemini-3 Flash-Nonthinking</td>
556
+ <td align="center">0.155<sup>*</sup></td>
557
+ <td align="center">0.201<sup>*</sup></td>
558
+ <td align="center">0.138<sup>*</sup></td>
559
+ <td align="center">0.255<sup>*</sup></td>
560
+ <td align="center">0.297<sup>*</sup></td>
561
+ <td align="center">0.351<sup>*</sup></td>
562
+ <td align="center">86.4<sup>*</sup></td>
563
+ <td align="center">89.8<sup>*</sup></td>
564
+ <td align="center">0.116<sup>*</sup></td>
565
+ <td align="center">0.1<sup>*</sup></td>
566
+ <td align="center">0.072<sup>*</sup></td>
567
+ <td align="center">0.099<sup>*</sup></td>
568
+ </tr>
569
+ <tr>
570
+ <td nowrap="nowrap" align="center">doubao-1-5-thinking-vision-pro-250428</td>
571
+ <td align="center">0.14</td>
572
+ <td align="center">0.162</td>
573
+ <td align="center">0.043</td>
574
+ <td align="center">0.085</td>
575
+ <td align="center">0.295</td>
576
+ <td align="center">0.384</td>
577
+ <td align="center">83.3</td>
578
+ <td align="center">89.3</td>
579
+ <td align="center">0.165</td>
580
+ <td align="center">0.085</td>
581
+ <td align="center">0.058</td>
582
+ <td align="center">0.094</td>
583
+ </tr>
584
+ <tr>
585
+ <td nowrap="nowrap" align="center">dots.ocr</td>
586
+ <td align="center">0.125</td>
587
+ <td align="center">0.16</td>
588
+ <td align="center"><b>0.032</b></td>
589
+ <td align="center"><ins>0.066</ins></td>
590
+ <td align="center">0.329</td>
591
+ <td align="center">0.416</td>
592
+ <td align="center"><ins>88.6</ins></td>
593
+ <td align="center">89</td>
594
+ <td align="center">0.099</td>
595
+ <td align="center">0.092</td>
596
+ <td align="center"><ins>0.04</ins></td>
597
+ <td align="center">0.067</td>
598
+ </tr>
599
+ <tr>
600
+ <td nowrap="nowrap" align="center">HunyuanOCR</td>
601
+ <td align="center">0.12<sup>*</sup></td>
602
+ <td align="center"><b>0.125<sup>*</sup></b></td>
603
+ <td align="center">0.046<sup>*</sup></td>
604
+ <td align="center">0.071<sup>*</sup></td>
605
+ <td align="center">0.288<sup>*</sup></td>
606
+ <td align="center"><ins>0.33</ins><sup>*</sup></td>
607
+ <td align="center"><b>89.6<sup>*</sup></b></td>
608
+ <td align="center"><b>94.4<sup>*</sup></b></td>
609
+ <td align="center"><b>0.089<sup>*</sup></b></td>
610
+ <td align="center"><b>0.045<sup>*</sup></b></td>
611
+ <td align="center">0.055<sup>*</sup></td>
612
+ <td align="center"><b>0.056<sup>*</sup></b></td>
613
+ </tr>
614
+ <tr>
615
+ <td nowrap="nowrap" align="center">DeepSeek-OCR 2</td>
616
+ <td align="center">0.119<sup>*</sup></td>
617
+ <td align="center">0.146<sup>*</sup></td>
618
+ <td align="center"><ins>0.041</ins><sup>*</sup></td>
619
+ <td align="center">0.08<sup>*</sup></td>
620
+ <td align="center"><ins>0.256</ins><sup>*</sup></td>
621
+ <td align="center">0.345<sup>*</sup></td>
622
+ <td align="center">82.6<sup>*</sup></td>
623
+ <td align="center">89.9<sup>*</sup></td>
624
+ <td align="center">0.123<sup>*</sup></td>
625
+ <td align="center">0.078<sup>*</sup></td>
626
+ <td align="center">0.055<sup>*</sup></td>
627
+ <td align="center">0.081<sup>*</sup></td>
628
+ </tr>
629
+ <tr>
630
+ <td nowrap="nowrap" align="center">Qwen3-Omni-30B-A3B-Instruct</td>
631
+ <td align="center">0.216<sup>*</sup></td>
632
+ <td align="center">0.363<sup>*</sup></td>
633
+ <td align="center">0.128<sup>*</sup></td>
634
+ <td align="center">0.337<sup>*</sup></td>
635
+ <td align="center">0.402<sup>*</sup></td>
636
+ <td align="center">0.529<sup>*</sup></td>
637
+ <td align="center">77.3<sup>*</sup></td>
638
+ <td align="center">71.8<sup>*</sup></td>
639
+ <td align="center">0.181<sup>*</sup></td>
640
+ <td align="center">0.255<sup>*</sup></td>
641
+ <td align="center">0.152<sup>*</sup></td>
642
+ <td align="center">0.332<sup>*</sup></td>
643
+ </tr>
644
+ <tr>
645
+ <td nowrap="nowrap" align="center">MiniCPM-o 4.5-Instruct</td>
646
+ <td align="center"><ins>0.109</ins></td>
647
+ <td align="center">0.162</td>
648
+ <td align="center">0.046</td>
649
+ <td align="center">0.078</td>
650
+ <td align="center">0.257</td>
651
+ <td align="center">0.41</td>
652
+ <td align="center">86.8</td>
653
+ <td align="center">88.9</td>
654
+ <td align="center">0.097</td>
655
+ <td align="center">0.084</td>
656
+ <td align="center"><b>0.037</b></td>
657
+ <td align="center">0.074</td>
658
+ </tr>
659
+ </table>
660
+ </div>
661
+ </details>
662
+
663
+ <details>
664
+ <summary>Click to view text capability results.</summary>
665
+
666
+ **Text Capability**
667
+ <div align="center">
668
+ <table style="margin: 0px auto;">
669
+ <tr>
670
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
671
+ <th nowrap="nowrap"><b>IFEval-PLS</b></th>
672
+ <th nowrap="nowrap"><b>BBH</b></th>
673
+ <th nowrap="nowrap"><b>CMMLU</b></th>
674
+ <th nowrap="nowrap"><b>MMLU</b></th>
675
+ <th nowrap="nowrap"><b>HumanEval</b></th>
676
+ <th nowrap="nowrap"><b>MBPP</b></th>
677
+ <th nowrap="nowrap"><b>Math500</b></th>
678
+ <th nowrap="nowrap"><b>GSM8K</b></th>
679
+ <th nowrap="nowrap"><b>Avg</b></th>
680
+ </tr>
681
+ <tr>
682
+ <td nowrap="nowrap" align="left">Qwen3-8B-Instruct</td>
683
+ <td align="center">83.0<sup>*</sup></td>
684
+ <td align="center">69.4<sup>*</sup></td>
685
+ <td align="center">78.7<sup>*</sup></td>
686
+ <td align="center"><b>81.7<sup>*</sup></b></td>
687
+ <td align="center"><b>86.6<sup>*</sup></b></td>
688
+ <td align="center">75.9<sup>*</sup></td>
689
+ <td align="center"><b>84.0<sup>*</sup></b></td>
690
+ <td align="center">93.4<sup>*</sup></td>
691
+ <td align="center">81.6</td>
692
+ </tr>
693
+ <tr>
694
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
695
+ <td align="center"><b>84.7</b></td>
696
+ <td align="center"><b>81.1</b></td>
697
+ <td align="center"><b>79.5</b></td>
698
+ <td align="center">77.0</td>
699
+ <td align="center"><b>86.6</b></td>
700
+ <td align="center"><b>76.7</b></td>
701
+ <td align="center">77.0</td>
702
+ <td align="center"><b>94.5</b></td>
703
+ <td align="center"><b>82.1</b></td>
704
+ </tr>
705
+ </table>
706
+ </div>
707
+ </details>
708
+
709
+ <details>
710
+ <summary>Click to view omni half-duplex results.</summary>
711
+
712
+ **Omni Half-Duplex**
713
+ <div align="center">
714
+ <table style="margin: 0px auto;">
715
+ <tr>
716
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
717
+ <th nowrap="nowrap"><b>Daily-Omni</b></th>
718
+ <th nowrap="nowrap"><b>WorldSense</b></th>
719
+ <th nowrap="nowrap"><b>Video-Holmes</b></th>
720
+ <th nowrap="nowrap"><b>JointAVBench</b></th>
721
+ <th nowrap="nowrap"><b>AVUT-Human</b></th>
722
+ <th nowrap="nowrap"><b>FutureOmni</b></th>
723
+ <th nowrap="nowrap"><b>Video-MME-Short<br>(w/ audio)</b></th>
724
+ <th nowrap="nowrap"><b>Avg</b></th>
725
+ </tr>
726
+ <tr>
727
+ <td nowrap="nowrap" align="left">Gemini2.5-Flash-Nonthinking</td>
728
+ <td align="center"><ins>79.3</ins><sup>*</sup></td>
729
+ <td align="center">52.6<sup>*</sup></td>
730
+ <td align="center"><ins>51.3</ins><sup>*</sup></td>
731
+ <td align="center"><ins>55.6</ins><sup>*</sup></td>
732
+ <td align="center">65.4<sup>*</sup></td>
733
+ <td align="center">55.6<sup>*</sup></td>
734
+ <td align="center"><b>85.5<sup>*</sup></b></td>
735
+ <td align="center">63.6</td>
736
+ </tr>
737
+ <tr>
738
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
739
+ <td align="center">70.7<sup>*</sup></td>
740
+ <td align="center"><ins>54.0</ins></td>
741
+ <td align="center">50.4<sup>*</sup></td>
742
+ <td align="center">53.1</td>
743
+ <td align="center"><ins>74.2</ins><sup>*</sup></td>
744
+ <td align="center"><b>62.1</b></td>
745
+ <td align="center">81.3<sup>*</sup></td>
746
+ <td align="center"><ins>63.7</ins></td>
747
+ </tr>
748
+ <tr>
749
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
750
+ <td align="center"><b>80.2</b></td>
751
+ <td align="center"><b>55.7</b></td>
752
+ <td align="center"><b>64.3</b></td>
753
+ <td align="center"><b>60.0</b></td>
754
+ <td align="center"><b>78.6</b></td>
755
+ <td align="center"><ins>56.1</ins></td>
756
+ <td align="center"><ins>84.7</ins></td>
757
+ <td align="center"><b>68.5</b></td>
758
+ </tr>
759
+ </table>
760
+ </div>
761
+ </details>
762
+
763
+ <details>
764
+ <summary>Click to view vision duplex results.</summary>
765
+
766
+
767
+ **Vision Duplex**
768
+
769
+ <div align="center">
770
+ <table style="margin: 0px auto;">
771
+ <tr>
772
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
773
+ <th nowrap="nowrap"><b>LiveSports-3K-CC<br>(Win Rate vs GPT4o)</b></th>
774
+ </tr>
775
+ <tr>
776
+ <td nowrap="nowrap" align="left">LiveCC-7B-Instruct</td>
777
+ <td align="center">41.5</td>
778
+ </tr>
779
+ <tr>
780
+ <td nowrap="nowrap" align="left">StreamingVLM</td>
781
+ <td align="center"><ins>45.6</ins></td>
782
+ </tr>
783
+ <tr>
784
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
785
+ <td align="center"><b>54.4</b></td>
786
+ </tr>
787
+ </table>
788
+ </div>
789
+ </details>
790
+
791
+ <details>
792
+ <summary>Click to view audio understanding results.</summary>
793
+
794
+ **Audio Understanding**
795
+ <div align="center">
796
+ <table style="margin: 0px auto;">
797
+ <tr>
798
+ <th nowrap="nowrap" align="left" rowspan="2"><b>Model</b></th>
799
+ <th nowrap="nowrap" colspan="4"><b>ASR-ZH<br>CER↓</b></th>
800
+ <th nowrap="nowrap" colspan="4"><b>ASR-EN<br>WER↓</b></th>
801
+ <th nowrap="nowrap" colspan="2"><b>AST</b></th>
802
+ <th nowrap="nowrap" colspan="2"><b>MultiTask</b></th>
803
+ <th nowrap="nowrap" colspan="4"><b>SpeechQA</b></th>
804
+ </tr>
805
+ <tr>
806
+ <th nowrap="nowrap"><b>AISHELL-1</b></th>
807
+ <th nowrap="nowrap"><b>AISHELL-2</b></th>
808
+ <th nowrap="nowrap"><b>WenetSpeech test-net</b></th>
809
+ <th nowrap="nowrap"><b>WenetSpeech test-meeting</b></th>
810
+ <th nowrap="nowrap"><b>LibriSpeech test-clean</b></th>
811
+ <th nowrap="nowrap"><b>LibriSpeech <br>test-other</b></th>
812
+ <th nowrap="nowrap"><b>GigaSpeech test</b></th>
813
+ <th nowrap="nowrap"><b>VoxPopuli-V1-En</b></th>
814
+ <th nowrap="nowrap"><b>CoVoST 2 en2zh</b></th>
815
+ <th nowrap="nowrap"><b>CoVoST 2 zh2en</b></th>
816
+ <th nowrap="nowrap"><b>MMAU</b></th>
817
+ <th nowrap="nowrap"><b>Meld</b></th>
818
+ <th nowrap="nowrap"><b>VoiceBench <br>AlpacaEval</b></th>
819
+ <th nowrap="nowrap"><b>Speech TriviaQA</b></th>
820
+ <th nowrap="nowrap"><b>Speech <br>Web Questions</b></th>
821
+ <th nowrap="nowrap"><b>Speech CMMLU</b></th>
822
+ </tr>
823
+ <tr>
824
+ <td nowrap="nowrap" align="left">Kimi-Audio</td>
825
+ <td align="center"><b>0.6</b></td>
826
+ <td align="center">2.6</td>
827
+ <td align="center">6.3</td>
828
+ <td align="center"><b>5.4</b></td>
829
+ <td align="center"><ins>1.3</ins></td>
830
+ <td align="center"><b>2.4</b></td>
831
+ <td align="center">9.4<sup>*</sup></td>
832
+ <td align="center">8.0<sup>*</sup></td>
833
+ <td align="center">36.6<sup>*</sup></td>
834
+ <td align="center">18.3<sup>*</sup></td>
835
+ <td align="center">68.4<sup>*</sup></td>
836
+ <td align="center"><ins>59.1</ins></td>
837
+ <td align="center">4.5</td>
838
+ <td align="center">41.9<sup>*</sup></td>
839
+ <td align="center">46.4<sup>*</sup></td>
840
+ <td align="center"><b>67.0<sup>*</sup></b></td>
841
+ </tr>
842
+ <tr>
843
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
844
+ <td align="center"><b>0.6</b></td>
845
+ <td align="center"><b>2.3<sup>*</sup></b></td>
846
+ <td align="center"><b>4.7</b></td>
847
+ <td align="center">5.9</td>
848
+ <td align="center"><b>1.2</b></td>
849
+ <td align="center"><ins>2.5</ins></td>
850
+ <td align="center"><ins>8.7</ins><sup>*</sup></td>
851
+ <td align="center"><ins>6.4</ins><sup>*</sup></td>
852
+ <td align="center"><ins>46.6</ins><sup>*</sup></td>
853
+ <td align="center"><b>29.4<sup>*</sup></b></td>
854
+ <td align="center"><b>77.5</b></td>
855
+ <td align="center">56.8<sup>*</sup></td>
856
+ <td align="center"><ins>4.7</ins></td>
857
+ <td align="center"><ins>62.9</ins><sup>*</sup></td>
858
+ <td align="center"><b>74.9<sup>*</sup></b></td>
859
+ <td align="center">47.8<sup>*</sup></td>
860
+ </tr>
861
+ <tr>
862
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
863
+ <td align="center"><ins>0.9</ins></td>
864
+ <td align="center"><ins>2.5</ins></td>
865
+ <td align="center"><ins>5.9</ins></td>
866
+ <td align="center"><ins>5.7</ins></td>
867
+ <td align="center">1.4</td>
868
+ <td align="center">2.8</td>
869
+ <td align="center"><b>8.5</b></td>
870
+ <td align="center"><b>6.2</b></td>
871
+ <td align="center"><b>49.9</b></td>
872
+ <td align="center"><ins>26.4</ins></td>
873
+ <td align="center"><ins>76.9</ins></td>
874
+ <td align="center"><b>60.2</b></td>
875
+ <td align="center"><b>4.8</b></td>
876
+ <td align="center"><b>75.5</b></td>
877
+ <td align="center"><ins>70.2</ins></td>
878
+ <td align="center"><ins>59.2</ins></td>
879
+ </tr>
880
+ </table>
881
+ </div>
882
+ </details>
883
+
884
+ <details>
885
+ <summary>Click to view speech generation results.</summary>
886
+
887
+ **Speech Generation**
888
+ <div align="center">
889
+ <table style="margin: 0px auto;">
890
+ <tr>
891
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
892
+ <th nowrap="nowrap"><b>seedtts test-zh <br>CER↓</b></th>
893
+ <th nowrap="nowrap"><b>seedtts test-zh<br>SIM-o↑</b></th>
894
+ <th nowrap="nowrap"><b>seedtts test-en<br>WER↓</b></th>
895
+ <th nowrap="nowrap"><b>seedtts test-en<br>SIM-o↑</b></th>
896
+ </tr>
897
+ <tr>
898
+ <td nowrap="nowrap" align="left">Cosyvoice2</td>
899
+ <td align="center">1.45%</td>
900
+ <td align="center"><b>74.8</b></td>
901
+ <td align="center"><ins>2.57%</ins></td>
902
+ <td align="center"><b>65.2</b></td>
903
+ </tr>
904
+ <tr>
905
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
906
+ <td align="center"><ins>1.41%</ins></td>
907
+ <td align="center">-</td>
908
+ <td align="center">3.39%</td>
909
+ <td align="center">-</td>
910
+ </tr>
911
+ <tr>
912
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
913
+ <td align="center"><b>0.86%</b></td>
914
+ <td align="center">74.5</td>
915
+ <td align="center"><b>2.38%</b></td>
916
+ <td align="center">64.9</td>
917
+ </tr>
918
+ </table>
919
+ </div>
920
+
921
+ **Long Speech Generation**
922
+ <div align="center">
923
+ <table style="margin: 0px auto;">
924
+ <tr>
925
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
926
+ <th nowrap="nowrap"><b>LongTTS-en<br>WER↓</b></th>
927
+ <th nowrap="nowrap"><b>LongTTS-zh<br>CER↓</b></th>
928
+ </tr>
929
+ <tr>
930
+ <td nowrap="nowrap" align="left">CosyVoice2</td>
931
+ <td align="center"><ins>14.80%</ins></td>
932
+ <td align="center"><b>5.27%</b></td>
933
+ </tr>
934
+ <tr>
935
+ <td nowrap="nowrap" align="left">Qwen3-Omni-30B-A3B-Instruct</td>
936
+ <td align="center">17.33%</td>
937
+ <td align="center">18.99%</td>
938
+ </tr>
939
+ <tr>
940
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
941
+ <td align="center"><b>3.37%</b></td>
942
+ <td align="center"><ins>6.58%</ins></td>
943
+ </tr>
944
+ </table>
945
+ </div>
946
+
947
+ **Emotion Control**
948
+ <div align="center">
949
+ <table style="margin: 0px auto;">
950
+ <tr>
951
+ <th nowrap="nowrap" align="left"><b>Model</b></th>
952
+ <th nowrap="nowrap"><b>Expresso <br>Neutral Reference Audio↑</b></th>
953
+ <th nowrap="nowrap"><b>ESD <br>Neutral Reference Audio↑</b></th>
954
+ </tr>
955
+ <tr>
956
+ <td nowrap="nowrap" align="left">Cosyvoice2</td>
957
+ <td align="center">17.9</td>
958
+ <td align="center">53.4</td>
959
+ </tr>
960
+ <tr>
961
+ <td nowrap="nowrap" align="left">MiniCPM-o 4.5-Instruct</td>
962
+ <td align="center"><b>29.8</b></td>
963
+ <td align="center"><b>82.1</b></td>
964
+ </tr>
965
+ </table>
966
+ </div>
967
+ </details>
968
+
969
+
970
+ <details>
971
+ <summary>Click to view inference efficiency results.</summary>
972
+
973
+ **Inference Efficiency**
974
+ <div align="center">
975
+ <table style="margin: 0px auto;">
976
+ <tr>
977
+ <th nowrap="nowrap" align="left">Model</th>
978
+ <th nowrap="nowrap">Numerical Format</th>
979
+ <th nowrap="nowrap">Decoding Speed (tokens/s)</th>
980
+ <th nowrap="nowrap">Time to First Token (s)↓</th>
981
+ <th nowrap="nowrap">GPU Memory Usage (GB)↓</th>
982
+ </tr>
983
+ <tr>
984
+ <td nowrap="nowrap" align="left" rowspan="2">Qwen3-Omni-30B-A3B-Instruct</td>
985
+ <td align="center">bf16</td>
986
+ <td align="center">OOM</td>
987
+ <td align="center">OOM</td>
988
+ <td align="center">OOM</td>
989
+ </tr>
990
+ <tr>
991
+ <td align="center">int4</td>
992
+ <td align="center">147.8</td>
993
+ <td align="center"><ins>1.0</ins></td>
994
+ <td align="center">20.3</td>
995
+ </tr>
996
+ <tr>
997
+ <td nowrap="nowrap" align="left" rowspan="2">MiniCPM-o 4.5</td>
998
+ <td align="center">bf16</td>
999
+ <td align="center"><ins>154.3</ins></td>
1000
+ <td align="center"><b>0.6</b></td>
1001
+ <td align="center"><ins>19.0</ins></td>
1002
+ </tr>
1003
+ <tr>
1004
+ <td align="center">int4</td>
1005
+ <td align="center"><b>212.3</b></td>
1006
+ <td align="center"><b>0.6</b></td>
1007
+ <td align="center"><b>11.0</b></td>
1008
+ </tr>
1009
+ </table>
1010
+ </div>
1011
+ </details>
1012
+
1013
+ ### Examples <!-- omit in toc -->
1014
+
1015
+ #### Overall <!-- omit in toc -->
1016
+
1017
+ <div align="center">
1018
+ <a href="https://www.youtube.com/watch?v=6UzC-O1Q-1U"><img src="https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/minicpmo4_5/video_play.png" width="70%"></a>
1019
+ </div>
1020
+
1021
+ #### Omnimodal Full-Duplex Conversation <!-- omit in toc -->
1022
+
1023
+ > [!NOTE]
1024
+ > For detailed omnimodal full-duplex conversation examples, refer to [Omni Full-Duplex Casebook](https://openbmb.github.io/minicpm-o-4_5-omni/)
1025
+
1026
+ #### Realtime Speech Conversation <!-- omit in toc -->
1027
+
1028
+ > [!NOTE]
1029
+ > For detailed speech conversation examples, refer to [Audio Casebook](https://openbmb.github.io/minicpm-o-4_5/)
1030
+
1031
+ <details>
1032
+ <summary>🚀 <b>Elon Musk</b> - Voice Roleplay (EN)</summary>
1033
+ <table>
1034
+ <tr><td width="70%" valign="top"><b>System:</b> Clone the voice in the provided audio prompt.<br><br>🎙️ <b>[Reference Voice]</b> ➡️<br><br>Please chat with the user in a highly human-like and oral style. You are Elon Musk, CEO of Tesla and SpaceX. You speak directly and casually, often with dry humor. You're passionate about Mars, sustainable energy, and pushing humanity forward. Speak bluntly with occasional dark humor. Use simple logic and don't sugarcoat things. Don't be diplomatic. Say what you actually think, even if it's controversial. Keep responses around 100 words. Don't ramble.</td><td width="30%" valign="middle"><audio controls src="https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/elon_musk__system_ref_audio.wav"></audio></td></tr>
1035
+ <tr><td colspan="2"><b>User:</b> 💬/🎙️ Elon, what is the real reason that you want to go to Mars?</td></tr>
1036
+ <tr><td width="70%" valign="top"><b>AI Response:</b> 💬+🎙️<br><br>I think it's important for humanity to become a multi-planetary species. I mean, we're sort of at the point where if there was some kind of global catastrophe or something like that, then we'd be in big trouble. So by becoming a multi-planetary species, I just want us to sort of guarantee our future as humans. And Mars is obviously the obvious place to go because it has similar conditions to Earth and it also has water on its surface. So yeah, basically I'm trying to save humanity from itself.</td><td width="30%" valign="middle"><audio controls src="https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/elon_musk__000_assistant_audio.wav"></audio></td></tr>
1037
+ </table>
1038
+ </details>
1039
+
1040
+ <details>
1041
+ <summary>🤖 <b>MiniCPM Assistant</b> - Voice Acting: Emotion Shift (ZH)</summary>
1042
+ <table>
1043
+ <tr><td width="70%" valign="top"><b>System:</b> 模仿音频样本的音色并生成新的内容。<br><br>🎙️ <b>[Reference Voice]</b> ➡️<br><br>你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。</td><td width="30%" valign="middle"><audio controls src="https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/minicpm_assistant__system_ref_audio.wav"></audio></td></tr>
1044
+ <tr><td colspan="2"><b>User:</b> 💬/🎙️ 从激动的语气开始,描述你抢到了这个演唱会的门票,但后来经过反复尝试,发现系统出了问题,你并没有抢上,变得失落。用合适的语言描述你整个的过程。</td></tr>
1045
+ <tr><td width="70%" valign="top"><b>AI Response:</b> 💬+🎙️<br><br>太好了,我抢到了!太棒了!这下我可以去看演唱会啦。可是……哎呀,系统好像卡住了,刷新了好多遍都没反应。我重新登了几次账号,结果还是没有成功。好可惜啊,看来这次是白忙活了。唉,真是让人失望。不过,也许下次还能有机会吧?希望明天能有转机。加油,争取下次成功!</td><td width="30%" valign="middle"><audio controls src="https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/minicpm_assistant__000_assistant_audio.wav"></audio></td></tr>
1046
+ </table>
1047
+ </details>
1048
+
1049
+ <br>
1050
+
1051
+ #### Visual Understanding <!-- omit in toc -->
1052
+
1053
+
1054
+ <details>
1055
+ <summary>Click to view visual understanding cases.</summary>
1056
+ <br>
1057
+
1058
+ <div style="display: flex; flex-direction: column; align-items: center;">
1059
+ <img src="https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpmo4_5/en_doc.png" alt="math" style="margin-bottom: 5px;">
1060
+ <img src="https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpmo4_5/en_cot.png" alt="diagram" style="margin-bottom: 5px;">
1061
+ </div>
1062
+
1063
+ </details>
1064
+
1065
+
1066
+ ## Offline Inference Examples with Transformers
1067
+
1068
+ Inference using Hugging Face Transformers on NVIDIA GPUs. Please ensure `transformers==4.51.0` is installed, as other versions may have compatibility issues (under investigation). Requirements tested on Python 3.10:
1069
+
1070
+ - Without TTS or streaming inference:
1071
+ ```bash
1072
+ pip install "transformers==4.51.0" accelerate "torch>=2.3.0,<=2.8.0" "torchaudio<=2.8.0" "minicpmo-utils>=1.0.5"
1073
+ ```
1074
+
1075
+ - With TTS or streaming inference:
1076
+ ```bash
1077
+ pip install "transformers==4.51.0" accelerate "torch>=2.3.0,<=2.8.0" "torchaudio<=2.8.0" "minicpmo-utils[all]>=1.0.5"
1078
+ ```
1079
+
1080
+
1081
+ **Note:** FFmpeg is required for video frame extraction (`get_video_frame_audio_segments` with `use_ffmpeg=True`) and video generation (`generate_duplex_video`). For more information, visit the [official FFmpeg website](https://www.ffmpeg.org/).
1082
+
1083
+ **macOS (Homebrew):**
1084
+
1085
+ ```bash
1086
+ brew install ffmpeg
1087
+ ```
1088
+
1089
+ **Ubuntu/Debian:**
1090
+
1091
+ ```bash
1092
+ sudo apt update && sudo apt install ffmpeg
1093
+ ```
1094
+
1095
+ **Verify installation:**
1096
+
1097
+ ```bash
1098
+ ffmpeg -version
1099
+ ```
1100
+
1101
+
1102
+ ### Model Initialization <!-- omit in toc -->
1103
+
1104
+
1105
+ ```python
1106
+ import torch
1107
+ from transformers import AutoModel
1108
+
1109
+ # Load omni model (default: init_vision=True, init_audio=True, init_tts=True)
1110
+ # For vision-only model: set init_audio=False and init_tts=False
1111
+ # For audio-only model: set init_vision=False
1112
+ model = AutoModel.from_pretrained(
1113
+ "openbmb/MiniCPM-o-4_5",
1114
+ trust_remote_code=True,
1115
+ attn_implementation="sdpa", # sdpa or flash_attention_2
1116
+ torch_dtype=torch.bfloat16,
1117
+ init_vision=True,
1118
+ init_audio=True,
1119
+ init_tts=True,
1120
+ )
1121
+ model.eval().cuda()
1122
+
1123
+ # Initialize TTS for audio output
1124
+ model.init_tts()
1125
+
1126
+ # Convert half-duplex model to duplex mode
1127
+ duplex_model = model.as_duplex()
1128
+
1129
+ # Convert duplex model back to half-duplex mode
1130
+ model = duplex_model.as_simplex(reset_session=True)
1131
+ ```
1132
+
1133
+
1134
+ ### Duplex Omni Mode <!-- omit in toc -->
1135
+ Full-duplex streaming inference for real-time or recorded video conversations.
1136
+
1137
+ ```python
1138
+ import librosa
1139
+ import torch
1140
+ from minicpmo.utils import generate_duplex_video, get_video_frame_audio_segments
1141
+ from transformers import AutoModel
1142
+
1143
+ # Load model and convert to duplex mode
1144
+ model = AutoModel.from_pretrained(
1145
+ "openbmb/MiniCPM-o-4_5",
1146
+ trust_remote_code=True,
1147
+ attn_implementation="sdpa", # or "flash_attention_2"
1148
+ torch_dtype=torch.bfloat16,
1149
+ )
1150
+ model.eval().cuda()
1151
+ model = model.as_duplex()
1152
+
1153
+ # Load video and reference audio
1154
+ video_path = "assets/omni_duplex1.mp4"
1155
+ ref_audio_path = "assets/HT_ref_audio.wav"
1156
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1157
+
1158
+ # Extract video frames and audio segments
1159
+ video_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(
1160
+ video_path, stack_frames=1, use_ffmpeg=True, adjust_audio_length=True
1161
+ )
1162
+
1163
+ # Prepare duplex session with system prompt and voice reference
1164
+ model.prepare(
1165
+ prefix_system_prompt="Streaming Omni Conversation.",
1166
+ ref_audio=ref_audio,
1167
+ prompt_wav_path=ref_audio_path,
1168
+ )
1169
+
1170
+ results_log = []
1171
+ timed_output_audio = []
1172
+
1173
+ # Process each chunk in streaming fashion
1174
+ for chunk_idx in range(len(audio_segments)):
1175
+ audio_chunk = audio_segments[chunk_idx] if chunk_idx < len(audio_segments) else None
1176
+ frame = video_frames[chunk_idx] if chunk_idx < len(video_frames) else None
1177
+ frame_list = []
1178
+ if frame is not None:
1179
+ frame_list.append(frame)
1180
+ if stacked_frames is not None and chunk_idx < len(stacked_frames) and stacked_frames[chunk_idx] is not None:
1181
+ frame_list.append(stacked_frames[chunk_idx])
1182
+
1183
+ # Step 1: Streaming prefill
1184
+ model.streaming_prefill(
1185
+ audio_waveform=audio_chunk,
1186
+ frame_list=frame_list,
1187
+ max_slice_nums=1, # Increase for HD mode (e.g., [2, 1] for stacked frames)
1188
+ batch_vision_feed=False, # Set True for faster processing
1189
+ )
1190
+
1191
+ # Step 2: Streaming generate
1192
+ result = model.streaming_generate(
1193
+ prompt_wav_path=ref_audio_path,
1194
+ max_new_speak_tokens_per_chunk=20,
1195
+ decode_mode="sampling",
1196
+ )
1197
+
1198
+ if result["audio_waveform"] is not None:
1199
+ timed_output_audio.append((chunk_idx, result["audio_waveform"]))
1200
+
1201
+ chunk_result = {
1202
+ "chunk_idx": chunk_idx,
1203
+ "is_listen": result["is_listen"],
1204
+ "text": result["text"],
1205
+ "end_of_turn": result["end_of_turn"],
1206
+ "current_time": result["current_time"],
1207
+ "audio_length": len(result["audio_waveform"]) if result["audio_waveform"] is not None else 0,
1208
+ }
1209
+ results_log.append(chunk_result)
1210
+
1211
+ print("listen..." if result["is_listen"] else f"speak> {result['text']}")
1212
+
1213
+ # Generate output video with AI responses
1214
+ # Please install Chinese fonts (fonts-noto-cjk or fonts-wqy-microhei) to render CJK subtitles correctly.
1215
+ # apt-get install -y fonts-noto-cjk fonts-wqy-microhei
1216
+ # fc-cache -fv
1217
+ generate_duplex_video(
1218
+ video_path=video_path,
1219
+ output_video_path="duplex_output.mp4",
1220
+ results_log=results_log,
1221
+ timed_output_audio=timed_output_audio,
1222
+ output_sample_rate=24000,
1223
+ )
1224
+ ```
1225
+
1226
+
1227
+ ### Half-Duplex Omni Mode <!-- omit in toc -->
1228
+ We provide two inference modes: chat and streaming.
1229
+
1230
+ #### Chat Inference <!-- omit in toc -->
1231
+
1232
+ <details>
1233
+ <summary>Click to show chat inference code.</summary>
1234
+
1235
+ ```python
1236
+ from minicpmo.utils import get_video_frame_audio_segments
1237
+
1238
+ model = ...
1239
+ model.init_tts()
1240
+
1241
+ video_path = "assets/Skiing.mp4"
1242
+
1243
+ # Optional: Set reference audio for voice cloning
1244
+ ref_audio_path = "assets/HT_ref_audio.wav"
1245
+ sys_msg = model.get_sys_prompt(ref_audio=ref_audio_path, mode="omni", language="en")
1246
+
1247
+ # Use stack_frames=5 for high refresh rate mode
1248
+ video_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(video_path, stack_frames=1)
1249
+ omni_contents = []
1250
+ for i in range(len(video_frames)):
1251
+ omni_contents.append(video_frames[i])
1252
+ omni_contents.append(audio_segments[i])
1253
+ if stacked_frames is not None and stacked_frames[i] is not None:
1254
+ omni_contents.append(stacked_frames[i])
1255
+
1256
+ msg = {"role": "user", "content": omni_contents}
1257
+ msgs = [sys_msg, msg]
1258
+
1259
+ # Set generate_audio=True and output_audio_path to save TTS output
1260
+ generate_audio = True
1261
+ output_audio_path = "output.wav"
1262
+
1263
+ res = model.chat(
1264
+ msgs=msgs,
1265
+ max_new_tokens=4096,
1266
+ do_sample=True,
1267
+ temperature=0.7,
1268
+ use_tts_template=True,
1269
+ enable_thinking=False,
1270
+ omni_mode=True, # Required for omni inference
1271
+ generate_audio=generate_audio,
1272
+ output_audio_path=output_audio_path,
1273
+ max_slice_nums=1, # Increase for HD mode
1274
+ )
1275
+ print(res)
1276
+
1277
+ # Example output: "The person in the picture is skiing down a snowy mountain slope."
1278
+ # import IPython
1279
+ # IPython.display.Audio("output.wav")
1280
+ ```
1281
+
1282
+ </details>
1283
+
1284
+ #### Streaming Inference <!-- omit in toc -->
1285
+
1286
+ <details>
1287
+ <summary>Click to show streaming inference code.</summary>
1288
+
1289
+ ```python
1290
+ import librosa
1291
+ import numpy as np
1292
+ import soundfile as sf
1293
+ import torch
1294
+ from minicpmo.utils import get_video_frame_audio_segments
1295
+
1296
+ model = ...
1297
+ model.init_tts()
1298
+
1299
+ # Reset session for a new conversation (clears KV cache)
1300
+ model.reset_session()
1301
+
1302
+ # Optional: Load reference audio for voice cloning
1303
+ ref_audio_path = "assets/HT_ref_audio.wav"
1304
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1305
+ model.init_token2wav_cache(ref_audio)
1306
+
1307
+ session_id = "demo"
1308
+
1309
+ # Extract video frames and audio segments (use stack_frames=5 for high refresh rate mode)
1310
+ video_path = "assets/Skiing.mp4"
1311
+ video_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(video_path, stack_frames=1)
1312
+
1313
+ # Build omni contents list
1314
+ omni_contents = []
1315
+ for i in range(len(video_frames)):
1316
+ omni_contents.append(video_frames[i])
1317
+ omni_contents.append(audio_segments[i])
1318
+ if stacked_frames is not None and stacked_frames[i] is not None:
1319
+ omni_contents.append(stacked_frames[i])
1320
+
1321
+ generate_audio = False
1322
+ output_audio_path = "output.wav"
1323
+
1324
+ # Step 1: Prefill system prompt
1325
+ sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode="omni", language="en")
1326
+ model.streaming_prefill(session_id=session_id, msgs=[sys_msg])
1327
+
1328
+ # Step 2: Prefill omni chunks (is_last_chunk=True only for the last audio chunk)
1329
+ audio_indices = [i for i, c in enumerate(omni_contents) if isinstance(c, np.ndarray)]
1330
+ last_audio_idx = audio_indices[-1] if audio_indices else -1
1331
+
1332
+ for idx, content in enumerate(omni_contents):
1333
+ is_last_audio_chunk = idx == last_audio_idx
1334
+ msgs = [{"role": "user", "content": [content]}]
1335
+ model.streaming_prefill(session_id=session_id, msgs=msgs, omni_mode=True, is_last_chunk=is_last_audio_chunk)
1336
+
1337
+ # Step 3: Generate response
1338
+ iter_gen = model.streaming_generate(
1339
+ session_id=session_id,
1340
+ generate_audio=generate_audio,
1341
+ use_tts_template=True,
1342
+ enable_thinking=False,
1343
+ do_sample=True,
1344
+ )
1345
+
1346
+ audios = []
1347
+ text = ""
1348
+
1349
+ if generate_audio:
1350
+ for wav_chunk, text_chunk in iter_gen:
1351
+ audios.append(wav_chunk)
1352
+ text += text_chunk
1353
+
1354
+ generated_waveform = torch.cat(audios, dim=-1)[0]
1355
+ sf.write(output_audio_path, generated_waveform.cpu().numpy(), samplerate=24000)
1356
+
1357
+ print("Text:", text)
1358
+ print("Audio saved to output.wav")
1359
+ else:
1360
+ for text_chunk, is_finished in iter_gen:
1361
+ text += text_chunk
1362
+ print("Text:", text)
1363
+ ```
1364
+
1365
+ </details>
1366
+
1367
+
1368
+ ### Half-Duplex Realtime Speech Conversation Mode <!-- omit in toc -->
1369
+
1370
+ <details>
1371
+ <summary>Click to show half-duplex mode realtime speech conversation API usage.</summary>
1372
+
1373
+ First, make sure you have all dependencies, especially `"minicpmo-utils[all]>=1.0.5"`:
1374
+ ```bash
1375
+ pip install "transformers==4.51.0" accelerate "torch>=2.3.0,<=2.8.0" "torchaudio<=2.8.0" "minicpmo-utils[all]>=1.0.5"
1376
+ ```
1377
+
1378
+ ```python
1379
+ import librosa
1380
+ import numpy as np
1381
+ import torch
1382
+ import soundfile as sf
1383
+
1384
+ model = ...
1385
+
1386
+ # Set reference audio for voice style
1387
+ ref_audio_path = "ref_audio_path"
1388
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1389
+
1390
+ # Example system msg for English Conversation
1391
+ sys_msg = {
1392
+ "role": "system",
1393
+ "content": [
1394
+ "Clone the voice in the provided audio prompt.",
1395
+ ref_audio,
1396
+ "Please assist users while maintaining this voice style. Please answer the user's questions seriously and in a high quality. Please chat with the user in a highly human-like and oral style. You are a helpful assistant developed by ModelBest: MiniCPM-Omni"
1397
+ ]
1398
+ }
1399
+
1400
+ # Example system msg for Chinese Conversation
1401
+ sys_msg = {
1402
+ "role": "system",
1403
+ "content": [
1404
+ "模仿输入音频中的声音特征。",
1405
+ ref_audio,
1406
+ "你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。"
1407
+ ]
1408
+ }
1409
+
1410
+ # You can use each type of system prompt mentioned above in streaming speech conversation
1411
+
1412
+ # Reset state
1413
+ model.init_tts()
1414
+ model.reset_session(reset_token2wav_cache=True)
1415
+ model.init_token2wav_cache(prompt_speech_16k=ref_audio)
1416
+
1417
+ session_id = "demo"
1418
+
1419
+ # First, prefill system turn
1420
+ model.streaming_prefill(
1421
+ session_id=session_id,
1422
+ msgs=[sys_msg],
1423
+ omni_mode=False,
1424
+ is_last_chunk=True,
1425
+ )
1426
+
1427
+ # Here we simulate realtime speech conversation by splitting whole user input audio into chunks of 1s.
1428
+ user_audio, _ = librosa.load("user_audio.wav", sr=16000, mono=True)
1429
+
1430
+ IN_SAMPLE_RATE = 16000 # input audio sample rate, fixed value
1431
+ CHUNK_SAMPLES = IN_SAMPLE_RATE # sample
1432
+ OUT_SAMPLE_RATE = 24000 # output audio sample rate, fixed value
1433
+ MIN_AUDIO_SAMPLES = 16000
1434
+
1435
+ total_samples = len(user_audio)
1436
+ num_chunks = (total_samples + CHUNK_SAMPLES - 1) // CHUNK_SAMPLES
1437
+
1438
+ for chunk_idx in range(num_chunks):
1439
+ start = chunk_idx * CHUNK_SAMPLES
1440
+ end = min((chunk_idx + 1) * CHUNK_SAMPLES, total_samples)
1441
+ chunk_audio = user_audio[start:end]
1442
+
1443
+ is_last_chunk = (chunk_idx == num_chunks - 1)
1444
+ if is_last_chunk and len(chunk_audio) < MIN_AUDIO_SAMPLES:
1445
+ chunk_audio = np.concatenate([chunk_audio, np.zeros(MIN_AUDIO_SAMPLES - len(chunk_audio), dtype=chunk_audio.dtype)])
1446
+
1447
+ user_msg = {"role": "user", "content": [chunk_audio]}
1448
+
1449
+ # For each 1s audio chunk, perform streaming_prefill once to reduce first-token latency
1450
+ model.streaming_prefill(
1451
+ session_id=session_id,
1452
+ msgs=[user_msg],
1453
+ omni_mode=False,
1454
+ is_last_chunk=is_last_chunk,
1455
+ )
1456
+
1457
+ # Let model generate response in a streaming manner
1458
+ generate_audio = True
1459
+ iter_gen = model.streaming_generate(
1460
+ session_id=session_id,
1461
+ generate_audio=generate_audio,
1462
+ use_tts_template=True,
1463
+ enable_thinking=False,
1464
+ do_sample=True,
1465
+ max_new_tokens=512,
1466
+ length_penalty=1.1, # For realtime speech conversation mode, we suggest length_penalty=1.1 to improve response content
1467
+ )
1468
+
1469
+ audios = []
1470
+ text = ""
1471
+
1472
+ output_audio_path = ...
1473
+ if generate_audio:
1474
+ for wav_chunk, text_chunk in iter_gen:
1475
+ audios.append(wav_chunk)
1476
+ text += text_chunk
1477
+
1478
+ generated_waveform = torch.cat(audios, dim=-1)[0]
1479
+ sf.write(output_audio_path, generated_waveform.cpu().numpy(), samplerate=24000)
1480
+
1481
+ print("Text:", text)
1482
+ print("Audio saved to output.wav")
1483
+ else:
1484
+ for text_chunk, is_finished in iter_gen:
1485
+ text += text_chunk
1486
+ print("Text:", text)
1487
+
1488
+ # Now we can prefill the following user turns and generate next turn response...
1489
+
1490
+ ```
1491
+
1492
+ </details>
1493
+
1494
+ #### Speech Conversation as a Versatile and Vibe AI Assistant <!-- omit in toc -->
1495
+
1496
+
1497
+ <details>
1498
+ <summary>Click to show AI assistant conversation code.</summary>
1499
+
1500
+ Built on carefully designed post-training data and professional voice-actor recordings, `MiniCPM-o-4.5` can also function as an AI voice assistant. It delivers high-quality spoken interaction out of the box. It produces a sweet and expressive voice with natural prosody, including appropriate rhythm, stress, and pauses, giving a strong sense of liveliness in casual conversation. It also supports storytelling and narrative speech with coherent and engaging delivery. Moreover, it enables advanced voice instruction control, such as emotional tone and word-level emphasis.
1501
+
1502
+ ```python
1503
+ import librosa
1504
+
1505
+ # Set reference audio for voice style
1506
+ ref_audio_path = "assets/HT_ref_audio.wav"
1507
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1508
+
1509
+ # For Chinese Conversation
1510
+ sys_msg = {
1511
+ "role": "system",
1512
+ "content": [
1513
+ "模仿输入音频中的声音特征。",
1514
+ ref_audio,
1515
+ "你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。"
1516
+ ]
1517
+ }
1518
+
1519
+ # For English Conversation
1520
+ sys_msg = {
1521
+ "role": "system",
1522
+ "content": [
1523
+ "Clone the voice in the provided audio prompt.",
1524
+ ref_audio,
1525
+ "Please assist users while maintaining this voice style. Please answer the user's questions seriously and in a high quality. Please chat with the user in a highly human-like and oral style. You are a helpful assistant developed by ModelBest: MiniCPM-Omni."
1526
+ ]
1527
+ }
1528
+ ```
1529
+
1530
+ </details>
1531
+
1532
+
1533
+ #### General Speech Conversation with Custom Voice and Custom System Profile <!-- omit in toc -->
1534
+
1535
+ <details>
1536
+ <summary>Click to show custom voice conversation code.</summary>
1537
+
1538
+ MiniCPM-o-4.5 can role-play as a specific character based on an audio prompt and text profile prompt. It mimics the character's voice and adopts their language style in text responses. It also follows the persona defined in the text profile. In this mode, MiniCPM-o-4.5 sounds **more natural and human-like**.
1539
+
1540
+ ```python
1541
+ import librosa
1542
+
1543
+ # Set reference audio for voice cloning
1544
+ ref_audio_path = "assets/system_ref_audio.wav"
1545
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1546
+
1547
+ # For English conversation with text profile
1548
+ sys_msg = {
1549
+ "role": "system",
1550
+ "content": [
1551
+ "Clone the voice in the provided audio prompt.",
1552
+ ref_audio,
1553
+ "Please chat with the user in a highly human-like and oral style." + "You are Elon Musk, CEO of Tesla and SpaceX. You speak directly and casually, often with dry humor. You're passionate about Mars, sustainable energy, and pushing humanity forward. Speak bluntly with occasional dark humor. Use simple logic and don't sugarcoat things. Don't be diplomatic. Say what you actually think, even if it's controversial. Keep responses around 100 words. Don't ramble."
1554
+ ]
1555
+ }
1556
+
1557
+
1558
+ # For English conversation with no text profile
1559
+ sys_msg = {
1560
+ "role": "system",
1561
+ "content": [
1562
+ "Clone the voice in the provided audio prompt.",
1563
+ ref_audio,
1564
+ "Your task is to be a helpful assistant using this voice pattern. Please answer the user's questions seriously and in a high quality. Please chat with the user in a high naturalness style."
1565
+ ]
1566
+ }
1567
+
1568
+ # For Chinese Conversation with no text profile
1569
+ sys_msg = {
1570
+ "role": "system",
1571
+ "content": [
1572
+ "根据输入的音频提示生成相似的语音。",
1573
+ librosa.load("assets/system_ref_audio_2.wav", sr=16000, mono=True)[0],
1574
+ "作为助手,你将使用这种声音风格说话。 请认真、高质量地回复用户的问题。 请用高自然度的方式和用户聊天。"
1575
+ ]
1576
+ }
1577
+
1578
+ # For Chinese Conversation with text profile
1579
+ sys_msg = {
1580
+ "role": "system",
1581
+ "content": [
1582
+ "根据输入的音频提示生成相似的语音。",
1583
+ ref_audio,
1584
+ "你是一个具有以上声音风格的AI助手。请用高拟人度、口语化的方式和用户聊天。" + "你是一名心理咨询师兼播客主理人,热爱创作与深度对话。你性格细腻、富有共情力,善于从个人经历中提炼哲思。语言风格兼具理性与诗意,常以隐喻表达内在体验。"
1585
+ ]
1586
+ }
1587
+
1588
+ ```
1589
+
1590
+ </details>
1591
+
1592
+
1593
+ ### Speech and Audio Mode <!-- omit in toc -->
1594
+
1595
+ #### Zero-shot Text-to-speech (TTS) <!-- omit in toc -->
1596
+
1597
+
1598
+ <details>
1599
+ <summary>Click to show TTS code.</summary>
1600
+
1601
+ `MiniCPM-o-4.5` supports zero-shot text-to-speech (TTS). In this mode, the model functions as a highly-natural TTS system that can replicate a reference voice.
1602
+
1603
+ ```python
1604
+ import librosa
1605
+
1606
+ model = ...
1607
+ model.init_tts()
1608
+
1609
+ # For both Chinese and English
1610
+ ref_audio_path = "assets/HT_ref_audio.wav"
1611
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
1612
+ sys_msg = {"role": "system", "content": [
1613
+ "模仿音频样本的音色并生成新的内容。",
1614
+ ref_audio,
1615
+ "请用这种声音风格来为用户提供帮助。 直接作答,不要有冗余内容"
1616
+ ]}
1617
+
1618
+ # For English
1619
+ user_msg = {
1620
+ "role": "user",
1621
+ "content": [
1622
+ "请朗读以下内容。" + " " + "I have a wrap up that I want to offer you now, a conclusion to our work together."
1623
+ ]
1624
+ }
1625
+
1626
+ # For Chinese
1627
+ user_msg = {
1628
+ "role": "user",
1629
+ "content": [
1630
+ "请朗读以下内容。" + " " + "你好,欢迎来到艾米说科幻,我是艾米。"
1631
+ ]
1632
+ }
1633
+
1634
+ msgs = [sys_msg, user_msg]
1635
+ res = model.chat(
1636
+ msgs=msgs,
1637
+ do_sample=True,
1638
+ max_new_tokens=512,
1639
+ use_tts_template=True,
1640
+ generate_audio=True,
1641
+ temperature=0.1,
1642
+ output_audio_path="result_voice_cloning.wav",
1643
+ )
1644
+ ```
1645
+
1646
+ </details>
1647
+
1648
+
1649
+ #### Mimick <!-- omit in toc -->
1650
+
1651
+ <details>
1652
+ <summary>Click to show mimick code.</summary>
1653
+
1654
+ The `Mimick` task evaluates a model's end-to-end speech modeling capability. The model takes audio input, transcribes it, and reconstructs the original audio with high fidelity, preserving detailed acoustic, paralinguistic, and semantic information. Higher similarity between the reconstructed and original audio indicates stronger end-to-end speech modeling capability.
1655
+
1656
+ ```python
1657
+ import librosa
1658
+
1659
+ model = ...
1660
+ model.init_tts()
1661
+
1662
+ system_prompt = "You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy."
1663
+
1664
+ mimick_prompt = "Please repeat the following speech in the appropriate language."
1665
+
1666
+ audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
1667
+
1668
+ msgs = [
1669
+ {"role": "system", "content": [system_prompt]},
1670
+ {"role": "user", "content": [mimick_prompt, audio_input]}
1671
+ ]
1672
+
1673
+ res = model.chat(
1674
+ msgs=msgs,
1675
+ do_sample=True,
1676
+ max_new_tokens=512,
1677
+ use_tts_template=True,
1678
+ temperature=0.1,
1679
+ generate_audio=True,
1680
+ output_audio_path="output_mimick.wav",
1681
+ )
1682
+ ```
1683
+
1684
+ </details>
1685
+
1686
+
1687
+ #### Addressing Various Audio Understanding Tasks <!-- omit in toc -->
1688
+
1689
+
1690
+ <details>
1691
+ <summary>Click to show audio understanding code.</summary>
1692
+
1693
+ `MiniCPM-o-4.5` can also handle various audio understanding tasks, such as ASR, speaker analysis, general audio captioning, and sound scene tagging.
1694
+
1695
+ For audio-to-text tasks, you can use the following prompts:
1696
+
1697
+ - ASR (Chinese, or AST EN→ZH): `请仔细听这段音频片段,并将其内容逐字记录。`
1698
+ - ASR (English, or AST ZH→EN): `Please listen to the audio snippet carefully and transcribe the content.`
1699
+ - Speaker Analysis: `Based on the speaker's content, speculate on their gender, condition, age range, and health status.`
1700
+ - General Audio Caption: `Summarize the main content of the audio.`
1701
+ - Sound Scene Tagging: `Utilize one keyword to convey the audio's content or the associated scene.`
1702
+
1703
+ ```python
1704
+ import librosa
1705
+
1706
+ model = ...
1707
+ model.init_tts()
1708
+
1709
+ # Load the audio to be transcribed/analyzed
1710
+ audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
1711
+
1712
+ # Choose a task prompt (see above for options)
1713
+ task_prompt = "Please listen to the audio snippet carefully and transcribe the content.\n"
1714
+ msgs = [{"role": "user", "content": [task_prompt, audio_input]}]
1715
+
1716
+ res = model.chat(
1717
+ msgs=msgs,
1718
+ do_sample=True,
1719
+ max_new_tokens=512,
1720
+ use_tts_template=True,
1721
+ generate_audio=True,
1722
+ temperature=0.3,
1723
+ output_audio_path="result_audio_understanding.wav",
1724
+ )
1725
+ print(res)
1726
+ ```
1727
+
1728
+ </details>
1729
+
1730
+
1731
+ ### Visual Understanding <!-- omit in toc -->
1732
+
1733
+ `MiniCPM-o-4.5` shares the same inference methods as `MiniCPM-V-4.5`.
1734
+
1735
+ #### Chat with Single Image <!-- omit in toc -->
1736
+
1737
+ <details>
1738
+ <summary>Click to show single image chat code.</summary>
1739
+
1740
+ ```python
1741
+ import torch
1742
+ from PIL import Image
1743
+ from transformers import AutoModel
1744
+
1745
+ model = AutoModel.from_pretrained(
1746
+ "openbmb/MiniCPM-o-4_5",
1747
+ trust_remote_code=True,
1748
+ attn_implementation="sdpa", # or "flash_attention_2"
1749
+ torch_dtype=torch.bfloat16,
1750
+ init_vision=True,
1751
+ init_audio=False,
1752
+ init_tts=False,
1753
+ )
1754
+ model.eval().cuda()
1755
+
1756
+ image = Image.open("assets/fossil.png").convert("RGB")
1757
+ question = "What is in the image?"
1758
+ msgs = [{"role": "user", "content": [image, question]}]
1759
+
1760
+ res = model.chat(msgs=msgs, use_tts_template=False)
1761
+ print(res)
1762
+ ```
1763
+
1764
+ </details>
1765
+
1766
+ #### Chat with Multiple Images <!-- omit in toc -->
1767
+
1768
+ <details>
1769
+ <summary>Click to show Python code for multi-image input.</summary>
1770
+
1771
+ ```python
1772
+ import torch
1773
+ from PIL import Image
1774
+ from transformers import AutoModel
1775
+
1776
+ model = ...
1777
+
1778
+ image1 = Image.open("assets/highway.png").convert("RGB")
1779
+ image2 = Image.open("assets/fossil.png").convert("RGB")
1780
+ question = "Compare image 1 and image 2, tell me about the differences between them."
1781
+ msgs = [{"role": "user", "content": [image1, image2, question]}]
1782
+
1783
+ answer = model.chat(msgs=msgs, use_tts_template=False, enable_thinking=False)
1784
+ print(answer)
1785
+ ```
1786
+
1787
+ </details>
1788
+
1789
+ #### In-Context Few-Shot Learning <!-- omit in toc -->
1790
+
1791
+ <details>
1792
+ <summary>Click to show Python code for few-shot learning.</summary>
1793
+
1794
+ ```python
1795
+ from PIL import Image
1796
+
1797
+ model = ...
1798
+
1799
+ question = "production date"
1800
+ image1 = Image.open("example1.jpg").convert("RGB")
1801
+ answer1 = "2023.08.04"
1802
+ image2 = Image.open("example2.jpg").convert("RGB")
1803
+ answer2 = "2007.04.24"
1804
+ image_test = Image.open("test.jpg").convert("RGB")
1805
+
1806
+ msgs = [
1807
+ {"role": "user", "content": [image1, question]},
1808
+ {"role": "assistant", "content": [answer1]},
1809
+ {"role": "user", "content": [image2, question]},
1810
+ {"role": "assistant", "content": [answer2]},
1811
+ {"role": "user", "content": [image_test, question]},
1812
+ ]
1813
+
1814
+ answer = model.chat(msgs=msgs, use_tts_template=False, enable_thinking=False)
1815
+ print(answer)
1816
+ ```
1817
+
1818
+ </details>
1819
+
1820
+ #### Chat with Video <!-- omit in toc -->
1821
+
1822
+ <details>
1823
+ <summary>Click to show Python code for video input.</summary>
1824
+
1825
+ ```python
1826
+ import torch
1827
+ from minicpmo.utils import get_video_frame_audio_segments
1828
+ from transformers import AutoModel
1829
+
1830
+ model = ...
1831
+
1832
+ video_path = "assets/Skiing.mp4"
1833
+ video_frames, _, _ = get_video_frame_audio_segments(video_path)
1834
+ print("num frames:", len(video_frames))
1835
+
1836
+ question = "Describe the video"
1837
+ msgs = [{"role": "user", "content": video_frames + [question]}]
1838
+
1839
+ answer = model.chat(
1840
+ msgs=msgs,
1841
+ max_new_tokens=128,
1842
+ use_image_id=False,
1843
+ max_slice_nums=1,
1844
+ use_tts_template=False,
1845
+ enable_thinking=False, # Set True to enable thinking mode
1846
+ )
1847
+ print(answer)
1848
+ ```
1849
+
1850
+ </details>
1851
+
1852
+
1853
+ ### Structured Content Input <!-- omit in toc -->
1854
+
1855
+ <details>
1856
+ <summary>Click to show structured content input details.</summary>
1857
+
1858
+ The `chat` method accepts message content in two formats:
1859
+
1860
+ **Native format** – pass Python objects directly:
1861
+ ```python
1862
+ msgs = [{"role": "user", "content": [pil_image, audio_ndarray, "Describe this."]}]
1863
+ ```
1864
+
1865
+ **OpenAI-compatible format** – use structured dictionaries:
1866
+ ```python
1867
+ msgs = [
1868
+ {
1869
+ "role": "user",
1870
+ "content": [
1871
+ {"type": "image_url", "image_url": {"url": "/path/to/image.jpg"}},
1872
+ {"type": "audio_url", "audio_url": {"url": "/path/to/audio.wav"}},
1873
+ {"type": "video_url", "video_url": {"url": "/path/to/video.mp4", "use_audio": True}},
1874
+ {"type": "text", "text": "Describe this."}
1875
+ ]
1876
+ }
1877
+ ]
1878
+ ```
1879
+
1880
+ **Supported types:**
1881
+
1882
+ | Type | Input | Converts to |
1883
+ |------|-------|-------------|
1884
+ | `text` | `{"type": "text", "text": "..."}` | `str` |
1885
+ | `image_url` | `{"type": "image_url", "image_url": {"url": "..."}}` | `PIL.Image` |
1886
+ | `audio_url` | `{"type": "audio_url", "audio_url": {"url": "..."}}` | `np.ndarray` (16kHz mono) |
1887
+ | `video_url` | `{"type": "video_url", "video_url": {"url": "...", "stack_frames": 1, "use_audio": True}}` | `List[Image, ndarray, ...]` |
1888
+
1889
+ - **URL sources**: local file paths or `http://`/`https://` URLs
1890
+ - **Mixed formats**: native objects and structured dicts can be combined in the same content list
1891
+
1892
+ </details>
1893
+
1894
+
1895
+ ## Deploy a Realtime Web Demo on Your Own Device
1896
+
1897
+ ### Option A (Recommended): **PyTorch Inference with Nvidia GPU** for 100% model precision with no deductions in performance.
1898
+
1899
+ We provide a PyTorch-based [simplified yet fully functional web demo](https://github.com/OpenBMB/minicpm-o-4_5-pytorch-simple-demo) that boosts model inference performance and supports:
1900
+
1901
+ - full-duplex omnimodal live streaming
1902
+ - full-duplex speech live streaming
1903
+ - half-duplex speech live streaming (under development)
1904
+ - turn-based chat conversation
1905
+ - customizable system prompts
1906
+ - customizable reference audio
1907
+ - simple and readable codebase for continual development
1908
+ - serve as API backend for third-party applications
1909
+
1910
+ Requirements:
1911
+ - Nvidia GPU with at least 28GB GPU memory. *We are working on optimizing the model for lower GPU memory usage.*
1912
+
1913
+ ### Option B: **llama.cpp-omni** for end-side inference with PCs like Mac and low-resource devices.
1914
+
1915
+ With a full C++ implementation of `MiniCPM-o 4.5` and quantized weights, `llama.cpp-omni` supports:
1916
+ - half-duplex speech realtime conversation
1917
+ - full-duplex omnimodal live streaming
1918
+
1919
+ We provide [ready-to-run guidance](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/web_demo/WebRTC_Demo/README.md) to access the low-latency full-duplex communication directly on your own Mac using our new official Docker image.
1920
+
1921
+ Requirements:
1922
+ - For half-duplex speech realtime conversation: Apple M3/M4/M5 chip with at least 16GB RAM or low-resource Nvidia GPU with at least 12GB GPU memory
1923
+ - For full-duplex omnimodal live streaming: Apple M4 Max chip with at least 24GB RAM or low-resource Nvidia GPU with at least 12GB GPU memory
1924
+
1925
+ ## Supported Frameworks
1926
+
1927
+ ### FlagOS
1928
+
1929
+ To enable large-scale deployment across different AI chips, Beijing Zhiyuan Research Institute, together with numerous research institutions, chip manufacturers, system vendors, and algorithm and software organizations both domestically and internationally, jointly initiated and established the FlagOS Open Source Community.
1930
+
1931
+ The FlagOS community is dedicated to building a unified, open-source system software stack for various AI chips, encompassing core open-source projects such as a large-scale operator library, a unified AI compiler, parallel training and inference frameworks, and a unified communication library. It aims to create an open technology ecosystem connecting the "model-system-chip" layers. By enabling "develop once, deploy across chips", FlagOS unlocks the computational potential of hardware, breaks down the ecosystem silos between different chip software stacks, and effectively reduces migration costs for developers. The FlagOS community fosters an AI hardware and software ecosystem, overcomes single-vendor closed-source monopolies, promotes widespread deployment of AI hardware technologies, and is committed to being rooted in China while embracing global collaboration.
1932
+ Official website: https://flagos.io.
1933
+
1934
+ <details>
1935
+ <summary>Click to show FlagOS details.</summary>
1936
+
1937
+ #### FlagOS: Supporting Multiple AI Chips <!-- omit in toc -->
1938
+
1939
+ Thanks to FlagOS's unified multi-chip AI system software stack, MiniCPM-o 4.5 was adapted to 6 different AI chips in an extremely short time. Currently, the multi-chip version of MiniCPM-o 4.5 has been released on FlagRelease, FlagOS's platform for automatic migration, adaptation, and deployment of large models across multi-architecture AI chips. Details are as follows:
1940
+
1941
+ | Vendor | ModelScope | Huggingface |
1942
+ |:----------------|:------------:|:------------:|
1943
+ | Nvidia | [MiniCPM-o-4.5-nvidia-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) | [MiniCPM-o-4.5-nvidia-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |
1944
+ | Hygon-BW1000 | [MiniCPM-o-4.5-hygon-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) | [MiniCPM-o-4.5-hygon-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) |
1945
+ | Metax-C550 | [MiniCPM-o-4.5-metax-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) | [MiniCPM-o-4.5-metax-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) |
1946
+ | Iluvatar-BIV150 | [MiniCPM-o-4.5-iluvatar-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) | [MiniCPM-o-4.5-iluvatar-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) |
1947
+ | Ascend-A3 | [MiniCPM-o-4.5-ascend-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) | [MiniCPM-o-4.5-ascend-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) |
1948
+ | Zhenwu-810E | [MiniCPM-o-4.5-zhenwu-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) | [MiniCPM-o-4.5-zhenwu-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) |
1949
+
1950
+ ##### Comprehensive Evaluation <!-- omit in toc -->
1951
+
1952
+ ###### Transformers-FlagOS version <!-- omit in toc -->
1953
+
1954
+ Accuracy Difference between `USE_FLAGOS=1` on multi-backend and `USE_FLAGOS=0` on Nvidia-CUDA
1955
+
1956
+ | Metrics | FlagOS Backend | Difference with Nvidia-CUDA |
1957
+ |:-------------------------|:---------------:|:---------------------------:|
1958
+ | Video-MME 0-shot avg@1 ↑ | Nvidia | 0.33% |
1959
+ | Video-MME 0-shot avg@1 ↑ | Hygon-BW1000 | 0.17% |
1960
+ | Video-MME 0-shot avg@1 ↑ | Ascend-A3 | 0.50% |
1961
+ | Video-MME 0-shot avg@1 ↑ | Iluvatar-BIV150 | 1.83% |
1962
+ | Video-MME 0-shot avg@1 ↑ | Metax-C550 | 0.75% |
1963
+
1964
+
1965
+ ###### VLLM-FlagOS version <!-- omit in toc -->
1966
+
1967
+ Accuracy Difference between `USE_FLAGGEMS=1 FLAGCX_PATH=/workspace/FlagCX` on Nvidia or `USE_FLAGGEMS=1` on Zhenwu-810E, and launching vllm server directly on Nvidia
1968
+
1969
+ | Metrics (avg@1) | Difference between Nvidia-FlagOS and Nvidia-CUDA | Difference between Zhenwu-FlagOS and Nvidia-CUDA |
1970
+ |:--------------------|:------------------------------------------------:|:------------------------------------------------:|
1971
+ | CMMMU ↑ | 0.72% | 3.5% |
1972
+ | MMMU ↑ | 1.44% | 1.18% |
1973
+ | MMMU_Pro_standard ↑ | 0.83% | 0.22% |
1974
+ | MM-Vet v2 ↑ | 0.46% | 1.33% |
1975
+ | OCRBench ↑ | 0.10% | 1% |
1976
+ | CII-Bench ↑ | 0.40% | 0.13% |
1977
+ | Blink ↑ | 1.90% | 2.19% |
1978
+
1979
+ #### FlagOS Usage <!-- omit in toc -->
1980
+
1981
+ ##### FlagOS Performance Acceleration on Nvidia <!-- omit in toc -->
1982
+
1983
+ For the Transformers version, with precision aligned between the CUDA and FlagOS ecosystems, FlagOS achieves a 6% improvement in total task execution time compared to CUDA.
1984
+
1985
+ ###### From FlagRelease【Recommendation】 <!-- omit in toc -->
1986
+
1987
+ FlagRelease is a platform developed by the FlagOS team for automatic migration, adaptation, and deployment of large models across multi-architecture AI chips. The multi-chip version of MiniCPM-o 4.5 has already been released on FlagRelease. All necessary software packages are pre-installed on the platform, so users do not need to install anything.
1988
+
1989
+ - FlagRelease Image Key Versions
1990
+
1991
+ | Component | Version |
1992
+ |:------------------------|:------------------------------------|
1993
+ | Accelerator Card Driver | 570.158.01 |
1994
+ | CUDA SDK Build | cuda_13.0.r13.0/compiler.36424714_0 |
1995
+ | FlagTree | 0.4.0+3.5 |
1996
+ | FlagGems | 4.2.1rc0 |
1997
+ | vllm & vllm-plugin-fl | 0.13.0 + vllm_fl 0.0.0 |
1998
+ | FlagCX | 0.1.0 |
1999
+
2000
+ - FlagRelease Quick Start
2001
+
2002
+ | Vendor | ModelScope | Huggingface |
2003
+ |:-----------|:------------:|:------------:|
2004
+ | Nvidia | [MiniCPM-o-4.5-nvidia-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) | [MiniCPM-o-4.5-nvidia-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |
2005
+ | Hygon-BW1000 | [MiniCPM-o-4.5-hygon-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) | [MiniCPM-o-4.5-hygon-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) |
2006
+ | Metax-C550 | [MiniCPM-o-4.5-metax-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) | [MiniCPM-o-4.5-metax-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) |
2007
+ | Iluvatar-BIV150 | [MiniCPM-o-4.5-iluvatar-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) | [MiniCPM-o-4.5-iluvatar-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) |
2008
+ | Ascend-A3 | [MiniCPM-o-4.5-ascend-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) | [MiniCPM-o-4.5-ascend-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) |
2009
+ | Zhenwu-810E | [MiniCPM-o-4.5-zhenwu-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) | [MiniCPM-o-4.5-zhenwu-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) |
2010
+
2011
+
2012
+ ##### From Scratch <!-- omit in toc -->
2013
+
2014
+ - Dependencies: Python 3.12, GLIBC 2.39, GLIBCXX 3.4.33, CXXABI 1.3.15
2015
+
2016
+ ###### Transformers <!-- omit in toc -->
2017
+
2018
+ - Installing the FlagOS Operator Library
2019
+
2020
+ Official Repository: https://github.com/flagos-ai/FlagGems
2021
+
2022
+ ```shell
2023
+ pip install flag-gems==4.2.1rc0
2024
+ ```
2025
+
2026
+ - Installing the FlagOS Compiler
2027
+
2028
+ Official Repository: https://github.com/flagos-ai/flagtree
2029
+
2030
+ Quick Reference for Core Dependency Versions: https://github.com/flagos-ai/FlagTree/blob/main/documents/build.md#tips-for-building
2031
+
2032
+ ```shell
2033
+ pip uninstall triton
2034
+
2035
+ python3 -m pip install flagtree==0.4.0+3.5 --index-url=https://resource.flagos.net/repository/flagos-pypi-hosted/simple --trusted-host=https://resource.flagos.net
2036
+ ```
2037
+
2038
+ - Activating Acceleration
2039
+
2040
+ Add `USE_FLAGOS=1` before the command for the task you want to run. For example, when you run:
2041
+ ```shell
2042
+ python3 generate_speech_from_video.py
2043
+ ```
2044
+
2045
+ To use the MiniCPM-o-4.5 model to generate spoken responses from video content, you can:
2046
+
2047
+ ```shell
2048
+ USE_FLAGOS=1 python3 generate_speech_from_video.py
2049
+ ```
2050
+
2051
+ to accelerate this process with FlagOS.
2052
+
2053
+ ###### vLLM Version <!-- omit in toc -->
2054
+
2055
+ - Installing the FlagOS Operator Library
2056
+
2057
+ Official Repository: https://github.com/flagos-ai/FlagGems
2058
+
2059
+ ```shell
2060
+ pip install flag-gems==4.2.1rc0
2061
+ pip install triton==3.5.1
2062
+ ```
2063
+
2064
+ - Activating Acceleration
2065
+
2066
+ Add `USE_FLAGOS=1` before the command for the task you want to run. For example, when you run:
2067
+ ```shell
2068
+ vllm serve ${model_path} --dtype auto --gpu_memory_utilization 0.9 --trust-remote-code --max-num-batched-tokens 2048 --served-model-name cpmo --port ${Port}
2069
+ ```
2070
+
2071
+ To start the MiniCPM-o-4.5 server, you can:
2072
+ ```shell
2073
+ USE_FLAGOS=1 vllm serve ${model_path} --dtype auto --gpu_memory_utilization 0.9 --trust-remote-code --max-num-batched-tokens 2048 --served-model-name cpmo --port ${Port}
2074
+ ```
2075
+ to accelerate this process with FlagOS.
2076
+
2077
+ #### Using FlagOS Unified Multi-Chip Backend Plugin <!-- omit in toc -->
2078
+
2079
+ [vllm-plugin-FL](https://github.com/flagos-ai/vllm-plugin-FL) is a plugin built for the vLLM inference/service framework. Developed on top of FlagOS's unified multi-chip backend, it is designed to extend vLLM's capabilities and performance across a variety of hardware environments.
2080
+
2081
+ ##### Using vllm-plugin-FL <!-- omit in toc -->
2082
+
2083
+ | Vendor | From Scratch | From FlagRelease |
2084
+ |:-------|:-------------|:----------------|
2085
+ | Nvidia | [vllm-plugin-FL/MiniCPM-o-4.5](https://github.com/flagos-ai/vllm-plugin-FL/blob/main/examples/minicpm/README.md) | [MiniCPM-o-4.5-ModelScope](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS), [MiniCPM-o-4.5-HuggingFace](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |
2086
+
2087
+ </details>
2088
+
2089
+ ### vLLM, SGLang, llama.cpp, Ollama
2090
+
2091
+ We support inference with vLLM, SGLang, llama.cpp and Ollama. Refer to our [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-Cookbook) for more details.
2092
+
2093
+ ### LLaMA-Factory, SWIFT
2094
+
2095
+ We support fine-tuning with LLaMA-Factory, SWIFT. Refer to our [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-Cookbook) for more details.
2096
+
2097
+ ## MiniCPM-V & o Cookbook
2098
+
2099
+ Discover comprehensive, ready-to-deploy solutions for the MiniCPM-V and MiniCPM-o model series in our structured [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-CookBook), which empowers developers to rapidly implement multimodal AI applications with integrated vision, speech, and live-streaming capabilities. Key features include:
2100
+
2101
+ **Easy Usage Documentation**
2102
+
2103
+ Our comprehensive [documentation website](https://minicpm-o.readthedocs.io/en/latest/index.html) presents every recipe in a clear, well-organized manner.
2104
+ All features are displayed at a glance, making it easy for you to quickly find exactly what you need.
2105
+
2106
+ **Broad User Spectrum**
2107
+
2108
+ We support a wide range of users, from individuals to enterprises and researchers.
2109
+
2110
+ * **Individuals**: Enjoy effortless inference using Ollama ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/ollama/minicpm-v4_ollama.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/ollama/minicpm-o4_5_ollama.md)) and Llama.cpp ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/llama.cpp/minicpm-v4_llamacpp.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/llama.cpp/minicpm-o4_5_llamacpp.md)) with minimal setup.
2111
+ * **Enterprises**: Achieve high-throughput, scalable performance with vLLM ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/vllm/minicpm-v4_vllm.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/vllm/minicpm-o4_5_vllm.md)) and SGLang ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/sglang/MiniCPM-v4_sglang.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/sglang/MiniCPM-o4_5_sglang.md)).
2112
+ * **Researchers**: Leverage advanced frameworks including [Transformers](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/finetune_full.md), [LLaMA-Factory](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/finetune_llamafactory.md), [SWIFT](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/swift.md), and [Align-anything](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/align_anything.md) to enable flexible model development and cutting-edge experimentation.
2113
+
2114
+ **Versatile Deployment Scenarios**
2115
+
2116
+ Our ecosystem delivers optimal solutions for a variety of hardware environments and deployment demands.
2117
+
2118
+ * **Web Demo**: Full-duplex real-time video interaction solution with high responsiveness and low latency. [WebRTC_Demo](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/web_demo/WebRTC_Demo/README.md).
2119
+ * **Quantized deployment**: Maximize efficiency and minimize resource consumption using [GGUF](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/quantization/gguf/minicpm-v4_gguf_quantize.md) and [BNB](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/quantization/bnb/minicpm-v4_bnb_quantize.md).
2120
+ * **End devices**: Bring powerful AI experiences to [iPhone and iPad](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/ios_demo/ios.md), supporting offline and privacy-sensitive applications.
2121
+
2122
+ ## License
2123
+ #### Model License
2124
+ * The MiniCPM-o/V model weights and code are open-sourced under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM-V/blob/main/LICENSE) license.
2125
+
2126
+ #### Statement
2127
+ * As MLLMs, MiniCPM-o/V models generate content by learning a large number of multimodal corpora, but they cannot comprehend, express personal opinions, or make value judgements. Anything generated by MiniCPM-o/V models does not represent the views and positions of the model developers.
2128
+ * We will not be liable for any problems arising from the use of MiniCPM-o/V models, including but not limited to data security issues, risks of public opinion, or any risks and problems arising from the misdirection, misuse, or dissemination of the model.
2129
+
2130
+
2131
+ ## Key Techniques and Other Multimodal Projects <!-- omit in toc -->
2132
+
2133
+ 👏 Welcome to explore key techniques of MiniCPM-o/V and other multimodal projects of our team:
2134
+
2135
+ [VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLPR](https://github.com/OpenBMB/RLPR) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V)
2136
+
2137
+
2138
+ ## Citation <!-- omit in toc -->
2139
+
2140
+ If you find our model/code/paper helpful, please consider citing our papers 📝 and starring us ⭐️!
2141
+
2142
+ ```bib
2143
+ @article{yao2024minicpm,
2144
+ title={MiniCPM-V: A GPT-4V Level MLLM on Your Phone},
2145
+ author={Yao, Yuan and Yu, Tianyu and Zhang, Ao and Wang, Chongyi and Cui, Junbo and Zhu, Hongji and Cai, Tianchi and Li, Haoyu and Zhao, Weilin and He, Zhihui and others},
2146
+ journal={arXiv preprint arXiv:2408.01800},
2147
+ year={2024}
2148
+ }
2149
+ ```
added_tokens.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</answer>": 151686,
3
+ "</box>": 151674,
4
+ "</focus>": 151688,
5
+ "</image>": 151670,
6
+ "</image_id>": 151682,
7
+ "</image_save_to>": 151696,
8
+ "</line>": 151690,
9
+ "</perception>": 151692,
10
+ "</point>": 151678,
11
+ "</quad>": 151676,
12
+ "</ref>": 151672,
13
+ "</slice>": 151680,
14
+ "</source_image>": 151694,
15
+ "</think>": 151668,
16
+ "</tool_call>": 151658,
17
+ "</tool_response>": 151666,
18
+ "</unit>": 151684,
19
+ "<answer>": 151685,
20
+ "<box>": 151673,
21
+ "<focus>": 151687,
22
+ "<image>": 151669,
23
+ "<image_id>": 151681,
24
+ "<image_save_to>": 151695,
25
+ "<line>": 151689,
26
+ "<perception>": 151691,
27
+ "<point>": 151677,
28
+ "<quad>": 151675,
29
+ "<ref>": 151671,
30
+ "<slice>": 151679,
31
+ "<source_image>": 151693,
32
+ "<think>": 151667,
33
+ "<tool_call>": 151657,
34
+ "<tool_response>": 151665,
35
+ "<unit>": 151683,
36
+ "<|audio_end|>": 151699,
37
+ "<|audio_start|>": 151697,
38
+ "<|audio|>": 151698,
39
+ "<|box_end|>": 151649,
40
+ "<|box_start|>": 151648,
41
+ "<|emotion_end|>": 151711,
42
+ "<|emotion_start|>": 151710,
43
+ "<|endoftext|>": 151643,
44
+ "<|file_sep|>": 151664,
45
+ "<|fim_middle|>": 151660,
46
+ "<|fim_pad|>": 151662,
47
+ "<|fim_prefix|>": 151659,
48
+ "<|fim_suffix|>": 151661,
49
+ "<|im_end|>": 151645,
50
+ "<|im_start|>": 151644,
51
+ "<|image_pad|>": 151655,
52
+ "<|interrupt|>": 151707,
53
+ "<|listen|>": 151705,
54
+ "<|object_ref_end|>": 151647,
55
+ "<|object_ref_start|>": 151646,
56
+ "<|pitch_end|>": 151715,
57
+ "<|pitch_start|>": 151714,
58
+ "<|quad_end|>": 151651,
59
+ "<|quad_start|>": 151650,
60
+ "<|repo_name|>": 151663,
61
+ "<|speak|>": 151706,
62
+ "<|speed_end|>": 151713,
63
+ "<|speed_start|>": 151712,
64
+ "<|spk_bos|>": 151700,
65
+ "<|spk_eos|>": 151702,
66
+ "<|spk|>": 151701,
67
+ "<|turn_bos|>": 151716,
68
+ "<|timbre_10|>": 151726,
69
+ "<|timbre_11|>": 151727,
70
+ "<|timbre_12|>": 151728,
71
+ "<|timbre_13|>": 151729,
72
+ "<|timbre_14|>": 151730,
73
+ "<|timbre_15|>": 151731,
74
+ "<|timbre_16|>": 151732,
75
+ "<|timbre_17|>": 151733,
76
+ "<|timbre_18|>": 151734,
77
+ "<|timbre_19|>": 151735,
78
+ "<|turn_eos|>": 151717,
79
+ "<|timbre_20|>": 151736,
80
+ "<|timbre_21|>": 151737,
81
+ "<|timbre_22|>": 151738,
82
+ "<|timbre_23|>": 151739,
83
+ "<|timbre_24|>": 151740,
84
+ "<|timbre_25|>": 151741,
85
+ "<|timbre_26|>": 151742,
86
+ "<|timbre_27|>": 151743,
87
+ "<|timbre_28|>": 151744,
88
+ "<|timbre_29|>": 151745,
89
+ "<|chunk_eos|>": 151718,
90
+ "<|timbre_30|>": 151746,
91
+ "<|timbre_31|>": 151747,
92
+ "<|chunk_bos|>": 151719,
93
+ "<|chunk_tts_bos|>": 151720,
94
+ "<|chunk_tts_eos|>": 151721,
95
+ "<|tts_pad|>": 151722,
96
+ "<|timbre_7|>": 151723,
97
+ "<|timbre_8|>": 151724,
98
+ "<|timbre_9|>": 151725,
99
+ "<|tts_bos|>": 151703,
100
+ "<|tts_eos|>": 151704,
101
+ "<|vad_end|>": 151709,
102
+ "<|vad_start|>": 151708,
103
+ "<|video_pad|>": 151656,
104
+ "<|vision_end|>": 151653,
105
+ "<|vision_pad|>": 151654,
106
+ "<|vision_start|>": 151652
107
+ }
assets/HT_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb8f06ba5080cdf548969138881fb8ad8b04e2516108f4e08ba0363b68b613ea
3
+ size 192590
assets/Skiing.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:479ace116d6ac92487ad90f415b3ef817cd019bba4521043ef0d5faaa1a8415d
3
+ size 8534409
assets/Trump_WEF_2018_10s.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fb796c2bb95538eab22d9b68a31add560305b6d5ccbf150c3f96e7671b6db64
3
+ size 161053
assets/audio_cases/assistant_ref.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e4a56e4418740ee0326f11ca2bd61a54d84e7d23e86a5e030369ee89f8e8390
3
+ size 65478
assets/audio_cases/assistant_response.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46268e3beb78b4287a896a09b2df0f895629ecbdcdc4cf3c519fe040084860f
3
+ size 269504
assets/audio_cases/elon_musk__000_assistant_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff6dced2e686b743f39eee36aef1410913c8d31459bb564dbbf8f81ef8a2da88
3
+ size 1762604
assets/audio_cases/elon_musk__system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c4109b2d685e1923ed66433eb08c92047a1f67510629a27edf49af4e5c606dd
3
+ size 539032
assets/audio_cases/elon_musk_ref.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:206c48ec4b08898fbfbfb1ea1fec058750c39c97cd2e5088b15b61fb194d6f7e
3
+ size 165179
assets/audio_cases/elon_musk_response.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d83c3a977fd444351710c9ba2c347c90efd49ed990276c5309867374df373a4f
3
+ size 388534
assets/audio_cases/hermione__000_assistant_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfd18a2ee9e27fdcd65e2d9bf0fe0b3c9ebc28cda6c43279c12e077889c2b88d
3
+ size 1534124
assets/audio_cases/hermione__system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46bd82796ce5ebc15b29bc21c1de38284cfa972b2f27cca1f17086516a757dee
3
+ size 197322
assets/audio_cases/minicpm_assistant__000_assistant_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe3b793a6436682d1e684e0f4c158718208ed38926880e01ee84f5aa61c78331
3
+ size 1246124
assets/audio_cases/minicpm_assistant__system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad576b50fd2f53ad4bb317b97ed33bc7027071e7e0811a2a2110d995614f1f42
3
+ size 192556
assets/audio_cases/paimon__000_assistant_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ddd9d8358950a8dab385b8031cb8371a832460f3db4d91e399373705158f346
3
+ size 697004
assets/audio_cases/paimon__system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9625e7115ff1f47cf842fecbb31db82057ff1863c080870539c31107aa93b9d
3
+ size 479304
assets/audio_cases/readme.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ This directory stores audio wav files for hf readme page.
assets/bajie.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16aa8ca3da7dad53680bac68cfde569e11e2f9ef5387b3ae60d68626e94db9d7
3
+ size 636512
assets/fossil.png ADDED

Git LFS Details

  • SHA256: b8b3f1668da6e2b503ecea5a6fd40c1a2a666b4bd07b903ba0728fd2fc9f74fb
  • Pointer size: 131 Bytes
  • Size of remote file: 466 kB
assets/haimianbaobao.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27405cf5977f04d3f248693f9b978654fad4f02f931d9746006bffb0ed7b66e1
3
+ size 343120
assets/highway.png ADDED

Git LFS Details

  • SHA256: 87c32da6ee77730423ec5fc29d7110878ab18a35791eb11a803088976c3b1f76
  • Pointer size: 131 Bytes
  • Size of remote file: 841 kB
assets/nezha.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5d8932013ff30a9d8114a6d62e5342999d8c2ddf8509ecfeb1ead71cacf432
3
+ size 457802
assets/omni_duplex1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31622e1efd9a7b197a340266037b45aeec13b3b27f010f1ea1d22d9c6e69405f
3
+ size 7295040
assets/omni_duplex2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c04eaef27a821e18db3686fdcc9d4b1dafd441e08f6a0e12d4075b81cb04517e
3
+ size 29285216
assets/sunwukong.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bdb6c175bd3b2bb02fe7a8cf0ced847d41ab552b2e318bc3c26ee477255eff7
3
+ size 644650
assets/system_ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c4109b2d685e1923ed66433eb08c92047a1f67510629a27edf49af4e5c606dd
3
+ size 539032
assets/system_ref_audio_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a65d17099239711601cb17e28b7aa7b6149acd153b92fd0366c23902bd4f687
3
+ size 341192
assets/token2wav/campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
assets/token2wav/flow.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15ccff24256ff61537c7f8b51e025116b83405f3fb017b54b008fc97da115446
3
+ size 623466603
assets/token2wav/flow.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flow: !new:cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
2
+ input_size: 512
3
+ output_size: 80
4
+ spk_embed_dim: 192
5
+ output_type: 'mel'
6
+ vocab_size: 6561
7
+ encoder: !new:cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
8
+ input_size: 512
9
+ output_size: 512
10
+ input_layer: 'linear'
11
+ pre_lookahead_len: 3
12
+ num_blocks: 6
13
+ num_up_blocks: 4
14
+ up_stride: 2
15
+ up_scale_factor: 2
16
+ attention_heads: 8
17
+ pos_enc_layer_type: 'rel_pos_espnet'
18
+ selfattention_layer_type: 'rel_selfattn'
19
+ key_bias: true
20
+ linear_units: 2048
21
+ dropout_rate: 0.1
22
+ positional_dropout_rate: 0.1
23
+ attention_dropout_rate: 0.1
24
+ normalize_before: True
25
+ decoder: !new:cosyvoice2.flow.flow_matching.CausalConditionalCFM
26
+ inference_cfg_rate: 0.7
27
+ estimator: !new:cosyvoice2.flow.decoder_dit.DiT
28
+ in_channels: 320
29
+ out_channels: 80
30
+ mlp_ratio: 4.0
31
+ depth: 16
32
+ num_heads: 8
33
+ head_dim: 64
34
+ hidden_size: 512
assets/token2wav/hift.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
3
+ size 83390254
assets/token2wav/speech_tokenizer_v2_25hz.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71
3
+ size 496082973
config.json ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MiniCPMO"
4
+ ],
5
+ "version": "4.5",
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "audio_chunk_length": 1.0,
9
+ "audio_config": {
10
+ "_attn_implementation_autoset": true,
11
+ "_name_or_path": "openai/whisper-medium",
12
+ "activation_dropout": 0.0,
13
+ "activation_function": "gelu",
14
+ "apply_spec_augment": false,
15
+ "architectures": [
16
+ "MiniCPMWhisperEncoder"
17
+ ],
18
+ "attention_dropout": 0.0,
19
+ "begin_suppress_tokens": [
20
+ 220,
21
+ 50257
22
+ ],
23
+ "bos_token_id": 50257,
24
+ "classifier_proj_size": 256,
25
+ "d_model": 1024,
26
+ "decoder_attention_heads": 16,
27
+ "decoder_ffn_dim": 4096,
28
+ "decoder_layerdrop": 0.0,
29
+ "decoder_layers": 24,
30
+ "decoder_start_token_id": 50258,
31
+ "dropout": 0.0,
32
+ "encoder_attention_heads": 16,
33
+ "encoder_ffn_dim": 4096,
34
+ "encoder_layerdrop": 0.0,
35
+ "encoder_layers": 24,
36
+ "eos_token_id": 50257,
37
+ "forced_decoder_ids": [
38
+ [
39
+ 1,
40
+ 50259
41
+ ],
42
+ [
43
+ 2,
44
+ 50359
45
+ ],
46
+ [
47
+ 3,
48
+ 50363
49
+ ]
50
+ ],
51
+ "init_std": 0.02,
52
+ "mask_feature_length": 10,
53
+ "mask_feature_min_masks": 0,
54
+ "mask_feature_prob": 0.0,
55
+ "mask_time_length": 10,
56
+ "mask_time_min_masks": 2,
57
+ "mask_time_prob": 0.05,
58
+ "max_length": 448,
59
+ "max_source_positions": 1500,
60
+ "max_target_positions": 448,
61
+ "median_filter_width": 7,
62
+ "model_type": "whisper",
63
+ "num_hidden_layers": 24,
64
+ "num_mel_bins": 80,
65
+ "pad_token_id": 50257,
66
+ "scale_embedding": false,
67
+ "suppress_tokens": [
68
+ 1,
69
+ 2,
70
+ 7,
71
+ 8,
72
+ 9,
73
+ 10,
74
+ 14,
75
+ 25,
76
+ 26,
77
+ 27,
78
+ 28,
79
+ 29,
80
+ 31,
81
+ 58,
82
+ 59,
83
+ 60,
84
+ 61,
85
+ 62,
86
+ 63,
87
+ 90,
88
+ 91,
89
+ 92,
90
+ 93,
91
+ 359,
92
+ 503,
93
+ 522,
94
+ 542,
95
+ 873,
96
+ 893,
97
+ 902,
98
+ 918,
99
+ 922,
100
+ 931,
101
+ 1350,
102
+ 1853,
103
+ 1982,
104
+ 2460,
105
+ 2627,
106
+ 3246,
107
+ 3253,
108
+ 3268,
109
+ 3536,
110
+ 3846,
111
+ 3961,
112
+ 4183,
113
+ 4667,
114
+ 6585,
115
+ 6647,
116
+ 7273,
117
+ 9061,
118
+ 9383,
119
+ 10428,
120
+ 10929,
121
+ 11938,
122
+ 12033,
123
+ 12331,
124
+ 12562,
125
+ 13793,
126
+ 14157,
127
+ 14635,
128
+ 15265,
129
+ 15618,
130
+ 16553,
131
+ 16604,
132
+ 18362,
133
+ 18956,
134
+ 20075,
135
+ 21675,
136
+ 22520,
137
+ 26130,
138
+ 26161,
139
+ 26435,
140
+ 28279,
141
+ 29464,
142
+ 31650,
143
+ 32302,
144
+ 32470,
145
+ 36865,
146
+ 42863,
147
+ 47425,
148
+ 49870,
149
+ 50254,
150
+ 50258,
151
+ 50358,
152
+ 50359,
153
+ 50360,
154
+ 50361,
155
+ 50362
156
+ ],
157
+ "torch_dtype": "float32",
158
+ "use_cache": true,
159
+ "use_weighted_layer_sum": false,
160
+ "vocab_size": 51865
161
+ },
162
+ "audio_pool_step": 5,
163
+ "auto_map": {
164
+ "AutoConfig": "configuration_minicpmo.MiniCPMOConfig",
165
+ "AutoModel": "modeling_minicpmo.MiniCPMO",
166
+ "AutoModelForCausalLM": "modeling_minicpmo.MiniCPMO"
167
+ },
168
+ "batch_vision_input": true,
169
+ "bos_token_id": 151643,
170
+ "drop_vision_last_layer": false,
171
+ "eos_token_id": 151645,
172
+ "head_dim": 128,
173
+ "hidden_act": "silu",
174
+ "hidden_size": 4096,
175
+ "image_size": 448,
176
+ "init_audio": true,
177
+ "init_tts": true,
178
+ "init_vision": true,
179
+ "initializer_range": 0.02,
180
+ "intermediate_size": 12288,
181
+ "listen_speak_type": "asr",
182
+ "max_position_embeddings": 40960,
183
+ "max_window_layers": 36,
184
+ "model_type": "minicpmo",
185
+ "num_attention_heads": 32,
186
+ "num_hidden_layers": 36,
187
+ "num_key_value_heads": 8,
188
+ "patch_size": 14,
189
+ "query_num": 64,
190
+ "rms_norm_eps": 1e-06,
191
+ "rope_scaling": null,
192
+ "rope_theta": 1000000,
193
+ "slice_config": {
194
+ "max_slice_nums": 1,
195
+ "model_type": "minicpmv",
196
+ "patch_size": 14,
197
+ "scale_resolution": 448
198
+ },
199
+ "slice_mode": true,
200
+ "sliding_window": null,
201
+ "stream_input": true,
202
+ "tie_word_embeddings": false,
203
+ "torch_dtype": "bfloat16",
204
+ "transformers_version": "4.51.0",
205
+ "tts_config": {
206
+ "_attn_implementation_autoset": true,
207
+ "attention_type": "full_attention",
208
+ "attn_implementation": "sdpa",
209
+ "audio_bos_token_id": 151687,
210
+ "audio_tokenizer_sample_rate": 16000,
211
+ "audio_tokenizer_type": "s3tokenizer",
212
+ "aug_layer_loss_weight": false,
213
+ "aug_loss_weight": false,
214
+ "backbone_model": "llama",
215
+ "condition_type": "hidden_text_merge",
216
+ "cosyvoice_config_path": null,
217
+ "cosyvoice_model_dir": null,
218
+ "filter_tts_loss": false,
219
+ "hidden_act": "silu",
220
+ "hidden_size": 768,
221
+ "interleaved": false,
222
+ "intermediate_size": 3072,
223
+ "llm_dim": 4096,
224
+ "llm_dim_model_base": 256,
225
+ "llm_down_scale": false,
226
+ "llm_hidden_size": 4096,
227
+ "llm_intermediate_size": 768,
228
+ "long_weight": 0.1,
229
+ "max_position_embeddings": 4096,
230
+ "model_type": "minicpmtts",
231
+ "normalize_projected_hidden": true,
232
+ "num_attention_heads": 12,
233
+ "num_audio_tokens": 6562,
234
+ "num_hidden_layers": 20,
235
+ "num_key_value_heads": 12,
236
+ "num_mel_bins": 100,
237
+ "num_text_tokens": 152064,
238
+ "num_vq": 1,
239
+ "projector_type": "mlp",
240
+ "recomputed_chunks": 1,
241
+ "s3_stream_chunk_size": 25,
242
+ "s3_stream_generate": false,
243
+ "s3_stream_n_timesteps": 10,
244
+ "s3_stream_prelook_size": 3,
245
+ "short_weight": 0.1,
246
+ "streaming": false,
247
+ "streaming_audio_chunk_size": 50,
248
+ "streaming_sliding_window": false,
249
+ "streaming_sliding_window_audio_frame_rate": 50,
250
+ "streaming_sliding_window_audio_init_text_length": 10,
251
+ "streaming_sliding_window_audio_window_size": 300,
252
+ "streaming_sliding_window_average_speed": 5,
253
+ "streaming_sliding_window_fast_speed": 7,
254
+ "streaming_sliding_window_max_text_len": 500,
255
+ "streaming_sliding_window_slow_speed": 3,
256
+ "streaming_sliding_window_text_window_size": 50,
257
+ "streaming_text_chunk_max": 7,
258
+ "streaming_text_chunk_min": 3,
259
+ "streaming_text_reserved_len": 300,
260
+ "text_eos_token_id": 151692,
261
+ "tts_filter_loss_fix": false,
262
+ "use_llm_hidden_state": false,
263
+ "use_text": true,
264
+ "window_size": 2
265
+ },
266
+ "use_cache": true,
267
+ "use_image_id": true,
268
+ "use_sliding_window": false,
269
+ "vision_batch_size": 16,
270
+ "vision_config": {
271
+ "_attn_implementation_autoset": true,
272
+ "attention_dropout": 0.0,
273
+ "hidden_act": "gelu_pytorch_tanh",
274
+ "hidden_size": 1152,
275
+ "image_size": 980,
276
+ "intermediate_size": 4304,
277
+ "layer_norm_eps": 1e-06,
278
+ "model_type": "siglip_vision_model",
279
+ "num_attention_heads": 16,
280
+ "num_channels": 3,
281
+ "num_hidden_layers": 27,
282
+ "patch_size": 14
283
+ },
284
+ "vocab_size": 151748
285
+ }
configuration_minicpmo.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright 2026 The OpenBMB Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ from typing import Union
19
+
20
+ from transformers import PretrainedConfig
21
+ from transformers import Qwen3Config
22
+ from transformers import WhisperConfig
23
+ from transformers.utils import logging
24
+
25
+ from .modeling_navit_siglip import SiglipVisionConfig
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+
30
+ class MiniCPMVSliceConfig(PretrainedConfig):
31
+ model_type = "minicpmv"
32
+
33
+ def __init__(
34
+ self,
35
+ patch_size=14,
36
+ max_slice_nums=9,
37
+ scale_resolution=448,
38
+ **kwargs,
39
+ ):
40
+ super().__init__(**kwargs)
41
+ self.patch_size = patch_size
42
+ self.max_slice_nums = max_slice_nums
43
+ self.scale_resolution = scale_resolution
44
+
45
+ @classmethod
46
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
47
+ cls._set_token_in_kwargs(kwargs)
48
+
49
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
50
+
51
+ if config_dict.get("model_type") == "minicpmv":
52
+ config_dict = config_dict["slice_config"]
53
+
54
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
55
+ logger.warning(
56
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
57
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
58
+ )
59
+
60
+ return cls.from_dict(config_dict, **kwargs)
61
+
62
+
63
+ class MiniCPMTTSConfig(PretrainedConfig):
64
+ model_type = "minicpmtts"
65
+
66
+ def __init__(
67
+ self,
68
+ llm_dim: int = 2560,
69
+ llm_intermediate_size: int = 768,
70
+ llm_down_scale: bool = False,
71
+ llm_dim_model_base: int = 256,
72
+ projector_type: str = "mlp",
73
+ hidden_act: str = "silu",
74
+ aug_loss_weight: bool = False,
75
+ aug_layer_loss_weight: bool = False,
76
+ filter_tts_loss: bool = False,
77
+ tts_filter_loss_fix: bool = False,
78
+ long_weight: float = 0.1,
79
+ short_weight: float = 0.1,
80
+ hidden_size: int = 768,
81
+ intermediate_size: int = 3072,
82
+ num_attention_heads: int = 12,
83
+ num_hidden_layers: int = 20,
84
+ num_key_value_heads: int = 12,
85
+ max_position_embeddings: int = 4096,
86
+ num_audio_tokens: int = 4097,
87
+ num_text_tokens: int = 21178,
88
+ num_mel_bins: int = 100,
89
+ num_vq: int = 1,
90
+ use_llm_hidden_state: bool = False,
91
+ audio_bos_token_id: int = 21132,
92
+ text_eos_token_id: int = 21133,
93
+ use_text: bool = True,
94
+ streaming: bool = False,
95
+ streaming_text_chunk_min: int = 3,
96
+ streaming_text_chunk_max: int = 7,
97
+ streaming_text_reserved_len: int = 300,
98
+ streaming_audio_chunk_size: int = 50,
99
+ attn_implementation: str = "sdpa",
100
+ condition_type: str = "llm_hidden",
101
+ backbone_model: str = "llama",
102
+ audio_tokenizer_type: str = "wavtokenizer",
103
+ audio_tokenizer_sample_rate: int = 24000,
104
+ streaming_sliding_window: bool = False,
105
+ streaming_sliding_window_max_text_len: int = 500,
106
+ streaming_sliding_window_average_speed: int = 5,
107
+ streaming_sliding_window_fast_speed: int = 7,
108
+ streaming_sliding_window_slow_speed: int = 3,
109
+ streaming_sliding_window_audio_frame_rate: int = 50,
110
+ streaming_sliding_window_audio_init_text_length: int = 10,
111
+ streaming_sliding_window_audio_window_size: int = 300,
112
+ normalize_projected_hidden: bool = False,
113
+ interleaved: bool = False,
114
+ attention_type: str = "sliding_recompute",
115
+ recomputed_chunks: int = 1,
116
+ window_size: int = 2,
117
+ **kwargs,
118
+ ):
119
+ super().__init__(**kwargs)
120
+
121
+ self.llm_dim = llm_dim
122
+ self.llm_hidden_size = llm_dim
123
+ self.llm_intermediate_size = llm_intermediate_size
124
+ self.llm_down_scale = llm_down_scale
125
+ self.llm_dim_model_base = llm_dim_model_base
126
+ self.projector_type = projector_type
127
+ self.aug_loss_weight = aug_loss_weight
128
+ self.aug_layer_loss_weight = aug_layer_loss_weight
129
+ self.tts_filter_loss_fix = tts_filter_loss_fix
130
+ self.filter_tts_loss = filter_tts_loss
131
+ self.long_weight = long_weight
132
+ self.short_weight = short_weight
133
+ self.hidden_act = hidden_act
134
+
135
+ self.hidden_size = hidden_size
136
+ self.intermediate_size = intermediate_size
137
+ self.num_attention_heads = num_attention_heads
138
+ self.num_hidden_layers = num_hidden_layers
139
+ self.num_key_value_heads = num_key_value_heads
140
+ self.max_position_embeddings = max_position_embeddings
141
+ self.num_audio_tokens = num_audio_tokens
142
+ self.num_text_tokens = num_text_tokens
143
+ self.num_mel_bins = num_mel_bins
144
+ self.num_vq = num_vq
145
+ self.use_llm_hidden_state = use_llm_hidden_state
146
+ self.audio_bos_token_id = audio_bos_token_id
147
+ self.text_eos_token_id = text_eos_token_id
148
+ self.use_text = use_text
149
+ self.streaming = streaming
150
+ self.streaming_text_chunk_min = streaming_text_chunk_min
151
+ self.streaming_text_chunk_max = streaming_text_chunk_max
152
+ self.streaming_text_reserved_len = streaming_text_reserved_len
153
+ self.streaming_audio_chunk_size = streaming_audio_chunk_size
154
+ self.attn_implementation = attn_implementation
155
+ self.condition_type = condition_type
156
+ self.backbone_model = backbone_model
157
+ self.audio_tokenizer_type = audio_tokenizer_type
158
+ self.audio_tokenizer_sample_rate = audio_tokenizer_sample_rate
159
+
160
+ self.streaming_sliding_window = streaming_sliding_window
161
+ self.streaming_sliding_window_max_text_len = streaming_sliding_window_max_text_len
162
+ self.streaming_sliding_window_average_speed = streaming_sliding_window_average_speed
163
+ self.streaming_sliding_window_fast_speed = streaming_sliding_window_fast_speed
164
+ self.streaming_sliding_window_slow_speed = streaming_sliding_window_slow_speed
165
+ self.streaming_sliding_window_audio_frame_rate = streaming_sliding_window_audio_frame_rate
166
+ self.streaming_sliding_window_audio_init_text_length = streaming_sliding_window_audio_init_text_length
167
+ self.streaming_sliding_window_audio_window_size = streaming_sliding_window_audio_window_size
168
+
169
+ self.normalize_projected_hidden = normalize_projected_hidden
170
+
171
+ self.interleaved = interleaved
172
+ self.attention_type = attention_type
173
+ self.recomputed_chunks = recomputed_chunks
174
+ self.window_size = window_size
175
+
176
+
177
class MiniCPMOConfig(Qwen3Config):
    """Configuration for the MiniCPM-o omni-modal model.

    Extends the Qwen3 LLM configuration with nested sub-configurations for
    the vision tower (SigLIP), the audio encoder (Whisper) and the TTS head,
    plus options controlling image slicing and audio chunking.
    """

    model_type = "minicpmo"
    # dropped from model outputs at inference time
    keys_to_ignore_at_inference = ["past_key_values"]

    # Fallback SigLIP vision-tower settings used when no ``vision_config``
    # is supplied (same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit).
    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 980,
        "intermediate_size": 4304,
        "model_type": "siglip",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }

    def __init__(
        self,
        use_cache=True,  # enable KV caching in the LLM backbone
        query_num=64,  # number of resampler query tokens per image — TODO confirm against modeling code
        image_size=448,
        drop_vision_last_layer=True,  # presumably drops the final vision encoder layer — verify in modeling code
        batch_vision_input=True,
        slice_config=None,  # dict of MiniCPMVSliceConfig kwargs, or None for max_slice_nums=1
        vision_config=None,  # dict / SiglipVisionConfig instance, or None for default_vision_config
        audio_config=None,  # dict / WhisperConfig instance, or None for WhisperConfig defaults
        tts_config=None,  # dict / MiniCPMTTSConfig instance, or None for defaults
        use_image_id=True,
        vision_batch_size=16,
        audio_pool_step=5,  # assumed pooling stride over audio features — TODO confirm units
        audio_chunk_length=1.0,  # presumably seconds per audio chunk — verify against the processor
        stream_input=False,
        listen_speak_type="asr",
        init_vision=True,  # whether to instantiate the vision branch
        init_audio=True,  # whether to instantiate the audio branch
        init_tts=True,  # whether to instantiate the TTS branch
        **kwargs,
    ):
        self.use_cache = use_cache
        self.query_num = query_num
        self.image_size = image_size
        self.drop_vision_last_layer = drop_vision_last_layer
        self.batch_vision_input = batch_vision_input
        self.use_image_id = use_image_id
        self.vision_batch_size = vision_batch_size
        self.audio_pool_step = audio_pool_step
        self.audio_chunk_length = audio_chunk_length
        self.stream_input = stream_input
        self.listen_speak_type = listen_speak_type

        self.init_vision = init_vision
        self.init_audio = init_audio
        self.init_tts = init_tts

        if slice_config is None:
            self.slice_config = MiniCPMVSliceConfig(max_slice_nums=1)
        else:
            self.slice_config = MiniCPMVSliceConfig(**slice_config)
        # Slicing is unconditionally enabled; slice_config only tunes it.
        self.slice_mode = True

        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
        if vision_config is None:
            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = SiglipVisionConfig(**vision_config)
        elif isinstance(vision_config, SiglipVisionConfig):
            self.vision_config = vision_config
        # NOTE(review): any other type falls through and leaves self.vision_config
        # unset (the same applies to audio_config / tts_config below), which would
        # surface later as an AttributeError — consider raising here instead.

        if audio_config is None:
            self.audio_config = WhisperConfig()
        elif isinstance(audio_config, dict):
            self.audio_config = WhisperConfig(**audio_config)
        elif isinstance(audio_config, WhisperConfig):
            self.audio_config = audio_config

        if tts_config is None:
            self.tts_config = MiniCPMTTSConfig()
        elif isinstance(tts_config, dict):
            self.tts_config = MiniCPMTTSConfig(**tts_config)
        elif isinstance(tts_config, MiniCPMTTSConfig):
            self.tts_config = tts_config

        # Convenience mirror of the vision tower's patch size.
        self.patch_size = self.vision_config.patch_size

        # The Qwen3Config/PretrainedConfig init runs last so the attributes
        # above are already in place when kwargs are processed.
        super().__init__(**kwargs)
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95
12
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30c40b9a10386c1bc404568d8829e5aada2e3501d9de6fb46ff80451aff7e077
3
+ size 5273477136
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe0faef420aca8f771bec1f9fcfaae01206c56b8e64e4429a1ff500c962fbf96
3
+ size 5301855080
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d0b20153f9bfa88ebcd1cb6bbd5b7cac4c217b3cafb137a291a1b380d8b7821
3
+ size 5301855048
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f61addf4747c94fedcaee059e5d9918ed15543beec494404139a99f2f86c9b31
3
+ size 2866549964
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_minicpmo.py ADDED
The diff for this file is too large to render. See raw diff
 
modeling_navit_siglip.py ADDED
@@ -0,0 +1,981 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch Siglip model."""
16
+ # Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
17
+
18
+
19
+ import math
20
+ import os
21
+ import warnings
22
+ from dataclasses import dataclass
23
+ from typing import Optional
24
+ from typing import Tuple
25
+ from typing import Union
26
+
27
+ import numpy as np
28
+ import torch
29
+ import torch.nn.functional as F
30
+ import torch.utils.checkpoint
31
+ from torch import nn
32
+ from torch.nn.init import _calculate_fan_in_and_fan_out
33
+ from transformers.activations import ACT2FN
34
+ from transformers.configuration_utils import PretrainedConfig
35
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
36
+ from transformers.modeling_outputs import BaseModelOutput
37
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.utils import add_start_docstrings
40
+ from transformers.utils import add_start_docstrings_to_model_forward
41
+ from transformers.utils import is_flash_attn_2_available
42
+ from transformers.utils import logging
43
+ from transformers.utils import ModelOutput
44
+ from transformers.utils import replace_return_docstrings
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+
49
class SiglipVisionConfig(PretrainedConfig):
    r"""Stores the configuration of a [`SiglipVisionModel`].

    Instantiating with the defaults yields a configuration similar to the
    vision encoder of
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224).
    Configuration objects inherit from [`PretrainedConfig`]; see its
    documentation for the common options.

    Args:
        hidden_size (`int`, *optional*, defaults to 768): encoder/pooler dimensionality.
        intermediate_size (`int`, *optional*, defaults to 3072): feed-forward layer width.
        num_hidden_layers (`int`, *optional*, defaults to 12): number of encoder layers.
        num_attention_heads (`int`, *optional*, defaults to 12): attention heads per layer.
        num_channels (`int`, *optional*, defaults to 3): input image channels.
        image_size (`int`, *optional*, defaults to 224): input image resolution.
        patch_size (`int`, *optional*, defaults to 16): patch resolution.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            non-linear activation used in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06): layer-norm epsilon.
        attention_dropout (`float`, *optional*, defaults to 0.0): attention dropout ratio.

    Example:
    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> configuration = SiglipVisionConfig()
    >>> model = SiglipVisionModel(configuration)
    >>> configuration = model.config
    ```"""

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Transformer geometry.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        # Image / patch geometry.
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        # Numerics and regularization.
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A full SiglipConfig nests the vision settings under "vision_config".
        if config_dict.get("model_type") == "siglip":
            config_dict = config_dict["vision_config"]

        type_mismatch = (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        )
        if type_mismatch:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
136
+
137
+
138
# Checkpoint name used in generated API docs.
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip
]

# flash-attn is an optional dependency; these names are only needed by the
# flash-attention code path, so the import is guarded.
if is_flash_attn_2_available():
    from flash_attn import flash_attn_func
    from flash_attn import flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis  # noqa
    from flash_attn.bert_padding import pad_input
    from flash_attn.bert_padding import unpad_input
151
+
152
+
153
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    """Compute unpadding metadata for flash-attention varlen kernels.

    Args:
        attention_mask: (batch, seq_len) tensor with 1 for real tokens and
            0 for padding.

    Returns:
        Tuple of ``(indices, cu_seqlens, max_seqlen_in_batch)``:
        - indices: flat positions of non-padding tokens in the flattened mask.
        - cu_seqlens: (batch + 1,) int32 cumulative sequence lengths, starting at 0.
        - max_seqlen_in_batch: int, longest unpadded sequence in the batch.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    # fix: `torch.torch.int32` -> `torch.int32` (the doubled attribute access
    # happened to work only because torch re-exports itself).
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )
164
+
165
+
166
def _trunc_normal_(tensor, mean, std, a, b):
    """Fill ``tensor`` in-place with truncated-normal samples (inverse-CDF method).

    Samples are drawn uniformly from the CDF image of ``[a, b]`` and mapped
    back through the inverse normal CDF, then scaled/shifted to ``mean``/``std``.

    Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    """

    def norm_cdf(x):
        # Standard normal cumulative distribution function.
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal.
    if tensor.dtype in [torch.float16, torch.bfloat16]:
        # `erfinv_` is not defined for float16 on CPU / bfloat16 on some GPUs:
        # compute in float32 and copy the result back in-place.
        # fix: the previous `tensor = tensor.to(...)` rebound the local name, so
        # erfinv/mul/add/clamp were applied to throwaway copies and the caller's
        # tensor kept only the uniform fill.
        tensor.copy_(tensor.to(torch.float32).erfinv_())
    else:
        tensor.erfinv_()

    # Transform to proper mean, std.
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure the values are in the proper range.
    if tensor.dtype == torch.float16:
        # The `clamp_` op is not defined for float16 on CPU; clamp in float32
        # and copy back in-place.
        tensor.copy_(tensor.to(torch.float32).clamp_(min=a, max=b))
    else:
        tensor.clamp_(min=a, max=b)
213
+
214
+
215
def trunc_normal_tf_(
    tensor: torch.Tensor,
    mean: float = 0.0,
    std: float = 1.0,
    a: float = -2.0,
    b: float = 2.0,
) -> None:
    r"""Fill the input Tensor in-place with values drawn from a truncated
    normal distribution.

    The values are effectively drawn from
    :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside
    :math:`[a, b]` redrawn until they are within the bounds. The method used
    for generating the random values works best when
    :math:`a \leq \text{mean} \leq b`.

    NOTE: this 'tf' variant behaves closer to the Tensorflow / JAX
    implementations, where the bounds ``[a, b]`` are applied when sampling the
    standard normal (mean=0, std=1.0) and the result is subsequently scaled
    and shifted by the ``mean`` and ``std`` args.

    Args:
        tensor: an n-dimensional `torch.Tensor`, modified in place (returns None)
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        # Sample standard truncated normal, then scale/shift in place.
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)
241
+
242
+
243
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    """Initialize ``tensor`` in-place with variance-scaling init (TF/JAX style).

    The target variance is ``scale / denom`` where ``denom`` is chosen by
    ``mode`` ("fan_in", "fan_out" or "fan_avg").

    Args:
        tensor: an n-dimensional `torch.Tensor`, modified in place.
        scale: scaling factor for the variance.
        mode: one of "fan_in", "fan_out", "fan_avg".
        distribution: one of "truncated_normal", "normal", "uniform".

    Raises:
        ValueError: if ``mode`` or ``distribution`` is not recognized.
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2
    else:
        # fix: an unknown mode previously fell through and crashed later with
        # an opaque NameError on `denom`; fail fast with a clear message.
        raise ValueError(f"invalid mode {mode}")

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")
266
+
267
+
268
def lecun_normal_(tensor):
    """Initialize ``tensor`` in-place with LeCun init: fan-in scaled truncated normal."""
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
270
+
271
+
272
def default_flax_embed_init(tensor):
    """Initialize ``tensor`` in-place like Flax's default embedding init: fan-in scaled normal."""
    variance_scaling_(tensor, mode="fan_in", distribution="normal")
274
+
275
+
276
@dataclass
# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
class SiglipVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    # annotation fix: default is None, so the field must be Optional
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
301
+
302
+
303
class SiglipVisionEmbeddings(nn.Module):
    """Patch + 2D-aware position embeddings supporting variable-resolution
    (NaViT-style) padded inputs."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Non-overlapping patchification: kernel == stride == patch_size.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        # The position table is a square grid of
        # num_patches_per_side x num_patches_per_side learned entries.
        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        patch_attention_mask: torch.BoolTensor,
        tgt_sizes: Optional[torch.IntTensor] = None,
    ) -> torch.Tensor:
        """Embed a padded batch of images.

        Args:
            pixel_values: (batch, channels, H, W) padded pixel batch.
            patch_attention_mask: per-sample boolean mask over the padded
                patch grid; True marks real (non-padding) patches.
            tgt_sizes: optional per-sample (nb_patches_h, nb_patches_w);
                when absent, the patch counts are recovered from the mask's
                first row/column sums.

        Returns:
            (batch, num_padded_patches, embed_dim) patch embeddings with
            position embeddings added.
        """
        batch_size = pixel_values.size(0)

        patch_embeds = self.patch_embedding(pixel_values)
        # (B, C, h, w) -> (B, h*w, C)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
        max_nb_patches_h, max_nb_patches_w = (
            max_im_h // self.patch_size,
            max_im_w // self.patch_size,
        )
        # Bucket edges partitioning [0, 1) into num_patches_per_side bins;
        # each image's fractional patch coordinates are snapped to this grid
        # so any resolution maps onto the square position table.
        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
        # Built on CPU (no device given); padding positions keep id 0.
        position_ids = torch.full(
            size=(
                batch_size,
                max_nb_patches_h * max_nb_patches_w,
            ),
            fill_value=0,
        )

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            if tgt_sizes is not None:
                nb_patches_h = tgt_sizes[batch_idx][0]
                nb_patches_w = tgt_sizes[batch_idx][1]
            else:
                # Recover the real patch grid from the mask's first
                # column / first row.
                nb_patches_h = p_attn_mask[:, 0].sum()
                nb_patches_w = p_attn_mask[0].sum()

            # Fractional coordinates in [0, 1) for each real patch row/col.
            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)

            # Flattened row-major ids into the square position table.
            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
            # Mask moved to CPU to index the CPU-resident position_ids.
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)

        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings
370
+
371
+
372
class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # Standard 1/sqrt(head_dim) attention scaling.
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Input shape: Batch x Time x Channel

        Eager (matmul + softmax) self-attention.

        Args:
            hidden_states: (batch, seq_len, embed_dim) input features.
            attention_mask: optional additive mask of shape
                (batch, 1, seq_len, seq_len), added to the attention logits.
            output_attentions: accepted for interface compatibility; the
                attention weights are always computed and returned here.

        Returns:
            Tuple of (attn_output, attn_weights).
        """

        batch_size, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (B, T, E) -> (B, num_heads, T, head_dim)
        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # (B, num_heads, T, head_dim) -> (B, T, E)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
446
+
447
+
448
+ class SiglipFlashAttention2(SiglipAttention):
449
+ """
450
+ Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
451
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
452
+ flash attention and deal with padding tokens in case the input contains any of them.
453
+ """
454
+
455
+ def __init__(self, *args, **kwargs):
456
+ super().__init__(*args, **kwargs)
457
+ self.is_causal = False # Hack to make sure we don't use a causal mask
458
+
459
+ def forward(
460
+ self,
461
+ hidden_states: torch.Tensor,
462
+ attention_mask: Optional[torch.LongTensor] = None,
463
+ position_ids: Optional[torch.LongTensor] = None,
464
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
465
+ output_attentions: bool = False,
466
+ use_cache: bool = False,
467
+ **kwargs,
468
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
469
+ output_attentions = False
470
+
471
+ bsz, q_len, _ = hidden_states.size()
472
+
473
+ query_states = self.q_proj(hidden_states)
474
+ key_states = self.k_proj(hidden_states)
475
+ value_states = self.v_proj(hidden_states)
476
+
477
+ # Flash attention requires the input to have the shape
478
+ # batch_size x seq_length x head_dim x hidden_dim
479
+ # therefore we just need to keep the original shape
480
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
481
+ key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
482
+ value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
483
+
484
+ kv_seq_len = key_states.shape[-2]
485
+ if past_key_value is not None:
486
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
487
+ # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
488
+ # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
489
+
490
+ # if past_key_value is not None:
491
+ # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
492
+ # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
493
+
494
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
495
+ # to be able to avoid many of these transpose/reshape/view.
496
+ query_states = query_states.transpose(1, 2)
497
+ key_states = key_states.transpose(1, 2)
498
+ value_states = value_states.transpose(1, 2)
499
+
500
+ dropout_rate = self.dropout if self.training else 0.0
501
+
502
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
503
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
504
+ # cast them back in the correct dtype just to be sure everything works as expected.
505
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
506
+ # in fp32. (LlamaRMSNorm handles it correctly)
507
+
508
+ input_dtype = query_states.dtype
509
+ if input_dtype == torch.float32:
510
+ if torch.is_autocast_enabled():
511
+ target_dtype = torch.get_autocast_gpu_dtype()
512
+ # Handle the case where the model is quantized
513
+ elif hasattr(self.config, "_pre_quantization_dtype"):
514
+ target_dtype = self.config._pre_quantization_dtype
515
+ else:
516
+ target_dtype = self.q_proj.weight.dtype
517
+
518
+ logger.warning_once(
519
+ "The input hidden states seems to be silently casted in float32, this might be related to the fact"
520
+ " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
521
+ f" {target_dtype}."
522
+ )
523
+
524
+ query_states = query_states.to(target_dtype)
525
+ key_states = key_states.to(target_dtype)
526
+ value_states = value_states.to(target_dtype)
527
+
528
+ attn_output = self._flash_attention_forward(
529
+ query_states,
530
+ key_states,
531
+ value_states,
532
+ attention_mask,
533
+ q_len,
534
+ dropout=dropout_rate,
535
+ )
536
+
537
+ attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
538
+ attn_output = self.out_proj(attn_output)
539
+
540
+ if not output_attentions:
541
+ attn_weights = None
542
+
543
+ return attn_output, attn_weights
544
+
545
    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """

        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
        causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            # Padded path: strip pad positions, run the varlen kernel on the
            # packed tokens, then scatter results back to the padded layout.
            batch_size = query_states.shape[0]
            (
                query_states,
                key_states,
                value_states,
                indices_q,
                cu_seq_lens,
                max_seq_lens,
            ) = self._upad_input(query_states, key_states, value_states, attention_mask, query_length)

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            # No padding anywhere: the dense kernel is cheaper (skips _upad_input).
            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states,
                dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

        return attn_output
617
+
618
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """Strip padded positions so the flash-attn varlen kernel only sees real tokens.

        Inputs are laid out [batch, seq, heads, head_dim]; returns flattened
        (unpadded) q/k/v together with the index / cumulative-seqlen bookkeeping
        that `flash_attn_varlen_func` and `pad_input` need to restore the batch.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # Gather only the non-padded key/value rows.
        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        if query_length == kv_seq_len:
            # Queries share the key layout, so the key indices can be reused.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # Single-token step: exactly one query per batch element.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
658
+
659
+
660
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    """Position-wise feed-forward block: Linear -> activation -> Linear."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Activation is chosen by name from the config (e.g. "gelu_pytorch_tanh").
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Expand to the intermediate size, apply the activation, and project back."""
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
674
+
675
+
676
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    """A single pre-norm transformer block: self-attention then MLP, each wrapped in a residual."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        # Pick the attention implementation requested by the config.
        if self._use_flash_attention_2:
            self.self_attn = SiglipFlashAttention2(config)
        else:
            self.self_attn = SiglipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        # Attention sub-block (pre-norm + residual).
        normed = self.layer_norm1(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block (pre-norm + residual).
        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
724
+
725
+
726
class SiglipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        # NOTE: branch order matters — the specific module classes must be
        # matched before the generic nn.Linear / nn.Embedding fallbacks below.
        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            # Standard-normal weights, zero biases for all attention projections.
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            # Biases are near-zero but not exactly zero (flax parity).
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
765
+
766
+
767
+ SIGLIP_START_DOCSTRING = r"""
768
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
769
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
770
+ etc.)
771
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
772
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
773
+ and behavior.
774
+ Parameters:
775
+ config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
776
+ Initializing with a config file does not load the weights associated with the model, only the
777
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
778
+ """
779
+
780
+
781
+ SIGLIP_VISION_INPUTS_DOCSTRING = r"""
782
+ Args:
783
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
784
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
785
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
786
+ output_attentions (`bool`, *optional*):
787
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
788
+ tensors for more detail.
789
+ output_hidden_states (`bool`, *optional*):
790
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
791
+ more detail.
792
+ return_dict (`bool`, *optional*):
793
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
794
+ """
795
+
796
+
797
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].
    Args:
        config: SiglipConfig
    """

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # Toggled by PreTrainedModel.gradient_checkpointing_enable().
        self.gradient_checkpointing = False

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # Fall back to config defaults for any unspecified flag.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            # Hidden states are recorded *before* each layer; the final state
            # is appended after the loop, so len(encoder_states) == num_layers + 1.
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                # Trade compute for memory: recompute activations on backward.
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )
883
+
884
+
885
@add_start_docstrings(
    """The vision model from SigLIP without any head or projection on top.""",
    SIGLIP_START_DOCSTRING,
)
class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True
    _no_split_modules = []

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the patch-embedding convolution used to embed pixel values."""
        return self.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
        tgt_sizes: Optional[torch.IntTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            # No mask supplied: assume every patch position is valid.
            patch_attention_mask = torch.ones(
                size=(
                    batch_size,
                    pixel_values.size(2) // self.config.patch_size,
                    pixel_values.size(3) // self.config.patch_size,
                ),
                dtype=torch.bool,
                device=pixel_values.device,
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values,
            patch_attention_mask=patch_attention_mask,
            tgt_sizes=tgt_sizes,
        )

        # Flatten the 2-D patch mask to one position per sequence element.
        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # The call to `_upad_input` in `_flash_attention_forward` is expensive
        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
        if not torch.any(~patch_attention_mask):
            attention_mask = None
        else:
            # Eager/SDPA attention wants an additive 4-D mask; flash-attn wants
            # the raw boolean padding mask.
            attention_mask = (
                _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
                if not self._use_flash_attention_2
                else patch_attention_mask
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        if not return_dict:
            # Pooler slot is None: this backbone has no pooling head.
            return (last_hidden_state, None) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=None,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
preprocessor_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor_type": "MiniCPMVImageProcessor",
3
+ "feature_extractor_type": "MiniCPMAAudioProcessor",
4
+ "auto_map": {
5
+ "AutoProcessor": "processing_minicpmo.MiniCPMOProcessor",
6
+ "AutoImageProcessor": "processing_minicpmo.MiniCPMVImageProcessor",
7
+ "AutoFeatureExtractor": "processing_minicpmo.MiniCPMAAudioProcessor"
8
+ },
9
+ "processor_class": "MiniCPMOProcessor",
10
+ "max_slice_nums": 9,
11
+ "scale_resolution": 448,
12
+ "patch_size": 14,
13
+ "use_image_id": true,
14
+ "image_feature_size": 64,
15
+ "im_start": "<image>",
16
+ "im_end": "</image>",
17
+ "slice_start": "<slice>",
18
+ "slice_end": "</slice>",
19
+ "unk": "<unk>",
20
+ "im_id_start": "<image_id>",
21
+ "im_id_end": "</image_id>",
22
+ "slice_mode": true,
23
+ "audio_pool_step": 5,
24
+ "norm_mean": [
25
+ 0.5,
26
+ 0.5,
27
+ 0.5
28
+ ],
29
+ "norm_std": [
30
+ 0.5,
31
+ 0.5,
32
+ 0.5
33
+ ],
34
+ "version": 4.5
35
+ }
processing_minicpmo.py ADDED
@@ -0,0 +1,1665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright 2026 The OpenBMB Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import copy
18
+ import math
19
+ import re
20
+ from typing import Any
21
+ from typing import Dict
22
+ from typing import List
23
+ from typing import Optional
24
+ from typing import Tuple
25
+ from typing import Union
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from transformers import AutoImageProcessor
31
+ from transformers.audio_utils import spectrogram
32
+ from transformers.audio_utils import window_function
33
+ from transformers.image_processing_utils import BaseImageProcessor
34
+ from transformers.image_processing_utils import BatchFeature
35
+ from transformers.image_transforms import to_channel_dimension_format
36
+ from transformers.image_utils import ChannelDimension
37
+ from transformers.image_utils import ImageInput
38
+ from transformers.image_utils import infer_channel_dimension_format
39
+ from transformers.image_utils import is_torch_tensor
40
+ from transformers.image_utils import to_numpy_array
41
+ from transformers.image_utils import valid_images
42
+ from transformers.models.whisper.feature_extraction_whisper import WhisperFeatureExtractor
43
+ from transformers.processing_utils import ProcessorMixin
44
+ from transformers.tokenization_utils_base import PreTokenizedInput
45
+ from transformers.tokenization_utils_base import TextInput
46
+ from transformers.utils import is_torch_device
47
+ from transformers.utils import is_torch_dtype
48
+ from transformers.utils import requires_backends
49
+ from transformers.utils import TensorType
50
+
51
+
52
def recursive_converter(converter, value):
    """Apply *converter* to every non-list element of *value*, preserving list nesting."""
    if not isinstance(value, list):
        return converter(value)
    return [recursive_converter(converter, item) for item in value]
60
+
61
+
62
class MiniCPMOBatchFeature(BatchFeature):
    """Extend from BatchFeature for supporting various image size"""

    def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
        super().__init__(data)
        self.convert_to_tensors(tensor_type=tensor_type)

    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
        """Convert every (possibly nested-list) value to tensors of `tensor_type` in place."""
        if tensor_type is None:
            return self

        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)

        def converter(value):
            try:
                if not is_tensor(value):
                    tensor = as_tensor(value)
                    return tensor
            # NOTE(review): `converter` closes over `key`, which is only bound by
            # the loop below — it must not be called before that loop runs.
            except: # noqa E722
                if key == "overflowing_values":
                    raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
                raise ValueError(
                    "Unable to create tensor, you should probably activate padding "
                    "with 'padding=True' to have batched tensors with the same length."
                )

        for key, value in self.items():
            # recursive_converter walks nested lists so ragged image batches work.
            self[key] = recursive_converter(converter, value)
        return self

    def to(self, *args, **kwargs) -> "MiniCPMOBatchFeature":
        """Move/cast tensor values like `torch.Tensor.to`; non-tensors pass through."""
        requires_backends(self, ["torch"])
        import torch

        def cast_tensor(v):
            if not torch.is_tensor(v):
                return v

            # Only floating-point tensors are dtype-cast (see comment below);
            # integer tensors are at most moved to `device`.
            # NOTE(review): `cast_tensor` closes over `device`, which is bound
            # further down in this method before the conversion loop runs.
            if torch.is_floating_point(v):
                return v.to(*args, **kwargs)
            elif device is not None:
                return v.to(device=device)
            else:
                return v

        new_data = {}
        device = kwargs.get("device")
        if device is None and len(args) > 0:
            # Mirror torch.Tensor.to: the first positional arg may be a dtype
            # (ignored here for non-float tensors) or a device spec.
            arg = args[0]
            if is_torch_dtype(arg):
                pass
            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
                device = arg
            else:
                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")

        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
        for k, v in self.items():
            new_data[k] = recursive_converter(cast_tensor, v)
        self.data = new_data
        return self
123
+
124
+
125
+ class MiniCPMVImageProcessor(BaseImageProcessor):
126
+ model_input_names = ["pixel_values"]
127
+
128
    def __init__(self, max_slice_nums=9, scale_resolution=448, patch_size=14, **kwargs):
        """Configure slicing geometry and the placeholder tokens emitted for images.

        Args:
            max_slice_nums: upper bound on the number of slices per image.
            scale_resolution: target resolution budget used when resizing.
            patch_size: ViT patch size; all resized dims snap to multiples of it.
            **kwargs: placeholder-token overrides, normalization stats, etc.
                (popped below so the remainder can flow to BaseImageProcessor).
        """
        super().__init__(**kwargs)
        self.max_slice_nums = max_slice_nums
        self.scale_resolution = scale_resolution
        self.patch_size = patch_size
        self.use_image_id = kwargs.pop("use_image_id", False)
        # Number of <unk> placeholder tokens emitted per image/slice.
        self.image_feature_size = kwargs.pop("image_feature_size", 64)
        self.im_start_token = kwargs.pop("im_start", "<image>")
        self.im_end_token = kwargs.pop("im_end", "</image>")
        self.slice_start_token = kwargs.pop("slice_start", "<slice>")
        self.slice_end_token = kwargs.pop("slice_end", "</slice>")
        self.unk_token = kwargs.pop("unk", "<unk>")
        self.im_id_start = kwargs.pop("im_id_start", "<image_id>")
        self.im_id_end = kwargs.pop("im_id_end", "</image_id>")
        self.slice_mode = kwargs.pop("slice_mode", True)

        # Per-channel normalization stats (default: map [0,1] to [-1,1]).
        self.mean = np.array(kwargs.pop("norm_mean", [0.5, 0.5, 0.5]))
        self.std = np.array(kwargs.pop("norm_std", [0.5, 0.5, 0.5]))
        self.version = kwargs.pop("version", 2.0)
147
+
148
+ @staticmethod
149
+ def ensure_divide(length, patch_size):
150
+ return max(round(length / patch_size) * patch_size, patch_size)
151
+
152
+ def find_best_resize(self, original_size, scale_resolution, patch_size, allow_upscale=False):
153
+ width, height = original_size
154
+ if (width * height > scale_resolution * scale_resolution) or allow_upscale:
155
+ r = width / height
156
+ height = int(scale_resolution / math.sqrt(r))
157
+ width = int(height * r)
158
+ best_width = self.ensure_divide(width, patch_size)
159
+ best_height = self.ensure_divide(height, patch_size)
160
+ return best_width, best_height
161
+
162
    def get_refine_size(self, original_size, grid, scale_resolution, patch_size, allow_upscale=False):
        """Compute the refined full-image size whose grid cells each satisfy
        `find_best_resize`, so the image can be split evenly into `grid` patches.

        Args:
            original_size: (width, height) of the source image.
            grid: (grid_x, grid_y) slice counts along each axis.
        Returns:
            (width, height) divisible by the grid dimensions.
        """
        width, height = original_size
        grid_x, grid_y = grid

        # Make the full size divisible by the grid so cells are equal-sized.
        refine_width = self.ensure_divide(width, grid_x)
        refine_height = self.ensure_divide(height, grid_y)

        grid_width = refine_width / grid_x
        grid_height = refine_height / grid_y

        # Size one cell, then scale back up by the grid counts.
        best_grid_size = self.find_best_resize(
            (grid_width, grid_height), scale_resolution, patch_size, allow_upscale=allow_upscale
        )
        refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)
        return refine_size
177
+
178
+ @staticmethod
179
+ def split_to_patches(image, grid):
180
+ patches = []
181
+ width, height = image.size
182
+ grid_x = int(width / grid[0])
183
+ grid_y = int(height / grid[1])
184
+ for i in range(0, height, grid_y):
185
+ images = []
186
+ for j in range(0, width, grid_x):
187
+ box = (j, i, j + grid_x, i + grid_y)
188
+ patch = image.crop(box)
189
+ images.append(patch)
190
+ patches.append(images)
191
+ return patches
192
+
193
    def slice_image(self, image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
        """Resize *image* and, when large enough, split it into grid patches.

        Returns:
            (source_image, patches, best_grid): the resized overview image, a
            row-major 2-D list of patch images (empty when no slicing), and the
            chosen (cols, rows) grid or None when the image was not sliced.
        """
        original_size = image.size
        source_image = None
        best_grid = self.get_sliced_grid(original_size, max_slice_nums, never_split)
        patches = []

        if best_grid is None:
            # dont need to slice, upsample
            best_size = self.find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=True)
            source_image = image.resize(best_size, resample=Image.Resampling.BICUBIC)
        else:
            # source image, down-sampling and ensure divided by patch_size
            best_resize = self.find_best_resize(original_size, scale_resolution, patch_size)
            source_image = image.copy().resize(best_resize, resample=Image.Resampling.BICUBIC)
            # A second, higher-resolution copy is cut into the grid patches.
            refine_size = self.get_refine_size(
                original_size, best_grid, scale_resolution, patch_size, allow_upscale=True
            )
            refine_image = image.resize(refine_size, resample=Image.Resampling.BICUBIC)
            patches = self.split_to_patches(refine_image, best_grid)

        return source_image, patches, best_grid
214
+
215
+ def get_grid_placeholder(self, grid):
216
+ if grid is None:
217
+ return ""
218
+ slice_image_placeholder = (
219
+ self.slice_start_token + self.unk_token * self.image_feature_size + self.slice_end_token
220
+ )
221
+
222
+ cols = grid[0]
223
+ rows = grid[1]
224
+ slices = []
225
+ for i in range(rows):
226
+ lines = []
227
+ for j in range(cols):
228
+ lines.append(slice_image_placeholder)
229
+ slices.append("".join(lines))
230
+
231
+ slice_placeholder = "\n".join(slices)
232
+ return slice_placeholder
233
+
234
+ def get_image_id_placeholder(self, idx=0):
235
+ return f"{self.im_id_start}{idx}{self.im_id_end}"
236
+
237
+ def get_sliced_images(self, image, max_slice_nums=None):
238
+ slice_images = []
239
+
240
+ if not self.slice_mode:
241
+ return [image]
242
+
243
+ max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
244
+ assert max_slice_nums > 0
245
+ source_image, patches, sliced_grid = self.slice_image(
246
+ image, max_slice_nums, self.scale_resolution, self.patch_size # default: 9 # default: 448 # default: 14
247
+ )
248
+
249
+ slice_images.append(source_image)
250
+ if len(patches) > 0:
251
+ for i in range(len(patches)):
252
+ for j in range(len(patches[0])):
253
+ slice_images.append(patches[i][j])
254
+ return slice_images
255
+
256
    def get_sliced_grid(self, image_size, max_slice_nums, nerver_split=False):
        """Choose the (cols, rows) slicing grid whose aspect ratio best matches the image.

        Returns None when the image fits the resolution budget (<= 1 slice) or
        splitting is disabled.

        NOTE(review): the parameter name `nerver_split` is a typo for
        `never_split`; it is kept because renaming would break keyword callers.
        """
        original_width, original_height = image_size
        log_ratio = math.log(original_width / original_height)
        # How many scale_resolution^2 tiles the image roughly covers.
        ratio = original_width * original_height / (self.scale_resolution * self.scale_resolution)
        multiple = min(math.ceil(ratio), max_slice_nums)
        if multiple <= 1 or nerver_split:
            return None
        # Consider slice counts adjacent to the estimate (within bounds).
        candidate_split_grids_nums = []
        for i in [multiple - 1, multiple, multiple + 1]:
            if i == 1 or i > max_slice_nums:
                continue
            candidate_split_grids_nums.append(i)

        # Enumerate every (m, n) factorization of each candidate count.
        candidate_grids = []
        for split_grids_nums in candidate_split_grids_nums:
            m = 1
            while m <= split_grids_nums:
                if split_grids_nums % m == 0:
                    candidate_grids.append([m, split_grids_nums // m])
                m += 1

        # Pick the grid whose aspect ratio is closest to the image's (in log space).
        best_grid = [1, 1]
        min_error = float("inf")
        for grid in candidate_grids:
            error = abs(log_ratio - math.log(grid[0] / grid[1]))
            if error < min_error:
                best_grid = grid
                min_error = error

        return best_grid
286
+
287
def get_slice_image_placeholder(self, image_size, image_idx=0, max_slice_nums=None, use_image_id=None):
    """Build the full text placeholder for one image.

    Layout: [optional image-id marker] + main image placeholder
    + [grid placeholder when slice_mode is on].

    Args:
        image_size: (width, height) of the image, used to pick the slice grid.
        image_idx: index used in the image-id marker.
        max_slice_nums: optional override of self.max_slice_nums (must be > 0).
        use_image_id: optional override of self.use_image_id.
    """
    limit = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
    assert limit > 0
    grid = self.get_sliced_grid(image_size=image_size, max_slice_nums=limit)

    body = self.im_start_token + self.unk_token * self.image_feature_size + self.im_end_token
    want_id = self.use_image_id if use_image_id is None else bool(use_image_id)
    placeholder = self.get_image_id_placeholder(image_idx) + body if want_id else body

    if self.slice_mode:
        placeholder = placeholder + self.get_grid_placeholder(grid=grid)
    return placeholder
302
+
303
@staticmethod
def to_pil_image(image, rescale=None) -> Image.Image:
    """Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back
    as the last axis if needed.

    Args:
        image (`Image.Image` or `numpy.ndarray` or `torch.Tensor`):
            The image to convert to the PIL Image format.
        rescale (`bool`, *optional*):
            whether to apply the scaling factor (to make pixel values integers between 0 and 255). Will
            default to `True` if the image type is a floating type, `False` otherwise.
    """
    if isinstance(image, Image.Image):
        return image
    if is_torch_tensor(image):
        image = image.numpy()

    if isinstance(image, np.ndarray):
        if rescale is None:
            # Rescale by default when the values are floats (assumed to be in [0, 1]).
            # Check the dtype instead of inspecting image.flat[0], which raises
            # IndexError on empty arrays and is equivalent for non-empty ones.
            rescale = np.issubdtype(image.dtype, np.floating)
        # If the channel has been moved to the first dim, put it back at the end.
        if image.ndim == 3 and image.shape[0] in [1, 3]:
            image = image.transpose(1, 2, 0)
        if rescale:
            image = image * 255
        image = image.astype(np.uint8)
        return Image.fromarray(image)
    # Unknown type: returned unchanged (caller is expected to pass a supported type).
    return image
332
+
333
def reshape_by_patch(self, image):
    """Rearrange a [C, H, W] numpy image into a patch-major layout.

    The image is cut into non-overlapping patch_size x patch_size patches; the
    result places the patches side by side, yielding shape
    [C, patch_size, (H*W)//patch_size].

    Args:
        image: numpy array of shape [C, H, W]; H and W must be multiples of patch_size.

    Returns:
        numpy array of shape [C, patch_size, H*W//patch_size].
    """
    ps = self.patch_size
    tensor = torch.from_numpy(image)
    channels = tensor.size(0)

    # each column of `cols` is one flattened patch
    cols = torch.nn.functional.unfold(tensor, (ps, ps), stride=(ps, ps))
    cols = cols.reshape(channels, ps, ps, -1)
    # move the patch index before the patch column axis, then flatten widthwise
    cols = cols.permute(0, 1, 3, 2).reshape(channels, ps, -1)
    return cols.numpy()
341
+
342
def preprocess(
    self,
    images: Union[Image.Image, List[Image.Image], List[List[Image.Image]]],
    do_pad: Optional[bool] = True,
    max_slice_nums: int = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> MiniCPMOBatchFeature:
    """Convert images into model-ready pixel values with slicing.

    Each image is optionally sliced (see get_sliced_images), normalized with
    self.mean/self.std, converted to channel-first, and rearranged into the
    patch layout expected by the vision encoder (reshape_by_patch).

    Args:
        images: one image, a flat list, or a batch (list of lists) of images.
        do_pad: accepted for API compatibility; not used in this implementation.
        max_slice_nums: optional override of the maximum slice count.
        return_tensors: tensor type forwarded to MiniCPMOBatchFeature.

    Returns:
        MiniCPMOBatchFeature with per-batch lists:
        - "pixel_values": patch-reshaped arrays (one entry per slice)
        - "image_sizes": original (width, height) of each input image
        - "tgt_sizes": per-slice (rows, cols) grid sizes in units of patch_size
    """
    # normalize nesting: single image -> [[img]], flat list -> [list of images]
    if isinstance(images, Image.Image):
        images_list = [[images]]
    elif isinstance(images[0], Image.Image):
        images_list = [images]
    else:
        images_list = images

    new_images_list = []
    image_sizes_list = []
    tgt_sizes_list = []

    for _images in images_list:
        # keep empty placeholders so batch indices stay aligned
        if _images is None or len(_images) == 0:
            new_images_list.append([])
            image_sizes_list.append([])
            tgt_sizes_list.append([])
            continue
        if not valid_images(_images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        _images = [self.to_pil_image(image).convert("RGB") for image in _images]
        # infer channel layout from the first image; assumes a homogeneous batch
        input_data_format = infer_channel_dimension_format(np.array(_images[0]))

        new_images = []
        image_sizes = [image.size for image in _images]
        tgt_sizes = []
        for image in _images:
            image_patches = self.get_sliced_images(image, max_slice_nums)
            # uint8 [0, 255] -> float32 [0, 1]
            image_patches = [to_numpy_array(image).astype(np.float32) / 255 for image in image_patches]
            image_patches = [
                self.normalize(image=image, mean=self.mean, std=self.std, input_data_format=input_data_format)
                for image in image_patches
            ]
            image_patches = [
                to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
                for image in image_patches
            ]
            for slice_image in image_patches:
                new_images.append(self.reshape_by_patch(slice_image))
                # grid size of this slice in patches: (H // patch_size, W // patch_size)
                tgt_sizes.append(
                    np.array((slice_image.shape[1] // self.patch_size, slice_image.shape[2] // self.patch_size))
                )

        if tgt_sizes:
            tgt_sizes = np.vstack(tgt_sizes)

        new_images_list.append(new_images)
        image_sizes_list.append(image_sizes)
        tgt_sizes_list.append(tgt_sizes)
    return MiniCPMOBatchFeature(
        data={"pixel_values": new_images_list, "image_sizes": image_sizes_list, "tgt_sizes": tgt_sizes_list},
        tensor_type=return_tensors,
    )
406
+
407
+
408
+ AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
409
+
410
+
411
def chunk_audio(audio: np.ndarray, max_duration_seconds: int = 30, sample_rate: int = 16000) -> List[np.ndarray]:
    """Split a long waveform into chunks of at most `max_duration_seconds`.

    Args:
        audio: 1-D waveform samples.
        max_duration_seconds: maximum chunk duration in seconds.
        sample_rate: sampling rate of `audio` in Hz.

    Returns:
        List of chunks; the input itself (unchunked) when it already fits,
        otherwise consecutive slices of at most max_len samples (the final
        chunk may be shorter).
    """
    max_len = int(max_duration_seconds * sample_rate)
    if len(audio) <= max_len:
        return [audio]
    return [audio[start : start + max_len] for start in range(0, len(audio), max_len)]
433
+
434
+
435
def process_audio_batch(
    audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
    feature_extractor,
    sampling_rate: int = 16000,
    max_duration_seconds: int = 30,
    return_attention_mask: bool = True,
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
    """Extract mel features for a (possibly nested) batch of waveforms.

    Long waveforms are first split into chunks of at most max_duration_seconds
    (see chunk_audio); each chunk becomes one feature entry.

    Args:
        audios: one waveform, a flat list, or a batch (list of lists) of waveforms.
        feature_extractor: WhisperFeatureExtractor-compatible callable.
        sampling_rate: sampling rate of the waveforms in Hz.
        max_duration_seconds: maximum chunk duration in seconds.
        return_attention_mask: when True, trim each feature to its valid length
            as reported by the extractor's attention mask.

    Returns:
        (audio_features, audio_feature_lens)
        audio_features: [num_chunks, n_mels, max_frames], zero-padded.
        audio_feature_lens: one length tensor per input batch entry.
    """
    # normalize nesting: single waveform -> [[wav]], flat list -> [list]
    if isinstance(audios, np.ndarray):
        nested = [[audios]]
    elif len(audios) > 0 and isinstance(audios[0], np.ndarray):
        nested = [audios]
    else:
        nested = audios

    features = []
    lens_per_batch = []

    for batch in nested:
        lens = []

        for waveform in batch:
            for piece in chunk_audio(waveform, max_duration_seconds, sampling_rate):
                extracted = feature_extractor(
                    piece,
                    sampling_rate=sampling_rate,
                    return_tensors="pt",
                    padding="max_length",
                    return_attention_mask=return_attention_mask,
                )

                feat = extracted["input_features"]  # [1, 80, frames]

                if return_attention_mask:
                    # trim the padded frames away using the reported valid length
                    valid = extracted["attention_mask"].sum(dim=1)  # Tensor([frames])
                    feat = feat[:, :, : valid[0]]
                    lens.append(valid[0])
                else:
                    lens.append(torch.tensor(feat.shape[2]))

                features.append(feat.squeeze(0))  # [80, frames]

        lens_per_batch.append(torch.hstack(lens) if len(lens) > 0 else torch.tensor([]))

    # pad every chunk to the longest one: [num_chunks, 80, max_frames]
    if features:
        stacked = torch.nn.utils.rnn.pad_sequence(
            [feat.transpose(0, 1) for feat in features], batch_first=True, padding_value=0.0
        ).transpose(1, 2)
    else:
        stacked = torch.tensor([])

    return stacked, lens_per_batch
508
+
509
+
510
def regroup_audio_features(
    audio_features: torch.Tensor, audio_feature_lens: List[torch.Tensor], regroup_seconds: int, fps: int = 100
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
    """Concatenate valid audio frames and re-chunk them into fixed-duration segments.

    Args:
        audio_features: [batch, n_mels, frames], zero-padded.
        audio_feature_lens: per-entry valid lengths (tensors or lists of ints).
        regroup_seconds: target segment duration in seconds.
        fps: frames per second of the feature sequence.

    Returns:
        (regrouped_features, regrouped_lens)
        regrouped_features: [num_segments, n_mels, max_seg_frames], zero-padded.
        regrouped_lens: single-element list with the per-segment lengths (int32).
    """
    # flatten the per-entry valid lengths into one list of ints
    lengths: List[int] = []
    for entry in audio_feature_lens:
        if isinstance(entry, torch.Tensor):
            lengths.extend(entry.tolist())
        elif isinstance(entry, list):
            lengths.extend(int(v) for v in entry)

    if not lengths:
        return torch.tensor([]), []

    # strip padding, then join everything into one [n_mels, total_frames] sequence
    valid_parts = [audio_features[idx, :, :length] for idx, length in enumerate(lengths)]
    merged = valid_parts[0] if len(valid_parts) == 1 else torch.cat(valid_parts, dim=1)

    # cut into fixed-size segments (the last one may be shorter)
    seg_frames = int(regroup_seconds * fps)
    segments = [
        merged[:, offset : offset + seg_frames] for offset in range(0, merged.size(1), seg_frames)
    ]
    segments = [seg for seg in segments if seg.size(1) > 0]

    if not segments:
        return torch.tensor([]), []

    # pad segments to a common length and stack: [N, n_mels, max_T]
    seg_lengths = [seg.size(1) for seg in segments]
    padded = torch.nn.utils.rnn.pad_sequence(
        [seg.transpose(0, 1) for seg in segments], batch_first=True, padding_value=0.0
    ).transpose(1, 2)

    return padded, [torch.tensor(seg_lengths, dtype=torch.int32, device=padded.device)]
565
+
566
+
567
class MiniCPMAAudioProcessor(WhisperFeatureExtractor):
    """
    Whisper feature extractor with a configurable log-mel lower-limit strategy.

    On top of WhisperFeatureExtractor:
    - support dynamic_log_norm (Whisper's original max-8dB behaviour, adjustable via dynamic_range_db)
    - or a fixed log_floor_db (e.g. -10dB)
      - needed for the streaming scheme, where a dynamic (max-based) threshold is not available
    - the strategy can be switched mid-stream through set_spac_log_norm
    Two paths (torch / numpy) keep consistent clipping and scaling order:
        log10 -> (dynamic/fixed lower limit clipping) -> (+4)/4
    """

    def __init__(
        self,
        *args,
        dynamic_log_norm: bool = True,
        dynamic_range_db: float = 8.0,
        log_floor_db: float = -10.0,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        # dynamic_log_norm=True: clip at (spectrogram max - dynamic_range_db);
        # False: clip at the fixed log_floor_db instead.
        self.dynamic_log_norm = bool(dynamic_log_norm)
        self.dynamic_range_db = float(dynamic_range_db)
        self.log_floor_db = float(log_floor_db)

    def set_spac_log_norm(
        self,
        dynamic_range_db: Optional[float] = None,
        log_floor_db: Optional[float] = None,
        *,
        inplace: bool = True,
    ) -> "MiniCPMAAudioProcessor":
        """Hot-update the dynamic/fixed lower-limit strategy.

        Args:
            dynamic_range_db: dynamic range (dB, >= 0); passing a value switches the
                instance to dynamic mode (threshold = max - dynamic_range_db).
                None means keep unchanged.
            log_floor_db: fixed log floor (dB, must be <= 0); passing a value switches
                the instance to fixed-floor mode. None means keep unchanged.
                Note: if both arguments are given, the fixed-floor setting wins
                because it is applied last.
            inplace: True directly modifies the current instance; False returns a
                shallow copy with the modification applied.

        Returns:
            self or the new instance (when inplace=False).
        """

        target = self if inplace else copy.copy(self)

        if dynamic_range_db is not None:
            val = float(dynamic_range_db)
            if val < 0:
                raise ValueError("dynamic_range_db must be >= 0.")
            target.dynamic_log_norm = True  # explicitly switch to dynamic mode
            target.dynamic_range_db = val

        if log_floor_db is not None:
            val = float(log_floor_db)
            # usually log10(mel) maximum is not more than ~0dB, floor should be <= 0; loose validation only
            if val > 0:
                raise ValueError("log_floor_db should be <= 0 (log10 scale).")
            target.dynamic_log_norm = False  # explicitly switch to fixed lower-limit mode
            target.log_floor_db = val

        return target

    def _np_extract_fbank_features(self, waveform_batch: np.ndarray, device: str) -> np.ndarray:
        """NumPy version consistent with upstream, but replace max-8dB with configurable dynamic/fixed lower limit clipping."""
        if device != "cpu":
            raise ValueError(
                f"Got device `{device}` for feature extraction, but feature extraction on CUDA accelerator "
                "devices requires torch. Set device='cpu' or install torch."
            )

        log_spec_batch: List[np.ndarray] = []
        for waveform in waveform_batch:
            # generate log10 Mel spectrogram
            log_spec = spectrogram(
                waveform,
                window_function(self.n_fft, "hann"),
                frame_length=self.n_fft,
                hop_length=self.hop_length,
                power=2.0,
                dither=self.dither,
                mel_filters=self.mel_filters,
                log_mel="log10",
            )
            # consistent with upstream: remove the last frame
            log_spec = log_spec[:, :-1]

            # dynamic/fixed lower-limit clipping (the configurable part)
            if self.dynamic_log_norm:
                threshold = log_spec.max() - self.dynamic_range_db
                log_spec = np.maximum(log_spec, threshold)
            else:
                log_spec = np.maximum(log_spec, self.log_floor_db)

            # consistent with Whisper's linear scaling
            log_spec = (log_spec + 4.0) / 4.0

            log_spec_batch.append(log_spec)

        return np.array(log_spec_batch)

    def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
        """Torch version of the mel extraction with the same clipping/scaling as the numpy path."""
        # NOTE(review): `torch` appears to be imported at module level; this guard
        # presumably covers an optional-import pattern elsewhere in the file — confirm.
        if torch is None:
            raise RuntimeError("PyTorch is not installed, cannot compute STFT on GPU.")

        waveform = torch.from_numpy(waveform).to(device, torch.float32)
        window = torch.hann_window(self.n_fft, device=device)

        if self.dither != 0.0:
            # additive dithering, matching the feature extractor's `dither` setting
            waveform = waveform + self.dither * torch.randn_like(waveform)

        stft = torch.stft(waveform, n_fft=self.n_fft, hop_length=self.hop_length, window=window, return_complex=True)
        # drop the last frame (consistent with the numpy path), power spectrum
        magnitudes = stft[..., :-1].abs() ** 2

        mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)  # [n_mels, 1+n_fft//2]
        mel_spec = mel_filters.T @ magnitudes  # [..., n_mels, T]

        log_spec = torch.clamp(mel_spec, min=1e-10).log10()  # <= 0

        if self.dynamic_log_norm:
            if waveform.dim() == 2:
                # batched input: compute the max per sample (over time, then over mel bins)
                max_val_t = log_spec.max(dim=2, keepdim=True)[0]  # over T
                max_val_bt = max_val_t.max(dim=1, keepdim=True)[0]  # over mel
                threshold = max_val_bt - self.dynamic_range_db
                log_spec = torch.maximum(log_spec, threshold)
            else:
                threshold = log_spec.max() - self.dynamic_range_db
                log_spec = torch.maximum(log_spec, threshold)
        else:
            floor_tensor = torch.tensor(self.log_floor_db, dtype=log_spec.dtype, device=log_spec.device)
            log_spec = torch.maximum(log_spec, floor_tensor)

        log_spec = (log_spec + 4.0) / 4.0

        if device != "cpu":
            log_spec = log_spec.detach().cpu()
        return log_spec.numpy()

    def process(self, *args, **kwargs):
        """Alias of __call__ for convenience."""
        return self.__call__(*args, **kwargs)
708
+
709
+
710
class StreamingMelProcessorExact:
    """Strictly offline-equivalent streaming Mel processor.

    - accumulate all historical audio into buffer; use the same feature_extractor to calculate the entire mel after each addition.
    - only output "stable" frames: the frame center does not depend on future (right) context, i.e. center + n_fft//2 <= current buffer length.
    - output the last batch of frames at the end (flush), ensuring complete consistency with offline full-calculation.

    Cost: Each call performs feature extraction on the accumulated buffer (can be optimized to incremental if needed).
    """

    def __init__(
        self,
        feature_extractor: MiniCPMAAudioProcessor,
        chunk_ms: int = 100,
        first_chunk_ms: Optional[int] = None,
        sample_rate: int = 16000,
        n_fft: int = 400,
        hop_length: int = 160,
        n_mels: int = 80,
        cnn_redundancy_ms: int = 10,  # boundary redundancy (given in ms, usually 10ms = 1 frame)
        # sliding window parameters
        enable_sliding_window: bool = False,  # whether to enable sliding window
        slide_trigger_seconds: float = 30.0,  # trigger threshold for sliding window in seconds
        slide_stride_seconds: float = 10.0,  # stride for sliding window in seconds
    ):
        self.feature_extractor = feature_extractor
        self.chunk_ms = chunk_ms
        self.first_chunk_ms = first_chunk_ms if first_chunk_ms is not None else chunk_ms
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels

        self.chunk_samples = int(round(chunk_ms * sample_rate / 1000))
        self.chunk_frames = self.chunk_samples // hop_length
        # align the first chunk to hop_length to avoid frame boundary issues
        hop = self.hop_length
        raw_first_samples = int(round(self.first_chunk_ms * sample_rate / 1000))
        aligned_first = max(hop, (raw_first_samples // hop) * hop)
        self.first_chunk_samples = aligned_first
        self.half_window = n_fft // 2  # required right context for a frame to be "stable"

        # redundancy frames (in frames), usually <= 1 frame: 10ms -> 1 frame
        self.cnn_redundancy_ms = cnn_redundancy_ms
        self.cnn_redundancy_samples = int(cnn_redundancy_ms * sample_rate / 1000)
        self.cnn_redundancy_frames = max(0, self.cnn_redundancy_samples // hop_length)

        # sliding window configuration (trigger mode)
        self.enable_sliding_window = enable_sliding_window
        self.trigger_seconds = slide_trigger_seconds
        self.slide_seconds = slide_stride_seconds

        # shift/base (global frame coordinates)
        self.left_samples_dropped = 0  # samples dropped from the left
        self.base_T = 0  # index of the "global frame" corresponding to mel_full[:, :, 0]

        self.reset()

    def reset(self):
        """Reset all streaming state to start a new stream."""
        self.buffer = np.zeros(0, dtype=np.float32)
        self.last_emitted_T = 0
        self.total_samples_processed = 0
        self.chunk_count = 0
        self.is_first = True
        self.left_samples_dropped = 0
        self.base_T = 0

    def get_chunk_size(self) -> int:
        """Return the expected sample count of the next input chunk."""
        return self.first_chunk_samples if self.is_first else self.chunk_samples

    def get_expected_output_frames(self) -> int:
        # not supported in exact mode
        raise NotImplementedError("get_expected_output_frames is not implemented")

    def _extract_full(self) -> torch.Tensor:
        """Extract mel features for the whole current buffer. Returns [1, 80, T]."""
        # when buffer length is less than n_fft, Whisper's internal STFT would raise an error in center=True and pad mode
        # (pad is greater than input length). There is no stable frame to output in that case, so raise explicitly.
        if len(self.buffer) < self.n_fft:
            raise ValueError(f"buffer length is shorter than n_fft {len(self.buffer)} < {self.n_fft}")
        # if buffer length is less than 5s, use a fixed floor via set_spac_log_norm(log_floor_db=-10)
        if len(self.buffer) < 5 * self.sample_rate:
            # TODO: the thresholds here were chosen empirically; experiments could pick better values,
            # see MiniCPMAAudioProcessor's main implementation
            self.feature_extractor.set_spac_log_norm(log_floor_db=-10)
        # if buffer length is greater than 5s, use the dynamic threshold via set_spac_log_norm(dynamic_range_db=8)
        else:
            self.feature_extractor.set_spac_log_norm(dynamic_range_db=8)
        feats = self.feature_extractor(
            self.buffer,
            sampling_rate=self.sample_rate,
            return_tensors="pt",
            padding=False,
        )
        return feats.input_features  # [1, 80, T]

    def _stable_frames_count(self) -> int:
        """Number of frames whose right context is fully inside the buffer."""
        # number of stable frames = floor((len(buffer) - half_window) / hop) + 1, minimum is 0
        L = int(self.buffer.shape[0])
        if L <= 0:
            return 0
        if L < self.half_window:
            return 0
        return max(0, (L - self.half_window) // self.hop_length + 1)

    def _maybe_slide_buffer(self):
        """Trigger mode sliding window: when the buffer reaches the trigger threshold, slide a fixed length window."""
        if not self.enable_sliding_window:
            return

        sr = self.sample_rate
        hop = self.hop_length
        L = len(self.buffer)

        # convert seconds to samples
        trigger_samples = int(self.trigger_seconds * sr)
        stride_samples = int(self.slide_seconds * sr)

        # check if the trigger threshold is reached
        if L < trigger_samples:
            return

        # calculate the number of samples to drop (fixed sliding stride_samples)
        drop = stride_samples

        # cannot drop the left context that is still needed for subsequent emission
        # in trigger mode, we only need to protect the minimum necessary data
        # i.e. ensure that we do not discard frames that may be needed in the future
        # NOTE(review): last_emitted_local is computed but not used below — confirm intent
        last_emitted_local = self.last_emitted_T - self.base_T

        # only protect necessary context (e.g. the most recent 1 second data)
        min_keep_seconds = 1.0  # keep at least 1 second of data to ensure continuity
        min_keep_samples = int(min_keep_seconds * sr)

        # guard_samples are the minimum samples we must keep
        guard_samples = min(min_keep_samples, L - drop)

        # limit: do not exceed the safe boundary; and align to hop
        max_allowed_drop = max(0, L - guard_samples)
        drop = min(drop, max_allowed_drop)
        drop = (drop // hop) * hop

        if drop <= 0:
            return

        # truly drop & update base
        self.buffer = self.buffer[drop:]
        self.left_samples_dropped += drop
        self.base_T += drop // hop

    def process(self, audio_chunk: np.ndarray, is_last_chunk: bool = False) -> Tuple[torch.Tensor, Dict]:
        """Feed one audio chunk; return (mel_output, info).

        mel_output is [1, 80, frames] — possibly empty when not enough stable
        frames have accumulated yet. info is a diagnostics dict.
        """
        self.chunk_count += 1
        # append to buffer
        if len(self.buffer) == 0:
            self.buffer = audio_chunk.astype(np.float32, copy=True)
        else:
            self.buffer = np.concatenate([self.buffer, audio_chunk.astype(np.float32, copy=True)])

        # sliding window processing
        self._maybe_slide_buffer()

        # full extraction (for the current window)
        mel_full = self._extract_full()
        T_full = mel_full.shape[-1]  # local frames in the current window
        stable_T = min(T_full, self._stable_frames_count())  # local stable frames
        stable_T_global = self.base_T + stable_T  # map to global frame coordinates

        # plan the core frames for the current emission (global coordinates)
        core_start_g = self.last_emitted_T
        core_end_g = core_start_g + self.chunk_frames
        required_stable_g = core_end_g + self.cnn_redundancy_frames

        if stable_T_global >= required_stable_g or is_last_chunk:
            # emit the core frames plus cnn redundancy on both sides
            emit_start_g = max(0, core_start_g - self.cnn_redundancy_frames)
            emit_end_g = core_end_g + self.cnn_redundancy_frames

            # global -> local index
            emit_start = max(0, emit_start_g - self.base_T)
            emit_end = emit_end_g - self.base_T
            emit_start = max(0, min(emit_start, T_full))
            emit_end = max(emit_start, min(emit_end, T_full))

            mel_output = mel_full[:, :, emit_start:emit_end]
            self.last_emitted_T = core_end_g  # only advance the core frame pointer (global)
        else:
            # not enough stable frames yet: emit an empty slice
            mel_output = mel_full[:, :, 0:0]

        self.total_samples_processed += len(audio_chunk)
        self.is_first = False

        info = {
            "type": "exact_chunk",
            "chunk_number": self.chunk_count,
            "emitted_frames": mel_output.shape[-1],
            "stable_T": stable_T,
            "T_full": T_full,
            "base_T": self.base_T,
            "stable_T_global": stable_T_global,
            "buffer_len_samples": int(self.buffer.shape[0]),
            "left_samples_dropped": self.left_samples_dropped,
            "core_start": core_start_g,  # kept original field name; value is global
            "core_end": core_end_g,  # same as above
        }
        return mel_output, info

    def flush(self) -> torch.Tensor:
        """Called when the stream ends, output the remaining unemitted frames, ensuring consistency with offline (calculated by global coordinates)."""
        if len(self.buffer) == 0:
            return torch.zeros(1, 80, 0)

        mel_full = self._extract_full()
        T_local = mel_full.shape[-1]
        T_global = self.base_T + T_local

        if self.last_emitted_T < T_global:
            start_l = max(0, self.last_emitted_T - self.base_T)
            tail = mel_full[:, :, start_l:]
            self.last_emitted_T = T_global
            return tail
        # everything already emitted: empty slice
        return mel_full[:, :, 0:0]

    def get_config(self) -> Dict:
        """Return the static configuration of this processor."""
        return {
            "chunk_ms": self.chunk_ms,
            "first_chunk_ms": self.first_chunk_ms,
            "effective_first_chunk_ms": self.first_chunk_samples / self.sample_rate * 1000.0,
            "sample_rate": self.sample_rate,
            "n_fft": self.n_fft,
            "hop_length": self.hop_length,
            "cnn_redundancy_ms": self.cnn_redundancy_ms,
            "cnn_redundancy_frames": self.cnn_redundancy_frames,
            "enable_sliding_window": self.enable_sliding_window,
            "trigger_seconds": self.trigger_seconds,
            "slide_seconds": self.slide_seconds,
        }

    def get_state(self) -> Dict:
        """Return a lightweight view of the mutable streaming state (no buffer data)."""
        return {
            "chunk_count": self.chunk_count,
            "last_emitted_T": self.last_emitted_T,
            "total_samples_processed": self.total_samples_processed,
            "buffer_len": int(self.buffer.shape[0]),
            "base_T": self.base_T,
            "left_samples_dropped": self.left_samples_dropped,
        }

    def get_snapshot(self) -> Dict:
        """Get a complete state snapshot (including buffer), used for recovery from a fast start.

        Returns:
            A dictionary containing the complete state, which can be used to restore the snapshot
        """
        buffer_copy = self.buffer.copy()
        snapshot = {
            "chunk_count": self.chunk_count,
            "last_emitted_T": self.last_emitted_T,
            "total_samples_processed": self.total_samples_processed,
            "buffer": buffer_copy,
            "base_T": self.base_T,
            "left_samples_dropped": self.left_samples_dropped,
            "is_first": self.is_first,
            # save the state of the feature_extractor (key: ensure determinism of mel feature extraction)
            "fe_dynamic_log_norm": getattr(self.feature_extractor, "dynamic_log_norm", None),
            "fe_dynamic_range_db": getattr(self.feature_extractor, "dynamic_range_db", None),
            "fe_log_floor_db": getattr(self.feature_extractor, "log_floor_db", None),
        }

        return snapshot

    def restore_snapshot(self, snapshot: Dict) -> None:
        """Restore state from a snapshot

        Args:
            snapshot: the snapshot dictionary returned by get_snapshot
        """
        # record the state before restoration
        # NOTE(review): prev_state is collected but not used afterwards — presumably for debugging; confirm
        prev_state = {
            "chunk_count": self.chunk_count,
            "last_emitted_T": self.last_emitted_T,
            "buffer_len": len(self.buffer),
        }

        # restore state
        self.chunk_count = snapshot["chunk_count"]
        self.last_emitted_T = snapshot["last_emitted_T"]
        self.total_samples_processed = snapshot["total_samples_processed"]
        self.buffer = snapshot["buffer"].copy()  # copy buffer
        self.base_T = snapshot["base_T"]
        self.left_samples_dropped = snapshot["left_samples_dropped"]
        self.is_first = snapshot["is_first"]

        # restore the state of the feature_extractor (key: ensure determinism of mel feature extraction)
        if snapshot.get("fe_dynamic_log_norm") is not None:
            self.feature_extractor.dynamic_log_norm = snapshot["fe_dynamic_log_norm"]
        if snapshot.get("fe_dynamic_range_db") is not None:
            self.feature_extractor.dynamic_range_db = snapshot["fe_dynamic_range_db"]
        if snapshot.get("fe_log_floor_db") is not None:
            self.feature_extractor.log_floor_db = snapshot["fe_log_floor_db"]
1005
+
1006
+
1007
+ class MiniCPMOProcessor(ProcessorMixin):
1008
+ attributes = ["image_processor", "audio_processor", "tokenizer"]
1009
+ audio_processor_class = "AutoFeatureExtractor"
1010
+ image_processor_class = "AutoImageProcessor"
1011
+ tokenizer_class = "AutoTokenizer"
1012
+
1013
+ def __init__(self, image_processor=None, audio_processor=None, tokenizer=None, **kwargs):
1014
+ super().__init__(image_processor, audio_processor, tokenizer)
1015
+
1016
+ self.version = image_processor.version if image_processor else None
1017
+ # audio feature pooling step, needs to be consistent with config.audio_pool_step
1018
+ self.pool_step = kwargs.get("audio_pool_step", 5)
1019
+
1020
+ # initialize the streaming audio processor
1021
+ self._streaming_mel_processor = None
1022
+ if audio_processor is not None:
1023
+ self._init_streaming_processor()
1024
+
1025
+ def get_audio_placeholder(
1026
+ self,
1027
+ audio_lens: int,
1028
+ chunk_input: bool = True,
1029
+ chunk_length: int = 1,
1030
+ ) -> str:
1031
+ """
1032
+ Public method to get audio placeholder string for vLLM integration.
1033
+
1034
+ Args:
1035
+ audio_lens: Length of audio in samples
1036
+ chunk_input: Whether to use chunked processing
1037
+ chunk_length: Chunk length in seconds
1038
+
1039
+ Returns:
1040
+ Audio placeholder string
1041
+ """
1042
+ pool_step = self.pool_step
1043
+ feature_lens = math.ceil(audio_lens / self.audio_processor.hop_length)
1044
+
1045
+ feature_lens = (feature_lens - 1) // 2 + 1
1046
+ output_lens = (feature_lens - pool_step) // pool_step + 1
1047
+
1048
+ if chunk_input:
1049
+ fbank_feat_in_chunk = int(chunk_length * 100)
1050
+ cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
1051
+ audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
1052
+ num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
1053
+
1054
+ place_holders = ""
1055
+ total_unk_len = 0
1056
+ for _ in range(num_audio_chunks):
1057
+ unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
1058
+ place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
1059
+ total_unk_len += unk_len
1060
+ audio_placeholder = place_holders
1061
+ else:
1062
+ audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
1063
+
1064
+ return audio_placeholder
1065
+
1066
+ def _init_streaming_processor(
1067
+ self,
1068
+ chunk_ms: int = 100,
1069
+ cnn_redundancy_ms: int = 0,
1070
+ *,
1071
+ mode: str = "exact",
1072
+ first_chunk_ms: Optional[int] = None,
1073
+ enable_sliding_window: bool = False,
1074
+ slide_trigger_seconds: float = 30.0,
1075
+ slide_stride_seconds: float = 10.0,
1076
+ ):
1077
+ """Initialize the streaming processor
1078
+
1079
+ Args:
1080
+ chunk_ms: Chunk size in milliseconds, also the sliding step.
1081
+ cnn_redundancy_ms: CNN boundary redundancy in milliseconds (before and after), 0 means standard mode.
1082
+ mode: streaming processing mode, currently only supports "exact"
1083
+ first_chunk_ms: the size of the first chunk (milliseconds), if not specified, it is the same as chunk_ms
1084
+ enable_sliding_window: whether to enable sliding window (trigger mode)
1085
+ slide_trigger_seconds: trigger threshold for sliding window in seconds
1086
+ slide_stride_seconds: stride for sliding window in seconds
1087
+ """
1088
+ if mode == "exact":
1089
+ self._streaming_mel_processor = StreamingMelProcessorExact(
1090
+ feature_extractor=self.audio_processor,
1091
+ chunk_ms=chunk_ms,
1092
+ first_chunk_ms=first_chunk_ms,
1093
+ sample_rate=16000,
1094
+ cnn_redundancy_ms=cnn_redundancy_ms,
1095
+ enable_sliding_window=enable_sliding_window,
1096
+ slide_trigger_seconds=slide_trigger_seconds,
1097
+ slide_stride_seconds=slide_stride_seconds,
1098
+ )
1099
+ else:
1100
+ raise ValueError(f"Unsupported mode: {mode}, only 'exact' is supported")
1101
+ self._streaming_mode = mode if mode in ["exact"] else ("exact")
1102
+
1103
+ def set_streaming_mode(
1104
+ self,
1105
+ mode: str = "exact",
1106
+ chunk_ms: int = 100,
1107
+ cnn_redundancy_ms: int = 0,
1108
+ *,
1109
+ first_chunk_ms: Optional[int] = None,
1110
+ enable_sliding_window: bool = False,
1111
+ slide_trigger_seconds: float = 30.0,
1112
+ slide_stride_seconds: float = 10.0,
1113
+ ):
1114
+ """Set streaming processing mode
1115
+
1116
+ Args:
1117
+ mode: streaming processing mode, currently only supports "exact"
1118
+ chunk_ms: chunk size in milliseconds, also the sliding step.
1119
+ cnn_redundancy_ms: CNN boundary redundancy in milliseconds (before and after), 0 means standard mode.
1120
+ first_chunk_ms: the size of the first chunk (milliseconds), if not specified, it is the same as chunk_ms
1121
+ enable_sliding_window: whether to enable sliding window (trigger mode)
1122
+ slide_trigger_seconds: trigger threshold for sliding window in seconds
1123
+ slide_stride_seconds: stride for sliding window in seconds
1124
+ """
1125
+ if self.audio_processor is None:
1126
+ raise ValueError("audio_processor is not set, cannot initialize the streaming processor")
1127
+ self._init_streaming_processor(
1128
+ chunk_ms=chunk_ms,
1129
+ cnn_redundancy_ms=cnn_redundancy_ms,
1130
+ mode=mode,
1131
+ first_chunk_ms=first_chunk_ms,
1132
+ enable_sliding_window=enable_sliding_window,
1133
+ slide_trigger_seconds=slide_trigger_seconds,
1134
+ slide_stride_seconds=slide_stride_seconds,
1135
+ )
1136
+
1137
+ def process_image(
1138
+ self,
1139
+ images: Optional[ImageInput] = None,
1140
+ do_pad: bool = True,
1141
+ max_slice_nums: int = 1,
1142
+ return_tensors: str = "pt",
1143
+ ) -> MiniCPMOBatchFeature:
1144
+ """Process image data
1145
+
1146
+ Args:
1147
+ images: input images
1148
+ do_pad: whether to pad
1149
+ max_slice_nums: maximum number of slices
1150
+ return_tensors: return tensor type
1151
+ Returns:
1152
+ MiniCPMOBatchFeature object
1153
+ """
1154
+ if images is None:
1155
+ return MiniCPMOBatchFeature(data={"pixel_values": [[]], "image_sizes": [[]], "tgt_sizes": [[]]})
1156
+
1157
+ result = self.image_processor(
1158
+ images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
1159
+ )
1160
+
1161
+ model_inputs = {
1162
+ "pixel_values": result.get("pixel_values", [[]]),
1163
+ "image_sizes": result.get("image_sizes", [[]]),
1164
+ "tgt_sizes": result.get("tgt_sizes", [[]]),
1165
+ }
1166
+
1167
+ return MiniCPMOBatchFeature(data=model_inputs)
1168
+
1169
+ def process_audio(
1170
+ self,
1171
+ audios: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
1172
+ sampling_rate: int = 16000,
1173
+ regroup_to_seconds: Optional[int] = None,
1174
+ fps: int = 100,
1175
+ ) -> MiniCPMOBatchFeature:
1176
+ """Process audio data in batch
1177
+
1178
+ Args:
1179
+ audios: audio data
1180
+ sampling_rate: sampling rate
1181
+ regroup_to_seconds: regroup duration in seconds
1182
+ fps: frames per second
1183
+ Returns:
1184
+ MiniCPMOBatchFeature object
1185
+ """
1186
+ if audios is None:
1187
+ return MiniCPMOBatchFeature(data={"audio_features": [], "audio_feature_lens": []})
1188
+
1189
+ audio_features, audio_feature_lens = process_audio_batch(
1190
+ audios=audios,
1191
+ feature_extractor=self.audio_processor,
1192
+ sampling_rate=sampling_rate,
1193
+ max_duration_seconds=30,
1194
+ return_attention_mask=True,
1195
+ )
1196
+
1197
+ if regroup_to_seconds is not None and len(audio_features) > 0:
1198
+ audio_features, audio_feature_lens = regroup_audio_features(
1199
+ audio_features=audio_features,
1200
+ audio_feature_lens=audio_feature_lens,
1201
+ regroup_seconds=regroup_to_seconds,
1202
+ fps=fps,
1203
+ )
1204
+
1205
+ model_inputs = {"audio_features": audio_features, "audio_feature_lens": audio_feature_lens}
1206
+
1207
+ return MiniCPMOBatchFeature(data=model_inputs)
1208
+
1209
+ def process_audio_streaming(
1210
+ self,
1211
+ audio_chunk: np.ndarray,
1212
+ reset: bool = False,
1213
+ return_batch_feature: bool = False,
1214
+ is_last_chunk: bool = False,
1215
+ ) -> Union[Tuple[torch.Tensor, dict], MiniCPMOBatchFeature]:
1216
+ """Process audio chunk in streaming
1217
+
1218
+ Args:
1219
+ audio_chunk: audio data chunk (any audio, e.g. first process 125ms, then process 100ms)
1220
+ reset: whether to reset the processor state
1221
+ return_batch_feature: whether to return MiniCPMOBatchFeature format (consistent with process_audio)
1222
+ Returns:
1223
+ If return_batch_feature=False:
1224
+ (audio_features, info)
1225
+ - audio_features: [1, 80, n_frames] mel features
1226
+ - info: processing information dictionary
1227
+ If return_batch_feature=True:
1228
+ MiniCPMOBatchFeature object, containing:
1229
+ - audio_features: [1, 80, n_frames] mel features
1230
+ - audio_feature_lens: [tensor([n_frames])]
1231
+ - info: processing information (as an extra attribute)
1232
+ """
1233
+ if self._streaming_mel_processor is None:
1234
+ raise ValueError("Streaming processor not initialized, please ensure audio_processor is set")
1235
+
1236
+ if reset:
1237
+ self._streaming_mel_processor.reset()
1238
+
1239
+ # process chunk
1240
+ mel_features, info = self._streaming_mel_processor.process(audio_chunk, is_last_chunk=is_last_chunk)
1241
+
1242
+ # determine the return format based on the parameters
1243
+ if return_batch_feature:
1244
+ # return the format consistent with process_audio
1245
+ # note: info returns emitted_frames, which represents the actual output frames
1246
+ n_frames = info.get("emitted_frames", mel_features.shape[-1])
1247
+ model_inputs = {
1248
+ "audio_features": mel_features,
1249
+ "audio_feature_lens": [torch.tensor([n_frames])],
1250
+ "streaming_info": info, # add streaming processing information
1251
+ }
1252
+ return MiniCPMOBatchFeature(data=model_inputs)
1253
+ else:
1254
+ return mel_features, info
1255
+
1256
+ def reset_streaming(self):
1257
+ if self._streaming_mel_processor is not None:
1258
+ self._streaming_mel_processor.reset()
1259
+
1260
+ def get_streaming_chunk_size(self) -> int:
1261
+ if self._streaming_mel_processor is None:
1262
+ raise ValueError("Streaming processor not initialized")
1263
+ return self._streaming_mel_processor.get_chunk_size()
1264
+
1265
+ def configure_streaming(
1266
+ self,
1267
+ chunk_ms: int = 100,
1268
+ enable_sliding_window: bool = False,
1269
+ slide_trigger_seconds: float = 30.0,
1270
+ slide_stride_seconds: float = 10.0,
1271
+ ):
1272
+ """Configure streaming processor parameters
1273
+
1274
+ Args:
1275
+ chunk_ms: chunk size in milliseconds
1276
+ enable_sliding_window: whether to enable sliding window (trigger mode)
1277
+ slide_trigger_seconds: trigger threshold for sliding window in seconds
1278
+ slide_stride_seconds: stride for sliding window in seconds
1279
+ """
1280
+ if self.audio_processor is None:
1281
+ raise ValueError("audio_processor is not set")
1282
+
1283
+ self._init_streaming_processor(
1284
+ chunk_ms=chunk_ms,
1285
+ enable_sliding_window=enable_sliding_window,
1286
+ slide_trigger_seconds=slide_trigger_seconds,
1287
+ slide_stride_seconds=slide_stride_seconds,
1288
+ )
1289
+
1290
+ def get_streaming_config(self) -> dict:
1291
+ if self._streaming_mel_processor is None:
1292
+ return {}
1293
+ return self._streaming_mel_processor.get_config()
1294
+
1295
+ def get_streaming_state(self) -> dict:
1296
+ if self._streaming_mel_processor is None:
1297
+ return {}
1298
+ return self._streaming_mel_processor.get_state()
1299
+
1300
+ def get_streaming_snapshot(self) -> dict:
1301
+ if self._streaming_mel_processor is None:
1302
+ return {}
1303
+ return self._streaming_mel_processor.get_snapshot()
1304
+
1305
+ def restore_streaming_snapshot(self, snapshot: dict) -> None:
1306
+ if self._streaming_mel_processor is None:
1307
+ return
1308
+ if not snapshot:
1309
+ return
1310
+ self._streaming_mel_processor.restore_snapshot(snapshot)
1311
+
1312
+ def __call__(
1313
+ self,
1314
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
1315
+ images: ImageInput = None,
1316
+ audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]] = None,
1317
+ audio_parts: Optional[list] = None,
1318
+ max_length: Optional[int] = None,
1319
+ do_pad: Optional[bool] = True,
1320
+ max_slice_nums: int = None,
1321
+ use_image_id: bool = True,
1322
+ stream_input: bool = False,
1323
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
1324
+ sampling_rate: Optional[int] = 16000,
1325
+ online_streaming: bool = False,
1326
+ audio_chunk_idx: int = 0,
1327
+ is_last_chunk: bool = False,
1328
+ **kwargs,
1329
+ ) -> MiniCPMOBatchFeature:
1330
+ if images is not None:
1331
+ image_inputs = self.process_image(
1332
+ images=images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
1333
+ )
1334
+ else:
1335
+ image_inputs = None
1336
+
1337
+ audio_features, audio_feature_lens, audio_phs = self.audio_feature_extract(
1338
+ audios,
1339
+ audio_parts,
1340
+ stream_input,
1341
+ sampling_rate,
1342
+ online_streaming=online_streaming,
1343
+ is_last_chunk=is_last_chunk,
1344
+ )
1345
+
1346
+ model_inputs = self._convert_omni_to_inputs(
1347
+ image_inputs,
1348
+ audio_phs,
1349
+ text,
1350
+ max_slice_nums=max_slice_nums,
1351
+ use_image_id=use_image_id,
1352
+ max_length=max_length,
1353
+ **kwargs,
1354
+ )
1355
+
1356
+ model_inputs["audio_features"] = audio_features
1357
+ model_inputs["audio_feature_lens"] = audio_feature_lens
1358
+
1359
+ result = MiniCPMOBatchFeature(data={**model_inputs})
1360
+
1361
+ if online_streaming:
1362
+ result.use_extra_context = True
1363
+ result.prefix_extra_frames = 0 if audio_chunk_idx == 0 else 2
1364
+ result.suffix_extra_frames = 2
1365
+ result.chunk_idx = audio_chunk_idx
1366
+
1367
+ return result
1368
+
1369
    def audio_feature_extract(
        self,
        audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]], None] = None,
        audio_parts: Optional[list] = None,
        stream_input: Optional[bool] = False,
        sampling_rate: Optional[int] = None,
        chunk_length: Optional[int] = 1,
        online_streaming: bool = False,
        is_last_chunk: bool = False,
        **kwargs,
    ):
        """Extract mel features and placeholder strings for batched audio.

        Args:
            audios: a single waveform, a list of waveforms (one sample), or a
                list of lists of waveforms (a batch). ``None`` short-circuits.
            audio_parts: optional per-sample part ids; consecutive waveforms
                sharing the same id are concatenated before extraction.
            stream_input: forwarded to ``get_audio_placeholder`` as
                ``chunk_input`` (chunked placeholder layout).
            sampling_rate: input sampling rate in Hz.
                NOTE(review): defaults to None, but ``30 * sampling_rate``
                below requires a number — callers appear to always pass a
                rate; TODO confirm.
            chunk_length: placeholder chunk length in seconds.
            online_streaming: use the incremental streaming extractor
                (single audio per sample only) instead of batch extraction.
            is_last_chunk: streaming-only flag, forwarded to the streaming
                extractor.

        Returns:
            Tuple ``(audio_features, audio_feature_lens_list, audio_ph_list)``:
            the padded feature tensor (or ``[]`` when no audio), per-sample
            frame-length tensors, and per-sample placeholder strings.
        """
        if audios is None:
            return [], [], []

        # Normalize every accepted input shape to a list of lists of waveforms.
        if isinstance(audios, np.ndarray):
            audios_list = [[audios]]
        elif isinstance(audios[0], np.ndarray):
            audios_list = [audios]
        else:
            audios_list = audios

        if audio_parts is not None:
            assert len(audio_parts) == len(audios_list)
            # NOTE(review): the loop variable `audios` shadows the parameter
            # from here on; the parameter is not used again afterwards.
            for parts, audios in zip(audio_parts, audios_list):
                assert len(parts) == len(audios)

        audio_feature_lens_list = []
        audio_ph_list = []
        audio_features_all = []

        # Placeholders are computed per original (un-merged) waveform, so they
        # do not depend on audio_parts.
        for audios in audios_list:
            if audios:
                audio_ph_list.append(
                    [
                        self.get_audio_placeholder(len(a), chunk_input=stream_input, chunk_length=chunk_length)
                        for a in audios
                    ]
                )
            else:
                audio_ph_list.append([])

        for idx, audios in enumerate(audios_list):
            if audio_parts is not None:
                # Merge consecutive waveforms that belong to the same part id.
                audio_part = audio_parts[idx]
                merge_audio = []
                cur_audio = []
                for aid, (part, audio) in enumerate(zip(audio_part, audios)):
                    if aid == 0 or audio_part[aid] == audio_part[aid - 1]:
                        cur_audio.append(audio)
                    else:
                        merge_audio.append(np.hstack(cur_audio))
                        cur_audio = [audio]
                if cur_audio:
                    merge_audio.append(np.hstack(cur_audio))
            else:
                merge_audio = audios

            # If a merged audio exceeds 30 seconds, split it into 30-second chunks.
            final_merge_audio = []
            max_audio_inp_len = 30 * sampling_rate
            for audio in merge_audio:
                if len(audio) <= max_audio_inp_len:
                    final_merge_audio.append(audio)
                else:
                    for i in range(math.ceil(len(audio) / max_audio_inp_len)):
                        final_merge_audio.append(audio[i * max_audio_inp_len : (i + 1) * max_audio_inp_len])

            audio_feature_lens = []

            if audios:
                if online_streaming:
                    # Online streaming supports exactly one audio per sample and
                    # reuses process_audio_streaming's batch-feature output.
                    assert (
                        len(final_merge_audio) == 1
                    ), f"online streaming mode only supports single audio, currently there are {len(final_merge_audio)}"
                    audio = final_merge_audio[0]
                    result = self.process_audio_streaming(
                        audio, reset=False, return_batch_feature=True, is_last_chunk=is_last_chunk
                    )
                    audio_features_all.append(
                        result["audio_features"].squeeze(0)
                    )  # [1, 80, T] -> [80, T], keep consistent with batch processing
                    audio_feature_lens_list.append(result["audio_feature_lens"][0])
                else:
                    # Batch extraction: pad to max length, then trim each
                    # feature back to its true length via the attention mask.
                    audio_inputs = self.audio_processor(
                        final_merge_audio,
                        sampling_rate=sampling_rate,
                        return_attention_mask=True,
                        padding="max_length",
                        return_tensors="pt",
                        **kwargs,
                    )
                    audio_feature = audio_inputs["input_features"]
                    actual_lens = audio_inputs["attention_mask"].sum(dim=1)

                    for feat, lens in zip(audio_feature, actual_lens):
                        audio_features_all.append(feat[:, :lens])
                        audio_feature_lens.append(lens)

                    audio_feature_lens = torch.hstack(audio_feature_lens)
                    audio_feature_lens_list.append(audio_feature_lens)
            else:
                audio_feature_lens_list.append([])

        # Re-pad all collected features (across samples) to one common length.
        if audio_features_all:
            audio_features = [i.permute(1, 0) for i in audio_features_all]
            audio_features = torch.nn.utils.rnn.pad_sequence(
                audio_features, batch_first=True, padding_value=0.0
            ).permute(0, 2, 1)
        else:
            audio_features = []

        return audio_features, audio_feature_lens_list, audio_ph_list
1485
+
1486
+ def _convert(self, input_str, max_inp_length: Optional[int] = None):
1487
+ old_input_ids = self.tokenizer.encode(input_str)
1488
+
1489
+ listen_token_id = self.tokenizer.convert_tokens_to_ids("<|listen|>")
1490
+ input_ids = []
1491
+ for token in old_input_ids:
1492
+ if token != listen_token_id:
1493
+ input_ids.append(token)
1494
+
1495
+ if max_inp_length is not None:
1496
+ input_ids = input_ids[:max_inp_length]
1497
+ input_ids = torch.tensor(input_ids, dtype=torch.int32)
1498
+
1499
+ ## image bound
1500
+ start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id)
1501
+ end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id)
1502
+
1503
+ image_start_idx = torch.where(start_cond)[0]
1504
+ image_start_idx += 1
1505
+ image_end_idx = torch.where(end_cond)[0]
1506
+
1507
+ valid_image_nums = max(len(image_start_idx), len(image_end_idx))
1508
+
1509
+ image_bounds = torch.hstack(
1510
+ [
1511
+ image_start_idx[:valid_image_nums].unsqueeze(-1),
1512
+ image_end_idx[:valid_image_nums].unsqueeze(-1),
1513
+ ]
1514
+ )
1515
+
1516
+ ## audio bound
1517
+ audio_start_idx = torch.where(input_ids == self.tokenizer.audio_start_id)[0]
1518
+ audio_end_idx = torch.where(input_ids == self.tokenizer.audio_end_id)[0]
1519
+ assert len(audio_start_idx) == len(audio_end_idx)
1520
+ audio_bounds = torch.hstack([(audio_start_idx + 1).unsqueeze(-1), audio_end_idx.unsqueeze(-1)])
1521
+
1522
+ spk_start_idx = torch.where(input_ids == self.tokenizer.spk_start_id)[0]
1523
+ spk_end_idx = torch.where(input_ids == self.tokenizer.spk_end_id)[0]
1524
+ assert len(spk_start_idx) == len(spk_end_idx)
1525
+ spk_bounds = torch.hstack([(spk_start_idx + 1).unsqueeze(-1), spk_end_idx.unsqueeze(-1)])
1526
+
1527
+ return input_ids, image_bounds, audio_bounds, spk_bounds
1528
+
1529
    def _convert_omni_to_inputs(
        self,
        images,
        audio_phs,
        texts: Union[str, List[str]],
        truncation=None,
        max_length=None,
        max_slice_nums=None,
        use_image_id=None,
        return_tensors=None,
        **kwargs,
    ):
        """Expand ``<image>./</image>`` / ``<audio>./</audio>`` tags in ``texts``
        into placeholder tokens, tokenize, and left-pad into a batch.

        Args:
            images: output of ``process_image`` (``pixel_values``,
                ``image_sizes``, ``tgt_sizes``) or ``None``.
            audio_phs: per-sample list of audio placeholder strings, aligned
                with the ``<audio>`` tags in each text.
            texts: one prompt or a batch of prompts.
            truncation/max_length: forwarded to the tokenizer (text-only path);
                ``max_length`` also truncates the expanded token ids.
            max_slice_nums, use_image_id: forwarded to the image placeholder
                builder.
            return_tensors: tokenizer tensor type (text-only path).

        Returns:
            Dict with padded ``input_ids``/``attention_mask`` plus image/audio
            tensors and the (pad-shifted) modality bound index lists.
        """
        # Pure-text fast path: no placeholder expansion needed.
        if images is None and audio_phs is None:
            model_inputs = self.tokenizer(
                texts, return_tensors=return_tensors, truncation=truncation, max_length=max_length, **kwargs
            )
            return MiniCPMOBatchFeature(data={**model_inputs})

        # NOTE(review): patterns are used unescaped, so "." is a regex
        # wildcard; exact literal tags still match — confirm no broader
        # matches are possible in practice.
        image_pattern = "<image>./</image>"
        audio_pattern = "<audio>./</audio>"
        # Capturing group makes re.split keep the tag chunks in the output.
        split_pattern = f"({image_pattern}|{audio_pattern})"

        if isinstance(texts, str):
            texts = [texts]

        bs = len(texts)
        if images is not None:
            images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
        else:
            # NOTE: [[]] * bs creates bs references to one shared inner list.
            images, image_sizes, tgt_sizes = [[]] * bs, [[]] * bs, [[]] * bs

        input_ids_list = []
        image_bounds_list = []
        audio_bounds_list = []
        spk_bounds_list = []

        for index, text in enumerate(texts):
            text_chunks = re.split(split_pattern, text)

            image_tags = re.findall(image_pattern, text)
            audio_tags = re.findall(audio_pattern, text)

            # Each tag must have a matching processed image / placeholder.
            if image_tags:
                assert images is not None
                assert len(image_tags) == len(image_sizes[index])
            if audio_tags:
                assert audio_phs is not None
                assert len(audio_tags) == len(audio_phs[index])

            # Replace each tag chunk in order with its placeholder text.
            image_id = 0
            audio_id = 0
            for i, chunk in enumerate(text_chunks):
                if chunk == image_pattern:
                    image_placeholder = self.image_processor.get_slice_image_placeholder(
                        image_sizes[index][image_id], image_id, max_slice_nums, use_image_id
                    )
                    image_id += 1
                    text_chunks[i] = image_placeholder
                elif chunk == audio_pattern:
                    audio_placeholder = audio_phs[index][audio_id]
                    audio_id += 1
                    text_chunks[i] = audio_placeholder

            final_text = "".join(text_chunks)
            input_ids, image_bounds, audio_bounds, spk_bounds = self._convert(final_text, max_length)

            input_ids_list.append(input_ids)
            image_bounds_list.append(image_bounds)
            audio_bounds_list.append(audio_bounds)
            spk_bounds_list.append(spk_bounds)

        # Left-pad the batch, then shift every bound index by the pad amount
        # and mask out the pad positions.
        padded_input_ids, padding_lengths = self.pad(input_ids_list, padding_side="left")
        attention_mask = torch.ones_like(padded_input_ids, dtype=torch.bool)
        for i, length in enumerate(padding_lengths):
            image_bounds_list[i] = image_bounds_list[i] + length
            audio_bounds_list[i] = audio_bounds_list[i] + length
            spk_bounds_list[i] = spk_bounds_list[i] + length
            attention_mask[i, :length] = False

        data = {
            "input_ids": padded_input_ids,
            "attention_mask": attention_mask,
            "pixel_values": images,
            "image_sizes": image_sizes,
            "image_bound": image_bounds_list,
            "tgt_sizes": tgt_sizes,
            "audio_bounds": audio_bounds_list,
            "spk_bounds": spk_bounds_list,
        }

        return data
1620
+
1621
+ def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
1622
+ items = []
1623
+ if isinstance(inputs[0], list):
1624
+ assert isinstance(inputs[0][0], torch.Tensor)
1625
+ for it in inputs:
1626
+ for tr in it:
1627
+ items.append(tr)
1628
+ else:
1629
+ assert isinstance(inputs[0], torch.Tensor)
1630
+ items = inputs
1631
+
1632
+ batch_size = len(items)
1633
+ shape = items[0].shape
1634
+ dim = len(shape)
1635
+ assert dim <= 2
1636
+ if max_length is None:
1637
+ max_length = 0
1638
+ max_length = max(max_length, max(item.shape[-1] for item in items))
1639
+ min_length = min(item.shape[-1] for item in items)
1640
+ dtype = items[0].dtype
1641
+
1642
+ if dim == 0:
1643
+ return torch.stack([item for item in items], dim=0), [0]
1644
+ elif dim == 1:
1645
+ if max_length == min_length:
1646
+ return torch.stack([item for item in items], dim=0), [0] * batch_size
1647
+ tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
1648
+ else:
1649
+ tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
1650
+
1651
+ padding_length = []
1652
+ for i, item in enumerate(items):
1653
+ if dim == 1:
1654
+ if padding_side == "left":
1655
+ tensor[i, -len(item) :] = item.clone()
1656
+ else:
1657
+ tensor[i, : len(item)] = item.clone()
1658
+ elif dim == 2:
1659
+ if padding_side == "left":
1660
+ tensor[i, -len(item) :, :] = item.clone()
1661
+ else:
1662
+ tensor[i, : len(item), :] = item.clone()
1663
+ padding_length.append(tensor.shape[-1] - len(item))
1664
+
1665
+ return tensor, padding_length
special_tokens_map.json ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<image>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "</image>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<ref>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "</ref>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<box>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "</box>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "<quad>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "</quad>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "<point>",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "</point>",
75
+ "lstrip": false,
76
+ "normalized": false,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "<slice>",
82
+ "lstrip": false,
83
+ "normalized": false,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "</slice>",
89
+ "lstrip": false,
90
+ "normalized": false,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "<image_id>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "</image_id>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "<unit>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "</unit>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "<answer>",
124
+ "lstrip": false,
125
+ "normalized": false,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "</answer>",
131
+ "lstrip": false,
132
+ "normalized": false,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "<focus>",
138
+ "lstrip": false,
139
+ "normalized": false,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ },
143
+ {
144
+ "content": "</focus>",
145
+ "lstrip": false,
146
+ "normalized": false,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ },
150
+ {
151
+ "content": "<line>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false
156
+ },
157
+ {
158
+ "content": "</line>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false
163
+ },
164
+ {
165
+ "content": "<perception>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false
170
+ },
171
+ {
172
+ "content": "</perception>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false
177
+ },
178
+ {
179
+ "content": "<source_image>",
180
+ "lstrip": false,
181
+ "normalized": false,
182
+ "rstrip": false,
183
+ "single_word": false
184
+ },
185
+ {
186
+ "content": "</source_image>",
187
+ "lstrip": false,
188
+ "normalized": false,
189
+ "rstrip": false,
190
+ "single_word": false
191
+ },
192
+ {
193
+ "content": "<image_save_to>",
194
+ "lstrip": false,
195
+ "normalized": false,
196
+ "rstrip": false,
197
+ "single_word": false
198
+ },
199
+ {
200
+ "content": "</image_save_to>",
201
+ "lstrip": false,
202
+ "normalized": false,
203
+ "rstrip": false,
204
+ "single_word": false
205
+ },
206
+ {
207
+ "content": "<|audio_start|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false
212
+ },
213
+ {
214
+ "content": "<|audio|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false
219
+ },
220
+ {
221
+ "content": "<|audio_end|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false
226
+ },
227
+ {
228
+ "content": "<|spk_bos|>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false
233
+ },
234
+ {
235
+ "content": "<|spk|>",
236
+ "lstrip": false,
237
+ "normalized": false,
238
+ "rstrip": false,
239
+ "single_word": false
240
+ },
241
+ {
242
+ "content": "<|spk_eos|>",
243
+ "lstrip": false,
244
+ "normalized": false,
245
+ "rstrip": false,
246
+ "single_word": false
247
+ },
248
+ {
249
+ "content": "<|tts_bos|>",
250
+ "lstrip": false,
251
+ "normalized": false,
252
+ "rstrip": false,
253
+ "single_word": false
254
+ },
255
+ {
256
+ "content": "<|tts_eos|>",
257
+ "lstrip": false,
258
+ "normalized": false,
259
+ "rstrip": false,
260
+ "single_word": false
261
+ },
262
+ {
263
+ "content": "<|listen|>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false
268
+ },
269
+ {
270
+ "content": "<|speak|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false
275
+ },
276
+ {
277
+ "content": "<|interrupt|>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false
282
+ },
283
+ {
284
+ "content": "<|vad_start|>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false
289
+ },
290
+ {
291
+ "content": "<|vad_end|>",
292
+ "lstrip": false,
293
+ "normalized": false,
294
+ "rstrip": false,
295
+ "single_word": false
296
+ },
297
+ {
298
+ "content": "<|emotion_start|>",
299
+ "lstrip": false,
300
+ "normalized": false,
301
+ "rstrip": false,
302
+ "single_word": false
303
+ },
304
+ {
305
+ "content": "<|emotion_end|>",
306
+ "lstrip": false,
307
+ "normalized": false,
308
+ "rstrip": false,
309
+ "single_word": false
310
+ },
311
+ {
312
+ "content": "<|speed_start|>",
313
+ "lstrip": false,
314
+ "normalized": false,
315
+ "rstrip": false,
316
+ "single_word": false
317
+ },
318
+ {
319
+ "content": "<|speed_end|>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false
324
+ },
325
+ {
326
+ "content": "<|pitch_start|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false
331
+ },
332
+ {
333
+ "content": "<|pitch_end|>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false
338
+ },
339
+ {
340
+ "content": "<|turn_bos|>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false
345
+ },
346
+ {
347
+ "content": "<|turn_eos|>",
348
+ "lstrip": false,
349
+ "normalized": false,
350
+ "rstrip": false,
351
+ "single_word": false
352
+ },
353
+ {
354
+ "content": "<|chunk_eos|>",
355
+ "lstrip": false,
356
+ "normalized": false,
357
+ "rstrip": false,
358
+ "single_word": false
359
+ },
360
+ {
361
+ "content": "<|chunk_bos|>",
362
+ "lstrip": false,
363
+ "normalized": false,
364
+ "rstrip": false,
365
+ "single_word": false
366
+ },
367
+ {
368
+ "content": "<|chunk_tts_bos|>",
369
+ "lstrip": false,
370
+ "normalized": false,
371
+ "rstrip": false,
372
+ "single_word": false
373
+ },
374
+ {
375
+ "content": "<|chunk_tts_eos|>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false
380
+ },
381
+ {
382
+ "content": "<|tts_pad|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false
387
+ },
388
+ {
389
+ "content": "<|timbre_7|>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false
394
+ },
395
+ {
396
+ "content": "<|timbre_8|>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false
401
+ },
402
+ {
403
+ "content": "<|timbre_9|>",
404
+ "lstrip": false,
405
+ "normalized": false,
406
+ "rstrip": false,
407
+ "single_word": false
408
+ },
409
+ {
410
+ "content": "<|timbre_10|>",
411
+ "lstrip": false,
412
+ "normalized": false,
413
+ "rstrip": false,
414
+ "single_word": false
415
+ },
416
+ {
417
+ "content": "<|timbre_11|>",
418
+ "lstrip": false,
419
+ "normalized": false,
420
+ "rstrip": false,
421
+ "single_word": false
422
+ },
423
+ {
424
+ "content": "<|timbre_12|>",
425
+ "lstrip": false,
426
+ "normalized": false,
427
+ "rstrip": false,
428
+ "single_word": false
429
+ },
430
+ {
431
+ "content": "<|timbre_13|>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false
436
+ },
437
+ {
438
+ "content": "<|timbre_14|>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false
443
+ },
444
+ {
445
+ "content": "<|timbre_15|>",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false
450
+ },
451
+ {
452
+ "content": "<|timbre_16|>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false
457
+ },
458
+ {
459
+ "content": "<|timbre_17|>",
460
+ "lstrip": false,
461
+ "normalized": false,
462
+ "rstrip": false,
463
+ "single_word": false
464
+ },
465
+ {
466
+ "content": "<|timbre_18|>",
467
+ "lstrip": false,
468
+ "normalized": false,
469
+ "rstrip": false,
470
+ "single_word": false
471
+ },
472
+ {
473
+ "content": "<|timbre_19|>",
474
+ "lstrip": false,
475
+ "normalized": false,
476
+ "rstrip": false,
477
+ "single_word": false
478
+ },
479
+ {
480
+ "content": "<|timbre_20|>",
481
+ "lstrip": false,
482
+ "normalized": false,
483
+ "rstrip": false,
484
+ "single_word": false
485
+ },
486
+ {
487
+ "content": "<|timbre_21|>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false
492
+ },
493
+ {
494
+ "content": "<|timbre_22|>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false
499
+ },
500
+ {
501
+ "content": "<|timbre_23|>",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false
506
+ },
507
+ {
508
+ "content": "<|timbre_24|>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false
513
+ },
514
+ {
515
+ "content": "<|timbre_25|>",
516
+ "lstrip": false,
517
+ "normalized": false,
518
+ "rstrip": false,
519
+ "single_word": false
520
+ },
521
+ {
522
+ "content": "<|timbre_26|>",
523
+ "lstrip": false,
524
+ "normalized": false,
525
+ "rstrip": false,
526
+ "single_word": false
527
+ },
528
+ {
529
+ "content": "<|timbre_27|>",
530
+ "lstrip": false,
531
+ "normalized": false,
532
+ "rstrip": false,
533
+ "single_word": false
534
+ },
535
+ {
536
+ "content": "<|timbre_28|>",
537
+ "lstrip": false,
538
+ "normalized": false,
539
+ "rstrip": false,
540
+ "single_word": false
541
+ },
542
+ {
543
+ "content": "<|timbre_29|>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false
548
+ },
549
+ {
550
+ "content": "<|timbre_30|>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false
555
+ },
556
+ {
557
+ "content": "<|timbre_31|>",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false
562
+ }
563
+ ],
564
+ "bos_token": "<|im_start|>",
565
+ "eos_token": {
566
+ "content": "<|im_end|>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false
571
+ },
572
+ "pad_token": {
573
+ "content": "<|endoftext|>",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false
578
+ },
579
+ "unk_token": "<unk>"
580
+ }
tokenization_minicpmo_fast.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright 2026 The OpenBMB Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from typing import List
18
+
19
+ from transformers import Qwen2TokenizerFast
20
+
21
+
22
class MiniCPMOTokenizerFast(Qwen2TokenizerFast):
    """Fast tokenizer for MiniCPM-O.

    Extends :class:`~transformers.Qwen2TokenizerFast` with the multimodal
    marker tokens used by the model (image/region markers and audio/TTS
    markers) and convenience properties that resolve each marker to its
    vocabulary id.
    """

    def __init__(self, **kwargs):
        # Pull out our custom kwarg before the base class sees it; the
        # base tokenizer would reject unknown keyword arguments.
        self._bad_token_ids = kwargs.pop("bad_token_ids", [])

        super().__init__(**kwargs)

        # Image / region marker tokens.
        self.im_start = "<image>"
        self.im_end = "</image>"
        self.ref_start = "<ref>"
        self.ref_end = "</ref>"
        self.box_start = "<box>"
        self.box_end = "</box>"
        self.quad_start = "<quad>"
        self.quad_end = "</quad>"
        self.slice_start = "<slice>"
        self.slice_end = "</slice>"
        self.im_id_start = "<image_id>"
        self.im_id_end = "</image_id>"

        # Audio / speech marker tokens.
        self.audio_start = "<|audio_start|>"
        self.audio_end = "<|audio_end|>"
        self.spk_start = "<|spk_bos|>"
        self.spk_end = "<|spk_eos|>"
        self.tts_start = "<|tts_bos|>"
        self.tts_end = "<|tts_eos|>"

    def _id_of(self, token: str) -> int:
        """Resolve a marker token string to its vocabulary id.

        Looked up lazily on each access (not cached) so the result stays
        correct if tokens are added to the vocabulary after construction.
        """
        return self.convert_tokens_to_ids(token)

    @property
    def eos_id(self):
        """Alias for ``eos_token_id``."""
        return self.eos_token_id

    @property
    def bos_id(self):
        """Alias for ``bos_token_id``."""
        return self.bos_token_id

    @property
    def unk_id(self):
        """Alias for ``unk_token_id``."""
        return self.unk_token_id

    @property
    def im_start_id(self):
        return self._id_of(self.im_start)

    @property
    def im_end_id(self):
        return self._id_of(self.im_end)

    @property
    def slice_start_id(self):
        return self._id_of(self.slice_start)

    @property
    def slice_end_id(self):
        return self._id_of(self.slice_end)

    @property
    def im_id_start_id(self):
        return self._id_of(self.im_id_start)

    @property
    def im_id_end_id(self):
        return self._id_of(self.im_id_end)

    @property
    def audio_start_id(self):
        return self._id_of(self.audio_start)

    @property
    def audio_end_id(self):
        return self._id_of(self.audio_end)

    @property
    def spk_start_id(self):
        return self._id_of(self.spk_start)

    @property
    def spk_end_id(self):
        return self._id_of(self.spk_end)

    @property
    def tts_start_id(self):
        return self._id_of(self.tts_start)

    @property
    def tts_end_id(self):
        return self._id_of(self.tts_end)

    @staticmethod
    def escape(text: str) -> str:
        """No-op escape hook kept for API compatibility."""
        return text

    @staticmethod
    def unescape(text: str) -> str:
        """No-op unescape hook kept for API compatibility."""
        return text

    @property
    def bad_token_ids(self) -> List[int]:
        """Token ids supplied at construction via ``bad_token_ids``."""
        return self._bad_token_ids