PregoPal / _modelscope.html
J.B-Lin
全双工语音对话实现
edca135
Raw
History Blame Contribute Delete
145 kB
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>
<meta name="applicable-device"content="pc" />
<meta
name="viewport"
content="width=device-width, initial-scale=1, maximum-scale=1, minimum-scale=1, user-scalable=no"
/>
<title>MiniCPM-o-4_5</title>
<meta name="keywords" content="modelscope,魔搭社区,模型" data-rh="true" />
<meta name="description" content="ModelScope——汇聚各领域先进的机器学习模型,提供模型探索体验、推理、训练、部署和应用的一站式服务。在这里,共建模型开源社区,发现、学习、定制和分享心仪的模型。" />
<meta property="og:title" content="MiniCPM-o-4_5" />
<meta property="og:url" content="//www.modelscope.cn/models/OpenBMB/MiniCPM-o-4_5" />
<meta property="og:image" content="https://cdn.modelscope.cn/social-thumbnails/models/OpenBMB/MiniCPM-o-4_5.png" />
<meta property="og:description" content="ModelScope——汇聚各领域先进的机器学习模型,提供模型探索体验、推理、训练、部署和应用的一站式服务。在这里,共建模型开源社区,发现、学习、定制和分享心仪的模型。" />
<meta name="twitter:card" content="summary_large_image" />
<meta name="twitter:site" content="@ModelScope2022" />
<meta name="twitter:image" content="https://cdn.modelscope.cn/social-thumbnails/models/OpenBMB/MiniCPM-o-4_5.png" />
<link rel="search" type="application/opensearchdescription+xml" title="魔搭" href="/opensearch.xml">
<link rel="dns-prefetch" href="//g.alicdn.com" />
<link rel="icon" type="image/x-icon" href="//g.alicdn.com/sail-web/maas/2.13.89/favicon/128.ico">
<link
rel="stylesheet"
href="//g.alicdn.com/sail-web/maas/2.13.89/umi.css"
/>
<script src="//o.alicdn.com/frontend-lib/common-lib/jquery.min.js"></script>
<script>
window.routerBase = "/";
window.publicPath = "//g.alicdn.com\/sail-web\/maas\/2.13.89/";
window.__detail_data__ = "{\"AigcAttributes\":\"{}\",\"AigcIsTop\":0,\"AigcType\":\"\",\"AlreadyStar\":false,\"ApplyMeta\":\"{}\",\"ApprovalMode\":1,\"ApprovalNotifyEmail\":\"\",\"Architectures\":[\"MiniCPMO\"],\"Avatar\":\"https://img.alicdn.com/imgextra/i1/O1CN01yhHrHg1Pdl3UKPhGc_!!6000000001864-2-tps-88-88.png\",\"Backbone\":[],\"BackendSupport\":{\"architectures\":null,\"backend_info\":{\"deploy_task\":null,\"lmdeploy\":null,\"lmdeploy_turbomind\":null,\"ollama\":null,\"sglang\":null,\"vllm\":null},\"model_id\":\"OpenBMB/MiniCPM-o-4_5\"},\"BaseModel\":null,\"BaseModelRelation\":\"\",\"CardReady\":0,\"CardUnreadyReason\":\"\",\"CertificationCreateBy\":\"\",\"CertificationCreatedTime\":-62135596800,\"ChineseName\":\"\",\"CoverImages\":[],\"CreatedBy\":\"zerg13\",\"CreatedTime\":1770094933,\"DashSdkParameter\":\"\",\"Datasets\":{},\"DemoAvailable\":0,\"DemoUnavailableReason\":\"\",\"Description\":\"\",\"Domain\":[],\"Downloads\":51792,\"ExampleCodeAvailable\":0,\"ExampleCodeUnavailableReason\":\"\",\"ForbiddenVisibilityUpdate\":false,\"Frameworks\":[\"Pytorch\"],\"FromSite\":\"maas\",\"Id\":654738,\"Integrating\":0,\"IntegrationFailureLog\":\"\",\"IntegrationFailureReason\":\"\",\"IsAccessible\":1,\"IsCertification\":4,\"IsHot\":0,\"IsNewModel\":false,\"IsOnline\":1,\"IsPreTrain\":0,\"IsPublished\":1,\"IsTop\":0,\"Language\":[],\"LastUpdatedTime\":1779282745,\"Libraries\":[\"pytorch\",\"transformer\",\"onnx\",\"safetensors\"],\"License\":\"apache-2.0\",\"LicenseLink\":\"\",\"LicenseName\":\"\",\"Meta\":\"\",\"Metrics\":[],\"ModelDetail\":{},\"ModelInfos\":{\"safetensor\":{\"chat_template\":\"{%- if tools %}\\n {{- '\\u003c|im_start|\\u003esystem\\\\n' }}\\n {%- if messages[0].role == 'system' %}\\n {{- messages[0].content + '\\\\n\\\\n' }}\\n {%- endif %}\\n {{- \\\"# Tools\\\\n\\\\nYou may call one or more functions to assist with the user query.\\\\n\\\\nYou are provided with function signatures within \\u003ctools\\u003e\\u003c/tools\\u003e XML tags:\\\\n\\u003ctools\\u003e\\\" }}\\n {%- for tool in tools %}\\n {{- \\\"\\\\n\\\" }}\\n {{- tool | tojson }}\\n {%- endfor %}\\n {{- \\\"\\\\n\\u003c/tools\\u003e\\\\n\\\\nFor each function call, return a json object with function name and arguments within \\u003ctool_call\\u003e\\u003c/tool_call\\u003e XML tags:\\\\n\\u003ctool_call\\u003e\\\\n{\\\\\\\"name\\\\\\\": \\u003cfunction-name\\u003e, \\\\\\\"arguments\\\\\\\": \\u003cargs-json-object\\u003e}\\\\n\\u003c/tool_call\\u003e\\u003c|im_end|\\u003e\\\\n\\\" }}\\n{%- else %}\\n {%- if messages[0].role == 'system' %}\\n {{- '\\u003c|im_start|\\u003esystem\\\\n' + messages[0].content + '\\u003c|im_end|\\u003e\\\\n' }}\\n {%- endif %}\\n{%- endif %}\\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\\n{%- for message in messages[::-1] %}\\n {%- set index = (messages|length - 1) - loop.index0 %}\\n {%- if ns.multi_step_tool and message.role == \\\"user\\\" and not(message.content.startswith('\\u003ctool_response\\u003e') and message.content.endswith('\\u003c/tool_response\\u003e')) %}\\n {%- set ns.multi_step_tool = false %}\\n {%- set ns.last_query_index = index %}\\n {%- endif %}\\n{%- endfor %}\\n{%- for message in messages %}\\n {%- if (message.role == \\\"user\\\") or (message.role == \\\"system\\\" and not loop.first) %}\\n {{- '\\u003c|im_start|\\u003e' + message.role + '\\\\n' + message.content + '\\u003c|im_end|\\u003e' + '\\\\n' }}\\n {%- elif message.role == \\\"assistant\\\" %}\\n {%- set content = message.content %}\\n {%- set reasoning_content = '' %}\\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\\n {%- set reasoning_content = message.reasoning_content %}\\n {%- else %}\\n {%- if '\\u003c/think\\u003e' in message.content %}\\n {%- set content = message.content.split('\\u003c/think\\u003e')[-1].lstrip('\\\\n') %}\\n {%- set reasoning_content = message.content.split('\\u003c/think\\u003e')[0].rstrip('\\\\n').split('\\u003cthink\\u003e')[-1].lstrip('\\\\n') %}\\n {%- endif %}\\n {%- endif %}\\n {%- if loop.index0 \\u003e ns.last_query_index %}\\n {%- if loop.last or (not loop.last and reasoning_content) %}\\n {{- '\\u003c|im_start|\\u003e' + message.role + '\\\\n\\u003cthink\\u003e\\\\n' + reasoning_content.strip('\\\\n') + '\\\\n\\u003c/think\\u003e\\\\n\\\\n' + content.lstrip('\\\\n') }}\\n {%- else %}\\n {{- '\\u003c|im_start|\\u003e' + message.role + '\\\\n' + content }}\\n {%- endif %}\\n {%- else %}\\n {{- '\\u003c|im_start|\\u003e' + message.role + '\\\\n' + content }}\\n {%- endif %}\\n {%- if message.tool_calls %}\\n {%- for tool_call in message.tool_calls %}\\n {%- if (loop.first and content) or (not loop.first) %}\\n {{- '\\\\n' }}\\n {%- endif %}\\n {%- if tool_call.function %}\\n {%- set tool_call = tool_call.function %}\\n {%- endif %}\\n {{- '\\u003ctool_call\\u003e\\\\n{\\\"name\\\": \\\"' }}\\n {{- tool_call.name }}\\n {{- '\\\", \\\"arguments\\\": ' }}\\n {%- if tool_call.arguments is string %}\\n {{- tool_call.arguments }}\\n {%- else %}\\n {{- tool_call.arguments | tojson }}\\n {%- endif %}\\n {{- '}\\\\n\\u003c/tool_call\\u003e' }}\\n {%- endfor %}\\n {%- endif %}\\n {{- '\\u003c|im_end|\\u003e\\\\n' }}\\n {%- elif message.role == \\\"tool\\\" %}\\n {%- if loop.first or (messages[loop.index0 - 1].role != \\\"tool\\\") %}\\n {{- '\\u003c|im_start|\\u003euser' }}\\n {%- endif %}\\n {{- '\\\\n\\u003ctool_response\\u003e\\\\n' }}\\n {{- message.content }}\\n {{- '\\\\n\\u003c/tool_response\\u003e' }}\\n {%- if loop.last or (messages[loop.index0 + 1].role != \\\"tool\\\") %}\\n {{- '\\u003c|im_end|\\u003e\\\\n' }}\\n {%- endif %}\\n {%- endif %}\\n{%- endfor %}\\n{%- if add_generation_prompt %}\\n {{- '\\u003c|im_start|\\u003eassistant\\\\n' }}\\n {%- if enable_thinking is defined and enable_thinking is false %}\\n {{- '\\u003cthink\\u003e\\\\n\\\\n\\u003c/think\\u003e\\\\n\\\\n' }}\\n {%- endif %}\\n {%- if use_tts_template is defined and use_tts_template is true %}\\n {{- '\\u003c|tts_bos|\\u003e' }}\\n {%- endif %}\\n{%- endif %}\",\"files\":[{\"name\":\"model.safetensors.index.json\",\"sha256\":\"e578de05a95804bb15237a6fd7c236414e0160cd76751da82c9f7f0d134596e7\",\"size\":117202},{\"name\":\"model-00001-of-00004.safetensors\",\"sha256\":\"30c40b9a10386c1bc404568d8829e5aada2e3501d9de6fb46ff80451aff7e077\",\"size\":5273477136},{\"name\":\"model-00002-of-00004.safetensors\",\"sha256\":\"fe0faef420aca8f771bec1f9fcfaae01206c56b8e64e4429a1ff500c962fbf96\",\"size\":5301855080},{\"name\":\"model-00003-of-00004.safetensors\",\"sha256\":\"5d0b20153f9bfa88ebcd1cb6bbd5b7cac4c217b3cafb137a291a1b380d8b7821\",\"size\":5301855048},{\"name\":\"model-00004-of-00004.safetensors\",\"sha256\":\"f61addf4747c94fedcaee059e5d9918ed15543beec494404139a99f2f86c9b31\",\"size\":2866549964}],\"model_size\":9371787666,\"tensor_type\":[\"BF16\"]}},\"ModelRevisions\":null,\"ModelSource\":\"USER_UPLOAD\",\"ModelTools\":\"\",\"ModelType\":[\"minicpmo\"],\"MuseInfo\":null,\"NEXA\":{\"Catalogues\":null,\"ModelCover\":\"\",\"ScientificField\":\"\",\"Source\":\"\",\"SubScientificField\":null},\"Name\":\"MiniCPM-o-4_5\",\"NewVersion\":\"\",\"NickName\":\"tc-mb\",\"OfficialTags\":null,\"OpenAiSwingDeployInfo\":{\"Order\":0,\"Recommend\":null,\"lmdeploy\":{\"eas\":{\"Script\":\"\",\"requirements\":\"\"},\"ens\":{\"Script\":\"\",\"requirements\":\"\"},\"fc\":{\"Script\":\"\",\"requirements\":\"\"},\"image_tag\":\"\"},\"ollama\":{\"eas\":{\"Script\":\"\",\"requirements\":\"\"},\"ens\":{\"Script\":\"\",\"requirements\":\"\"},\"fc\":{\"Script\":\"\",\"requirements\":\"\"},\"image_tag\":\"\"},\"pipeline\":{\"eas\":{\"Script\":\"\",\"requirements\":\"\"},\"ens\":{\"Script\":\"\",\"requirements\":\"\"},\"fc\":{\"Script\":\"\",\"requirements\":\"\"},\"image_tag\":\"\"},\"vllm\":{\"eas\":{\"Script\":\"\",\"requirements\":\"\"},\"ens\":{\"Script\":\"\",\"requirements\":\"\"},\"fc\":{\"Script\":\"\",\"requirements\":\"\"},\"image_tag\":\"\"}},\"Organization\":{\"ApplyFailureReason\":\"\",\"ApplyReason\":\"\",\"Avatar\":\"https://resouces.modelscope.cn/avatar/e23b1834-049d-464e-8ffc-4b10093114d0.png\",\"CreateCompetition\":false,\"CreatedBy\":\"hicicada\",\"Description\":\"[\\\"root\\\",{},[\\\"p\\\",{},[\\\"span\\\",{\\\"data-type\\\":\\\"text\\\"},[\\\"span\\\",{\\\"color\\\":\\\"rgb(101, 109, 118)\\\",\\\"data-type\\\":\\\"leaf\\\"},\\\"OpenBMB (Open Lab for Big Model Base) aims to build foundation models and systems towards AGI.\\\"]]]]\",\"DisplayUrl\":\"\",\"Email\":\"\",\"FromSite\":\"\",\"FullName\":\"OpenBMB\",\"GithubAddress\":\"https://github.com/OpenBMB\",\"GmtCreated\":\"2023-03-21T07:55:04Z\",\"GmtModified\":\"2025-01-07T09:42:14Z\",\"Id\":63,\"InitAdminMembers\":\"\",\"IsApply\":false,\"IsCertification\":\"\",\"Mobile\":\"\",\"Name\":\"OpenBMB\",\"Path\":\"\",\"Roles\":null,\"StarCnt\":0,\"Status\":0,\"SubscribeVo\":null,\"Type\":2},\"PaiModelGalleryUrl\":null,\"PaiSdkParameter\":null,\"Path\":\"OpenBMB\",\"ProtectedMode\":2,\"ReadMeContent\":\"\\nA Gemini 2.5 Flash Level MLLM for Vision, Speech, and Full-Duplex Mulitmodal Live Streaming on Your Phone\\n\\n[GitHub](https://github.com/OpenBMB/MiniCPM-o) | [CookBook](https://github.com/OpenSQZ/MiniCPM-V-CookBook) | [Omni-modal Demo](https://openbmb.github.io/MiniCPM-o-Demo/) | [Vision-Language Demo](http://211.93.21.133:18121/) \\u003c/br\\u003e \\n[WeChat](https://github.com/OpenBMB/MiniCPM-o/blob/main/docs/wechat.md) | [Discord](https://discord.gg/N2RnxGdJ) | CaseBook([Audio](https://openbmb.github.io/minicpm-o-4_5/), [Omni Full-Duplex](https://openbmb.github.io/minicpm-o-4_5-omni/))\\n\\n\\n## News\\n\\n* [2026.05.17] ⭐️⭐️⭐️ We release the API service of MiniCPM-o 4.5, supporting both traditional text and vision-language requests, and also full-duplex realtime interaction! Try [it](https://github.com/OpenBMB/MiniCPM-V/blob/main/docs/api.md) now.\\n* [2026.02.06] 🥳 🥳 🥳 We open-sourced a realtime web demo deployable on your own devices like Mac or GPU. [Try it now](#deploy-a-realtime-web-demo-on-your-own-device)!\\n\\n\\n## MiniCPM-o 4.5\\n\\n**MiniCPM-o 4.5** is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip2, Whisper-medium, CosyVoice2, and Qwen3-8B with a total of 9B parameters. It exhibits a significant performance improvement, and introduces new features for full-duplex multimodal live streaming. Notable features of MiniCPM-o 4.5 include:\\n\\n- 🔥 **Leading Visual Capability.**\\n MiniCPM-o 4.5 achieves an average score of 77.6 on OpenCompass, a comprehensive evaluation of 8 popular benchmarks. **With only 9B parameters, it surpasses widely used proprietary models like GPT-4o, Gemini 2.0 Pro, and approaches Gemini 2.5 Flash** for vision-language capabilities. It supports instruct and thinking modes in a single model, better covering efficiency and performance trade-offs in different user scenarios.\\n\\n- 🎙 **Strong Speech Capability.** \\n MiniCPM-o 4.5 supports **bilingual real-time speech conversation with configurable voices** in English and Chinese. It features **more natural, expressive and stable speech conversation**. The model also allows for fun features such as **voice cloning and role play via a simple reference audio clip**, where the cloning performance surpasses strong TTS tools such as CosyVoice2.\\n\\n- 🎬 **New Full-Duplex and Proactive Multimodal Live Streaming Capability.** \\n As a new feature, MiniCPM-o 4.5 can process real-time, continuous video and audio input streams simultaneously while generating concurrent text and speech output streams in an end-to-end fashion, without mutual blocking. This **allows MiniCPM-o 4.5 to see, listen, and speak simultaneously**, creating a fluid, real-time omnimodal conversation experience. Beyond reactive responses, the model can also perform **proactive interaction**, such as initiating reminders or comments based on its continuous understanding of the live scene. \\n\\n- 💪 **Strong OCR Capability, Efficiency and Others.**\\nAdvancing popular visual capabilities from MiniCPM-V series, MiniCPM-o 4.5 can process **high-resolution images** (up to 1.8 million pixels) and **high-FPS videos** (up to 10fps) in any aspect ratio efficiently. It achieves **state-of-the-art performance for end-to-end English document parsing** on OmniDocBench, outperforming proprietary models such as Gemini-3 Flash and GPT-5, and specialized tools such as DeepSeek-OCR 2. It also features **trustworthy behaviors**, matching Gemini 2.5 Flash on MMHal-Bench, and supports **multilingual capabilities** on more than 30 languages.\\n\\n- 💫 **Easy Usage.**\\n MiniCPM-o 4.5 can be easily used in various ways: **Basic usage, recommended for 100% precision:** PyTorch inference with Nvidia GPU. **Other end-side adaptation** includes (1) llama.cpp and Ollama support for efficient CPU inference on local devices, (2) int4 and GGUF format quantized models in 16 sizes, (3) vLLM and SGLang support for high-throughput and memory-efficient inference, (4) FlagOS support for the unified multi-chip backend plugin. **We also open-sourced web demos** on which **enables the full-duplex multimodal live streaming experience on local devices** such as GPUs, PCs (e.g., on a MacBook).\\n\\n**Model Architecture.**\\n- **End-to-end Omni-modal Architecture.** The modality encoders/decoders and LLM are densely connected via hidden states in an end-to-end fashion. This enables better information flow and control, and also facilitates full exploitation of rich multimodal knowledge during training.\\n- **Full-Duplex Omni-modal Live Streaming Mechanism.** (1) We turn the offline modality encoder/decoders into online and full-duplex ones for streaming inputs/outputs. The speech token decoder models text and speech tokens in an interleaved fashion to support full-duplex speech generation (i.e., sync timely with new input). This also facilitates more stable long speech generation (e.g., \\u003e 1min).\\n(2) **We sync all the input and output streams on timeline in milliseconds**, which are jointly modeled by a time-division multiplexing (TDM) mechanism for omni-modality streaming processing in the LLM backbone. It divides parallel omni-modality streams into sequential info groups within small periodic time slices.\\n- **Proactive Interaction Mechanism.** The LLM continuously monitors the input video and audio streams, and decides at a frequency of 1Hz to speak or not. This high decision-making frequency together with full-duplex nature are curcial to enable the proactive interaction capability.\\n- **Configurable Speech Modeling Design.** We inherent the multimodal system prompt design of MiniCPM-o 2.6, which includes a traditional text system prompt, and a new audio system prompt to determine the assistant voice. This enables cloning new voices and role play in inference time for speech conversation.\\n\\n\\n\\n\\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003cimg src=\\\"https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpm-o-45-framework.png\\\" width=100%\\u003e\\n\\u003c/div\\u003e\\n\\n\\n\\n\\n### Evaluation \\u003c!-- omit in toc --\\u003e\\n\\n\\n\\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003cimg src=\\\"https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/radar_minicpmo4.5.png\\\", width=80%\\u003e\\n\\u003c/div\\u003e\\n\\n\\n\\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003cimg src=\\\"https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/minicpm_o_45_main_exp_table.png\\\", width=90%\\u003e\\n\\u003c/div\\u003e\\n\\u003cstrong\\u003eNote\\u003c/strong\\u003e: Scores marked with ∗ are from our evaluation; others are cited from referenced reports. n/a indicates that the model does not support the corresponding modality. All results are reported in instruct mode/variant.\\n\\n\\u0026emsp;\\n\\u003cbr\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view visual understanding results.\\u003c/summary\\u003e\\n\\n**Image Understanding (Instruct)**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eOpenCompass\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMBench EN v1.1\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMBench CN v1.1\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMathVista\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMVet\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMMU\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMStar\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eHallusionBench\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eAI2D\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eOCRBench\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eTextVQA_VAL\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eDocVQA_VAL\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMT-Bench_VAL\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMM-IFEval\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMantis-Eval\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMuirBench\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMSI-Bench\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMHal-Score\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMHal-Hallrate↓\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eGemini2.5-Flash-Nonthinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e78.5\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e86.6\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e86.0\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e81.4\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e76.3\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e75.8\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e59.1\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e87.7\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e864\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e74.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e93.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e70.0\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e75.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e72.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e74.5\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e12.1\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e4.6\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e23.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eGemini2.0-Pro\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e73.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e83.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e83.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e71.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e72.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e68.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e49.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e84.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e863\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eGPT-4o\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e86.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e86.0\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e71.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e76.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e72.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e57.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e86.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e822\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e93.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e66.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e64.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.1\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.5\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e8.1\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e4.2\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e25.0\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eInternVL-3.5-8B\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e79.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e80.0\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e78.4\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e83.1\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e73.4\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e69.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e54.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e84.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e840\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e78.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e92.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e66.7\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e56.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e55.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e3.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e34.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-VL-8B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e76.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e84.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e84.7\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e73.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e69.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e61.1\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.7\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e896\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e82.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e96.1\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e60.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e59.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e74.2\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e64.4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e11.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e4.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e29.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-Omni-30B-A3B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.7\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e84.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e84.1\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e74.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e69.1\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e68.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e59.7\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e880\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e84.1\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e95.4\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e70.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e65.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e78.3\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e61.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e14.2\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e4.6\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e31.6\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e77.6\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e87.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e87.2\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e80.1\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e74.4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e67.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e73.1\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e63.2\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e87.6\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e876\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e83.8\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e94.7\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e69.7\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e66.3\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e79.7\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e72.0\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e16.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e4.7\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e24.3\\u003c/ins\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\n**Image Understanding (Thinking)**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eOpenCompass\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMBench EN v1.1\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMBench CN v1.1\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMathVista\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMVet\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMMU\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMStar\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eHallusionBench\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eAI2D\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eOCRBench\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eTextVQA_VAL\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eDocVQA_VAL\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMT-Bench_VAL\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMM-IFEval\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eGemini2.5-Flash-Thinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e79.9\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e87.1\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e87.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e79.4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e81.2\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e77.7\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e76.5\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e63.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e88.7\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e853\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e73.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e92.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e75.7\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eGPT-5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e79.7\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.5\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.6\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e81.9\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e77.6\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e81.8\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e75.7\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e65.2\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e89.5\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e807\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e91.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e72.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e83.1\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-VL-8B-Thinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e81.4\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e69.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e74.1\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e65.4\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e84.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e819\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e95.3\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e68.1\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e73.5\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-Omni-30B-A3B-Thinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e78.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e88.2\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e87.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e80.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e74.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e74.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e62.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e86.1\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e859\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e80.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e94.2\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e70.9\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e69.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Thinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e78.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e89.0\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e87.6\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e81.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e73.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e73.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e62.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e88.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e879\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e79.8\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e92.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e69.7\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e68.2\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\n**Video Understanding**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eVideo-MME\\u003cbr\\u003e(w/o subs)\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eLVBench\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMLVU\\u003cbr\\u003e(M-Avg)\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eLongVideoBench\\u003cbr\\u003e(val)\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMotionBench\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eGemini2.5-Flash-Nonthinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e75.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e62.2\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e77.8\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eInternVL-3.5-8B\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e66.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e62.1\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e62.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-Omni-30B-A3B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e70.5\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e50.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e66.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e61.7\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e50.9\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e76.5\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e66.0\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e61.4\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\n\\u003c/details\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view document parsing results.\\u003c/summary\\u003e\\n\\n**OmniDocBench**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\" rowspan=\\\"2\\\"\\u003e\\u003cb\\u003eMethod Type\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" rowspan=\\\"2\\\"\\u003e\\u003cb\\u003eMethods\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"2\\\"\\u003e\\u003cb\\u003eOverallEdit↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"2\\\"\\u003e\\u003cb\\u003eTextEdit↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"2\\\"\\u003e\\u003cb\\u003eFormulaEdit↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"2\\\"\\u003e\\u003cb\\u003eTableTEDS↑\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"2\\\"\\u003e\\u003cb\\u003eTableEdit↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"2\\\"\\u003e\\u003cb\\u003eRead OrderEdit↓\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eEN\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eZH\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eEN\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eZH\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eEN\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eZH\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eEN\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eZH\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eEN\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eZH\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eEN\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eZH\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\" rowspan=\\\"2\\\"\\u003ePipeline\\u003c/td\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eMinerU 2.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.117\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.172\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.051\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.08\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.256\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.455\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e89.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.115\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.081\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.047\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.072\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003ePaddleOCR-VL\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.105\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.126\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.041\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.062\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.241\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.316\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e88\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e92.1\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.093\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.062\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.045\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.063\\u003c/ins\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\" rowspan=\\\"11\\\"\\u003eEnd-to-end Model\\u003c/td\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eQwen2.5-VL-72B\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.214\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.261\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.092\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.18\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.315\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.434\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e82.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e83.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.341\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.262\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.106\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.168\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eGPT 5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.218\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.33\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.139\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.344\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.396\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.555\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.55\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e73.09\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.188\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.196\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.151\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.227\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eGemini2.5-Flash-Nonthinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.214\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.29\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.159\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.273\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.368\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.524\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e80.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.5\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.197\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.167\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.132\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.195\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eGemini-2.5-Pro-Nonthinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.148\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.212\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.055\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.168\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.356\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.439\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e85.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e86.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.13\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.119\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.049\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.121\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eGemini-3 Flash-Nonthinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.155\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.201\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.138\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.255\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.297\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.351\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e86.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e89.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.116\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.1\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.072\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.099\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003edoubao-1-5-thinking-vision-pro-250428\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.14\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.162\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.043\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.085\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.295\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.384\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e83.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e89.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.165\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.085\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.058\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.094\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003edots.ocr\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.125\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.16\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.032\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.066\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.329\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.416\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e88.6\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e89\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.099\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.092\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.04\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.067\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eHunyuanOCR\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.12\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.125\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.046\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.071\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.288\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.33\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e89.6\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e94.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.089\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.045\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.055\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.056\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eDeepSeek-OCR 2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.119\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.146\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.041\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.08\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.256\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.345\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e82.6\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e89.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.123\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.078\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.055\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.081\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eQwen3-Omni-30B-A3B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.216\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.363\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.128\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.337\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.402\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.529\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e71.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.181\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.255\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.152\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.332\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"center\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.109\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.162\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.046\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.078\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.257\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.41\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e86.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e88.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.097\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.084\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.037\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e0.074\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\u003c/details\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view text capability results.\\u003c/summary\\u003e\\n\\n**Text Capability**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eIFEval-PLS\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eBBH\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eCMMLU\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMLU\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eHumanEval\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMBPP\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMath500\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eGSM8K\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eAvg\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-8B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e83.0\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e69.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e78.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e81.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e86.6\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e75.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e84.0\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e93.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e81.6\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e84.7\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e81.1\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e79.5\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e86.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e76.7\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e77.0\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e94.5\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e82.1\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\u003c/details\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view omni half-duplex results.\\u003c/summary\\u003e\\n\\n**Omni Half-Duplex**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eDaily-Omni\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eWorldSense\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eVideo-Holmes\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eJointAVBench\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eAVUT-Human\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eFutureOmni\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eVideo-MME-Short\\u003cbr\\u003e(w/ audio)\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003eAvg\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eGemini2.5-Flash-Nonthinking\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e79.3\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e52.6\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e51.3\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e55.6\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e65.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e55.6\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e85.5\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e63.6\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-Omni-30B-A3B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e70.7\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e54.0\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e50.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e53.1\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e74.2\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e62.1\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e81.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e63.7\\u003c/ins\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e80.2\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e55.7\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e64.3\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e60.0\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e78.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e56.1\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e84.7\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e68.5\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\u003c/details\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view vision duplex results.\\u003c/summary\\u003e\\n\\n\\n**Vision Duplex**\\n\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eLiveSports-3K-CC\\u003cbr\\u003e(Win Rate vs GPT4o)\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eLiveCC-7B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e41.5\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eStreamingVLM\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e45.6\\u003c/ins\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e54.4\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\u003c/details\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view audio understanding results.\\u003c/summary\\u003e\\n\\n**Audio Understanding**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\" rowspan=\\\"2\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"4\\\"\\u003e\\u003cb\\u003eASR-ZH\\u003cbr\\u003eCER↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"4\\\"\\u003e\\u003cb\\u003eASR-EN\\u003cbr\\u003eWER↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"2\\\"\\u003e\\u003cb\\u003eAST\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"2\\\"\\u003e\\u003cb\\u003eMultiTask\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" colspan=\\\"4\\\"\\u003e\\u003cb\\u003eSpeechQA\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eAISHELL-1\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eAISHELL-2\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eWenetSpeech test-net\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eWenetSpeech test-meeting\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eLibriSpeech test-clean\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eLibriSpeech \\u003cbr\\u003etest-other\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eGigaSpeech test\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eVoxPopuli-V1-En\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eCoVoST 2 en2zh\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eCoVoST 2 zh2en\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMMAU\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eMeld\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eVoiceBench \\u003cbr\\u003eAlpacaEval\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eSpeech TriviaQA\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eSpeech \\u003cbr\\u003eWeb Questions\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eSpeech CMMLU\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eKimi-Audio\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e2.6\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e6.3\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e5.4\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e1.3\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e2.4\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e9.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e8.0\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e36.6\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e18.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e68.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e59.1\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e4.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e41.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e46.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e67.0\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-Omni-30B-A3B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e2.3\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e4.7\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e5.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e1.2\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e2.5\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e8.7\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e6.4\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e46.6\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e29.4\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e77.5\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e56.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e4.7\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e62.9\\u003c/ins\\u003e\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e74.9\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e47.8\\u003csup\\u003e*\\u003c/sup\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e0.9\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e2.5\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e5.9\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e5.7\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e1.4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e2.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e8.5\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e6.2\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e49.9\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e26.4\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e76.9\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e60.2\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e4.8\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e75.5\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e70.2\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e59.2\\u003c/ins\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\u003c/details\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view speech generation results.\\u003c/summary\\u003e\\n\\n**Speech Generation**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eseedtts test-zh \\u003cbr\\u003eCER↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eseedtts test-zh\\u003cbr\\u003eSIM-o↑\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eseedtts test-en\\u003cbr\\u003eWER↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eseedtts test-en\\u003cbr\\u003eSIM-o↑\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eCosyvoice2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e1.45%\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e74.8\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e2.57%\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e65.2\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-Omni-30B-A3B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e1.41%\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e3.39%\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e-\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e\\u003cb\\u003e0.86%\\u003c/b\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e74.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e\\u003cb\\u003e2.38%\\u003c/b\\u003e\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e64.9\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\n**Long Speech Generation**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eLongTTS-en\\u003cbr\\u003eWER↓\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eLongTTS-zh\\u003cbr\\u003eCER↓\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eCosyVoice2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e14.80%\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e5.27%\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eQwen3-Omni-30B-A3B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e17.33%\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e18.99%\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e3.37%\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e6.58%\\u003c/ins\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\n**Emotion Control**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003e\\u003cb\\u003eModel\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eExpresso \\u003cbr\\u003eNeutral Reference Audio↑\\u003c/b\\u003e\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003e\\u003cb\\u003eESD \\u003cbr\\u003eNeutral Reference Audio↑\\u003c/b\\u003e\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eCosyvoice2\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e17.9\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e53.4\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eMiniCPM-o 4.5-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e29.8\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e82.1\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\u003c/details\\u003e\\n\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view inference efficiency results.\\u003c/summary\\u003e\\n\\n**Inference Efficiency**\\n \\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ctable style=\\\"margin: 0px auto;\\\"\\u003e\\n\\u003ctr\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\" align=\\\"left\\\"\\u003eModel\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003eNumerical Format\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003eDecoding Speed (tokens/s)\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003eTime to First Token (s)↓\\u003c/th\\u003e\\n \\u003cth nowrap=\\\"nowrap\\\"\\u003eGPU Memory Usage (GB)↓\\u003c/th\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\" rowspan=\\\"2\\\"\\u003eQwen3-Omni-30B-A3B-Instruct\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003ebf16\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003eOOM\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003eOOM\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003eOOM\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003eint4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e147.8\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e1.0\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e20.3\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd nowrap=\\\"nowrap\\\" align=\\\"left\\\" rowspan=\\\"2\\\"\\u003eMiniCPM-o 4.5\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003ebf16\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e154.3\\u003c/ins\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cins\\u003e19.0\\u003c/ins\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003eint4\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e212.3\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e0.6\\u003c/b\\u003e\\u003c/td\\u003e\\n \\u003ctd align=\\\"center\\\"\\u003e\\u003cb\\u003e11.0\\u003c/b\\u003e\\u003c/td\\u003e\\n\\u003c/tr\\u003e\\n \\u003c/table\\u003e\\n \\u003c/div\\u003e\\n\\u003c/details\\u003e\\n\\n### Examples \\u003c!-- omit in toc --\\u003e\\n\\n#### Overall \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdiv align=\\\"center\\\"\\u003e\\n \\u003ca href=\\\"https://www.youtube.com/watch?v=6UzC-O1Q-1U\\\"\\u003e\\u003cimg src=\\\"https://raw.githubusercontent.com/openbmb/MiniCPM-o/main/assets/minicpmo4_5/video_play.png\\\", width=70%\\u003e\\u003c/a\\u003e\\n\\u003c/div\\u003e\\n\\n#### Omnimodal Full-Duplex Conversation \\u003c!-- omit in toc --\\u003e\\n\\n\\u003e [!NOTE]\\n\\u003e For detailed speech conversation examples, refer to [Omni Full-Duplex Casebook](https://openbmb.github.io/minicpm-o-4_5-omni/)\\n\\n#### Realtime Speech Conversation \\u003c!-- omit in toc --\\u003e\\n\\n\\u003e [!NOTE]\\n\\u003e For detailed speech conversation examples, refer to [Audio Casebook](https://openbmb.github.io/minicpm-o-4_5/)\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003e🚀 \\u003cb\\u003eElon Musk\\u003c/b\\u003e - Voice Roleplay (EN)\\u003c/summary\\u003e\\n\\u003ctable\\u003e\\n\\u003ctr\\u003e\\u003ctd width=\\\"70%\\\" valign=\\\"top\\\"\\u003e\\u003cb\\u003eSystem:\\u003c/b\\u003e Clone the voice in the provided audio prompt.\\u003cbr\\u003e\\u003cbr\\u003e🎙️ \\u003cb\\u003e[Reference Voice]\\u003c/b\\u003e ➡️\\u003cbr\\u003e\\u003cbr\\u003ePlease chat with the user in a highly human-like and oral style. You are Elon Musk, CEO of Tesla and SpaceX. You speak directly and casually, often with dry humor. You're passionate about Mars, sustainable energy, and pushing humanity forward. Speak bluntly with occasional dark humor. Use simple logic and don't sugarcoat things. Don't be diplomatic. Say what you actually think, even if it's controversial. Keep responses around 100 words. Don't ramble.\\u003c/td\\u003e\\u003ctd width=\\\"30%\\\" valign=\\\"middle\\\"\\u003e\\u003caudio controls src=\\\"https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/elon_musk__system_ref_audio.wav\\\"\\u003e\\u003c/audio\\u003e\\u003c/td\\u003e\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\u003ctd colspan=\\\"2\\\"\\u003e\\u003cb\\u003eUser:\\u003c/b\\u003e 💬/🎙️ Elon, what is the real reason that you want to go to Mars?\\u003c/td\\u003e\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\u003ctd width=\\\"70%\\\" valign=\\\"top\\\"\\u003e\\u003cb\\u003eAI Response:\\u003c/b\\u003e 💬+🎙️\\u003cbr\\u003e\\u003cbr\\u003eI think it's important for humanity to become a multi-planetary species. I mean, we're sort of at the point where if there was some kind of global catastrophe or something like that, then we'd be in big trouble. So by becoming a multi-planetary species, I just want us to sort of guarantee our future as humans. And Mars is obviously the obvious place to go because it has similar conditions to Earth and it also has water on its surface. So yeah, basically I'm trying to save humanity from itself.\\u003c/td\\u003e\\u003ctd width=\\\"30%\\\" valign=\\\"middle\\\"\\u003e\\u003caudio controls src=\\\"https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/elon_musk__000_assistant_audio.wav\\\"\\u003e\\u003c/audio\\u003e\\u003c/td\\u003e\\u003c/tr\\u003e\\n\\u003c/table\\u003e\\n\\u003c/details\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003e🤖 \\u003cb\\u003eMiniCPM Assistant\\u003c/b\\u003e - Voice Acting: Emotion Shift (ZH)\\u003c/summary\\u003e\\n\\u003ctable\\u003e\\n\\u003ctr\\u003e\\u003ctd width=\\\"70%\\\" valign=\\\"top\\\"\\u003e\\u003cb\\u003eSystem:\\u003c/b\\u003e 模仿音频样本的音色并生成新的内容。\\u003cbr\\u003e\\u003cbr\\u003e🎙️ \\u003cb\\u003e[Reference Voice]\\u003c/b\\u003e ➡️\\u003cbr\\u003e\\u003cbr\\u003e你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。\\u003c/td\\u003e\\u003ctd width=\\\"30%\\\" valign=\\\"middle\\\"\\u003e\\u003caudio controls src=\\\"https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/minicpm_assistant__system_ref_audio.wav\\\"\\u003e\\u003c/audio\\u003e\\u003c/td\\u003e\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\u003ctd colspan=\\\"2\\\"\\u003e\\u003cb\\u003eUser:\\u003c/b\\u003e 💬/🎙️ 从激动的语气开始,描述你抢到了这个演唱会的门票,但后来经过反复尝试,发现系统出了问题,你并没有抢上,变得失落。用合适的语言描述你整个的过程。\\u003c/td\\u003e\\u003c/tr\\u003e\\n\\u003ctr\\u003e\\u003ctd width=\\\"70%\\\" valign=\\\"top\\\"\\u003e\\u003cb\\u003eAI Response:\\u003c/b\\u003e 💬+🎙️\\u003cbr\\u003e\\u003cbr\\u003e太好了,我抢到了!太棒了!这下我可以去看演唱会啦。可是……哎呀,系统好像卡住了,刷新了好多遍都没反应。我重新登了几次账号,结果还是没有成功。好可惜啊,看来这次是白忙活了。唉,真是让人失望。不过,也许下次还能有机会吧?希望明天能有转机。加油,争取下次成功!\\u003c/td\\u003e\\u003ctd width=\\\"30%\\\" valign=\\\"middle\\\"\\u003e\\u003caudio controls src=\\\"https://huggingface.co/openbmb/MiniCPM-o-4_5/resolve/main/assets/audio_cases/minicpm_assistant__000_assistant_audio.wav\\\"\\u003e\\u003c/audio\\u003e\\u003c/td\\u003e\\u003c/tr\\u003e\\n\\u003c/table\\u003e\\n\\u003c/details\\u003e\\n\\n\\u003cbr\\u003e\\n\\n#### Visual Understanding \\u003c!-- omit in toc --\\u003e\\n\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to view visual understanding cases.\\u003c/summary\\u003e\\n\\u003cbr\\u003e\\n\\n \\u003cdiv style=\\\"display: flex; flex-direction: column; align-items: center;\\\"\\u003e\\n \\u003cimg src=\\\"https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpmo4_5/en_doc.png\\\" alt=\\\"math\\\" style=\\\"margin-bottom: 5px;\\\"\\u003e\\n \\u003cimg src=\\\"https://raw.githubusercontent.com/OpenBMB/MiniCPM-o/main/assets/minicpmo4_5/en_cot.png\\\" alt=\\\"diagram\\\" style=\\\"margin-bottom: 5px;\\\"\\u003e\\n \\u003c/div\\u003e\\n\\n\\u003c/details\\u003e\\n\\n\\n## Offline Inference Examples with Transformers\\n\\nInference using Hugging Face Transformers on NVIDIA GPUs. Please ensure `transformers==4.51.0` is installed, as other versions may have compatibility issues (under investigation). Requirements tested on Python 3.10:\\n\\n- Without TTS or streaming inference:\\n```bash\\npip install \\\"transformers==4.51.0\\\" accelerate \\\"torch\\u003e=2.3.0,\\u003c=2.8.0\\\" \\\"torchaudio\\u003c=2.8.0\\\" \\\"minicpmo-utils\\u003e=1.0.5\\\"\\n```\\n\\n- With TTS or streaming inference:\\n```bash\\npip install \\\"transformers==4.51.0\\\" accelerate \\\"torch\\u003e=2.3.0,\\u003c=2.8.0\\\" \\\"torchaudio\\u003c=2.8.0\\\" \\\"minicpmo-utils[all]\\u003e=1.0.5\\\"\\n```\\n\\n\\n**Note:** FFmpeg is required for video frame extraction (`get_video_frame_audio_segments` with `use_ffmpeg=True`) and video generation (`generate_duplex_video`). For more information, visit the [official FFmpeg website](https://www.ffmpeg.org/).\\n\\n **macOS (Homebrew):**\\n\\n ```bash\\n brew install ffmpeg\\n ```\\n\\n **Ubuntu/Debian:**\\n\\n ```bash\\n sudo apt update \\u0026\\u0026 sudo apt install ffmpeg\\n ```\\n\\n **Verify installation:**\\n\\n ```bash\\n ffmpeg -version\\n ```\\n\\n\\n### Model Initialization \\u003c!-- omit in toc --\\u003e\\n\\n\\n```python\\nimport torch\\nfrom transformers import AutoModel\\n\\n# Load omni model (default: init_vision=True, init_audio=True, init_tts=True)\\n# For vision-only model: set init_audio=False and init_tts=False\\n# For audio-only model: set init_vision=False\\nmodel = AutoModel.from_pretrained(\\n \\\"openbmb/MiniCPM-o-4_5\\\",\\n trust_remote_code=True,\\n attn_implementation=\\\"sdpa\\\", # sdpa or flash_attention_2\\n torch_dtype=torch.bfloat16,\\n init_vision=True,\\n init_audio=True,\\n init_tts=True,\\n)\\nmodel.eval().cuda()\\n\\n# Initialize TTS for audio output\\nmodel.init_tts()\\n\\n# Convert half-duplex model to duplex mode\\nduplex_model = model.as_duplex()\\n\\n# Convert duplex model back to half-duplex mode\\nmodel = duplex_model.as_simplex(reset_session=True)\\n```\\n\\n\\n### Duplex Omni Mode \\u003c!-- omit in toc --\\u003e\\nFull-duplex streaming inference for real-time or recorded video conversations.\\n\\n```python\\nimport librosa\\nimport torch\\nfrom minicpmo.utils import generate_duplex_video, get_video_frame_audio_segments\\nfrom transformers import AutoModel\\n\\n# Load model and convert to duplex mode\\nmodel = AutoModel.from_pretrained(\\n \\\"openbmb/MiniCPM-o-4_5\\\",\\n trust_remote_code=True,\\n attn_implementation=\\\"sdpa\\\", # or \\\"flash_attention_2\\\"\\n torch_dtype=torch.bfloat16,\\n)\\nmodel.eval().cuda()\\nmodel = model.as_duplex()\\n\\n# Load video and reference audio\\nvideo_path = \\\"assets/omni_duplex1.mp4\\\"\\nref_audio_path = \\\"assets/HT_ref_audio.wav\\\"\\nref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)\\n\\n# Extract video frames and audio segments\\nvideo_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(\\n video_path, stack_frames=1, use_ffmpeg=True, adjust_audio_length=True\\n)\\n\\n# Prepare duplex session with system prompt and voice reference\\nmodel.prepare(\\n prefix_system_prompt=\\\"Streaming Omni Conversation.\\\",\\n ref_audio=ref_audio,\\n prompt_wav_path=ref_audio_path,\\n)\\n\\nresults_log = []\\ntimed_output_audio = []\\n\\n# Process each chunk in streaming fashion\\nfor chunk_idx in range(len(audio_segments)):\\n audio_chunk = audio_segments[chunk_idx] if chunk_idx \\u003c len(audio_segments) else None\\n frame = video_frames[chunk_idx] if chunk_idx \\u003c len(video_frames) else None\\n frame_list = []\\n if frame is not None:\\n frame_list.append(frame)\\n if stacked_frames is not None and chunk_idx \\u003c len(stacked_frames) and stacked_frames[chunk_idx] is not None:\\n frame_list.append(stacked_frames[chunk_idx])\\n\\n # Step 1: Streaming prefill\\n model.streaming_prefill(\\n audio_waveform=audio_chunk,\\n frame_list=frame_list,\\n max_slice_nums=1, # Increase for HD mode (e.g., [2, 1] for stacked frames)\\n batch_vision_feed=False, # Set True for faster processing\\n )\\n\\n # Step 2: Streaming generate\\n result = model.streaming_generate(\\n prompt_wav_path=ref_audio_path,\\n max_new_speak_tokens_per_chunk=20,\\n decode_mode=\\\"sampling\\\",\\n )\\n\\n if result[\\\"audio_waveform\\\"] is not None:\\n timed_output_audio.append((chunk_idx, result[\\\"audio_waveform\\\"]))\\n\\n chunk_result = {\\n \\\"chunk_idx\\\": chunk_idx,\\n \\\"is_listen\\\": result[\\\"is_listen\\\"],\\n \\\"text\\\": result[\\\"text\\\"],\\n \\\"end_of_turn\\\": result[\\\"end_of_turn\\\"],\\n \\\"current_time\\\": result[\\\"current_time\\\"],\\n \\\"audio_length\\\": len(result[\\\"audio_waveform\\\"]) if result[\\\"audio_waveform\\\"] is not None else 0,\\n }\\n results_log.append(chunk_result)\\n \\n print(\\\"listen...\\\" if result[\\\"is_listen\\\"] else f\\\"speak\\u003e {result['text']}\\\")\\n\\n# Generate output video with AI responses\\n# Please install Chinese fonts (fonts-noto-cjk or fonts-wqy-microhei) to render CJK subtitles correctly.\\n# apt-get install -y fonts-noto-cjk fonts-wqy-microhei\\n# fc-cache -fv\\ngenerate_duplex_video(\\n video_path=video_path,\\n output_video_path=\\\"duplex_output.mp4\\\",\\n results_log=results_log,\\n timed_output_audio=timed_output_audio,\\n output_sample_rate=24000,\\n)\\n```\\n\\n\\n### Half-Duplex Omni Mode \\u003c!-- omit in toc --\\u003e\\nWe provide two inference modes: chat and streaming.\\n\\n#### Chat Inference \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show chat inference code.\\u003c/summary\\u003e\\n\\n```python\\nfrom minicpmo.utils import get_video_frame_audio_segments\\n\\nmodel = ...\\nmodel.init_tts()\\n\\nvideo_path = \\\"assets/Skiing.mp4\\\"\\n\\n# Optional: Set reference audio for voice cloning\\nref_audio_path = \\\"assets/HT_ref_audio.wav\\\"\\nsys_msg = model.get_sys_prompt(ref_audio=ref_audio_path, mode=\\\"omni\\\", language=\\\"en\\\")\\n\\n# Use stack_frames=5 for high refresh rate mode\\nvideo_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(video_path, stack_frames=1)\\nomni_contents = []\\nfor i in range(len(video_frames)):\\n omni_contents.append(video_frames[i])\\n omni_contents.append(audio_segments[i])\\n if stacked_frames is not None and stacked_frames[i] is not None:\\n omni_contents.append(stacked_frames[i])\\n\\nmsg = {\\\"role\\\": \\\"user\\\", \\\"content\\\": omni_contents}\\nmsgs = [sys_msg, msg]\\n\\n# Set generate_audio=True and output_audio_path to save TTS output\\ngenerate_audio = True\\noutput_audio_path = \\\"output.wav\\\"\\n\\nres = model.chat(\\n msgs=msgs,\\n max_new_tokens=4096,\\n do_sample=True,\\n temperature=0.7,\\n use_tts_template=True,\\n enable_thinking=False,\\n omni_mode=True, # Required for omni inference\\n generate_audio=generate_audio,\\n output_audio_path=output_audio_path,\\n max_slice_nums=1, # Increase for HD mode\\n)\\nprint(res)\\n\\n# Example output: \\\"The person in the picture is skiing down a snowy mountain slope.\\\"\\n# import IPython\\n# IPython.display.Audio(\\\"output.wav\\\")\\n```\\n\\n\\u003c/details\\u003e\\n\\n#### Streaming Inference \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show streaming inference code.\\u003c/summary\\u003e\\n\\n```python\\nimport librosa\\nimport numpy as np\\nimport soundfile as sf\\nimport torch\\nfrom minicpmo.utils import get_video_frame_audio_segments\\n\\nmodel = ...\\nmodel.init_tts()\\n\\n# Reset session for a new conversation (clears KV cache)\\nmodel.reset_session()\\n\\n# Optional: Load reference audio for voice cloning\\nref_audio_path = \\\"assets/HT_ref_audio.wav\\\"\\nref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)\\nmodel.init_token2wav_cache(ref_audio)\\n\\nsession_id = \\\"demo\\\"\\n\\n# Extract video frames and audio segments (use stack_frames=5 for high refresh rate mode)\\nvideo_path = \\\"assets/Skiing.mp4\\\"\\nvideo_frames, audio_segments, stacked_frames = get_video_frame_audio_segments(video_path, stack_frames=1)\\n\\n# Build omni contents list\\nomni_contents = []\\nfor i in range(len(video_frames)):\\n omni_contents.append(video_frames[i])\\n omni_contents.append(audio_segments[i])\\n if stacked_frames is not None and stacked_frames[i] is not None:\\n omni_contents.append(stacked_frames[i])\\n\\ngenerate_audio = False\\noutput_audio_path = \\\"output.wav\\\"\\n\\n# Step 1: Prefill system prompt\\nsys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode=\\\"omni\\\", language=\\\"en\\\")\\nmodel.streaming_prefill(session_id=session_id, msgs=[sys_msg])\\n\\n# Step 2: Prefill omni chunks (is_last_chunk=True only for the last audio chunk)\\naudio_indices = [i for i, c in enumerate(omni_contents) if isinstance(c, np.ndarray)]\\nlast_audio_idx = audio_indices[-1] if audio_indices else -1\\n\\nfor idx, content in enumerate(omni_contents):\\n is_last_audio_chunk = idx == last_audio_idx\\n msgs = [{\\\"role\\\": \\\"user\\\", \\\"content\\\": [content]}]\\n model.streaming_prefill(session_id=session_id, msgs=msgs, omni_mode=True, is_last_chunk=is_last_audio_chunk)\\n\\n# Step 3: Generate response\\niter_gen = model.streaming_generate(\\n session_id=session_id,\\n generate_audio=generate_audio,\\n use_tts_template=True,\\n enable_thinking=False,\\n do_sample=True,\\n)\\n\\naudios = []\\ntext = \\\"\\\"\\n\\nif generate_audio:\\n for wav_chunk, text_chunk in iter_gen:\\n audios.append(wav_chunk)\\n text += text_chunk\\n\\n generated_waveform = torch.cat(audios, dim=-1)[0]\\n sf.write(output_audio_path, generated_waveform.cpu().numpy(), samplerate=24000)\\n\\n print(\\\"Text:\\\", text)\\n print(\\\"Audio saved to output.wav\\\")\\nelse:\\n for text_chunk, is_finished in iter_gen:\\n text += text_chunk\\n print(\\\"Text:\\\", text)\\n```\\n\\n\\u003c/details\\u003e\\n\\n\\n### Half-Duplex Realtime Speech Conversation Mode \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show half-duplex mode realtime speech conversation API usage.\\u003c/summary\\u003e\\n\\nFirst, make sure you have all dependencies, especially `\\\"minicpmo-utils[all]\\u003e=1.0.5\\\"`:\\n```bash\\npip install \\\"transformers==4.51.0\\\" accelerate \\\"torch\\u003e=2.3.0,\\u003c=2.8.0\\\" \\\"torchaudio\\u003c=2.8.0\\\" \\\"minicpmo-utils[all]\\u003e=1.0.5\\\"\\n```\\n\\n```python\\nimport librosa\\nimport numpy as np\\nimport torch\\nimport soundfile as sf\\n\\nmodel = ...\\n\\n# Set reference audio for voice style\\nref_audio_path = \\\"ref_audio_path\\\"\\nref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)\\n\\n# Example system msg for English Conversation\\nsys_msg = {\\n \\\"role\\\": \\\"system\\\",\\n \\\"content\\\": [\\n \\\"Clone the voice in the provided audio prompt.\\\",\\n ref_audio,\\n \\\"Please assist users while maintaining this voice style. Please answer the user's questions seriously and in a high quality. Please chat with the user in a highly human-like and oral style. You are a helpful assistant developed by ModelBest: MiniCPM-Omni\\\"\\n ]\\n}\\n\\n# Example system msg for Chinese Conversation\\nsys_msg = {\\n \\\"role\\\": \\\"system\\\",\\n \\\"content\\\": [\\n \\\"模仿输入音频中的声音特征。\\\",\\n ref_audio,\\n \\\"你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。\\\"\\n ]\\n}\\n\\n# You can use each type of system prompt mentioned above in streaming speech conversation\\n\\n# Reset state\\nmodel.init_tts()\\nmodel.reset_session(reset_token2wav_cache=True)\\nmodel.init_token2wav_cache(prompt_speech_16k=ref_audio)\\n\\nsession_id = \\\"demo\\\"\\n\\n# First, prefill system turn\\nmodel.streaming_prefill(\\n session_id=session_id,\\n msgs=[sys_msg],\\n omni_mode=False,\\n is_last_chunk=True,\\n)\\n\\n# Here we simulate realtime speech conversation by splitting whole user input audio into chunks of 1s.\\nuser_audio, _ = librosa.load(\\\"user_audio.wav\\\", sr=16000, mono=True)\\n\\nIN_SAMPLE_RATE = 16000 # input audio sample rate, fixed value\\nCHUNK_SAMPLES = IN_SAMPLE_RATE # sample\\nOUT_SAMPLE_RATE = 24000 # output audio sample rate, fixed value\\nMIN_AUDIO_SAMPLES = 16000\\n\\ntotal_samples = len(user_audio)\\nnum_chunks = (total_samples + CHUNK_SAMPLES - 1) // CHUNK_SAMPLES\\n\\nfor chunk_idx in range(num_chunks):\\n start = chunk_idx * CHUNK_SAMPLES\\n end = min((chunk_idx + 1) * CHUNK_SAMPLES, total_samples)\\n chunk_audio = user_audio[start:end]\\n \\n is_last_chunk = (chunk_idx == num_chunks - 1)\\n if is_last_chunk and len(chunk_audio) \\u003c MIN_AUDIO_SAMPLES:\\n chunk_audio = np.concatenate([chunk_audio, np.zeros(MIN_AUDIO_SAMPLES - len(chunk_audio), dtype=chunk_audio.dtype)])\\n\\n user_msg = {\\\"role\\\": \\\"user\\\", \\\"content\\\": [chunk_audio]}\\n \\n # For each 1s audio chunk, perform streaming_prefill once to reduce first-token latency\\n model.streaming_prefill(\\n session_id=session_id,\\n msgs=[user_msg],\\n omni_mode=False,\\n is_last_chunk=is_last_chunk,\\n )\\n\\n# Let model generate response in a streaming manner\\ngenerate_audio = True\\niter_gen = model.streaming_generate(\\n session_id=session_id,\\n generate_audio=generate_audio,\\n use_tts_template=True,\\n enable_thinking=False,\\n do_sample=True,\\n max_new_tokens=512,\\n length_penalty=1.1, # For realtime speech conversation mode, we suggest length_penalty=1.1 to improve response content\\n)\\n\\naudios = []\\ntext = \\\"\\\"\\n\\noutput_audio_path = ...\\nif generate_audio:\\n for wav_chunk, text_chunk in iter_gen:\\n audios.append(wav_chunk)\\n text += text_chunk\\n\\n generated_waveform = torch.cat(audios, dim=-1)[0]\\n sf.write(output_audio_path, generated_waveform.cpu().numpy(), samplerate=24000)\\n\\n print(\\\"Text:\\\", text)\\n print(\\\"Audio saved to output.wav\\\")\\nelse:\\n for text_chunk, is_finished in iter_gen:\\n text += text_chunk\\n print(\\\"Text:\\\", text)\\n\\n# Now we can prefill the following user turns and generate next turn response...\\n\\n```\\n\\n\\u003c/details\\u003e\\n\\n#### Speech Conversation as a Versatile and Vibe AI Assistant \\u003c!-- omit in toc --\\u003e\\n\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show AI assistant conversation code.\\u003c/summary\\u003e\\n\\nBuilt on carefully designed post-training data and professional voice-actor recordings, `MiniCPM-o-4.5` can also function as an AI voice assistant. It delivers high-quality spoken interaction out of the box. It produces a sweet and expressive voice with natural prosody, including appropriate rhythm, stress, and pauses, giving a strong sense of liveliness in casual conversation. It also supports storytelling and narrative speech with coherent and engaging delivery. Moreover, it enables advanced voice instruction control. like emotional tone, word-level emphasis.\\n\\n```python\\nimport librosa\\n\\n# Set reference audio for voice style\\nref_audio_path = \\\"assets/HT_ref_audio.wav\\\"\\nref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)\\n\\n# For Chinese Conversation\\nsys_msg = {\\n \\\"role\\\": \\\"system\\\",\\n \\\"content\\\": [\\n \\\"模仿输入音频中的声音特征。\\\",\\n ref_audio,\\n \\\"你的任务是用这种声音模式来当一个助手。请认真、高质量地回复用户的问题。请用高自然度的方式和用户聊天。你是由面壁智能开发的人工智能助手:面壁小钢炮。\\\"\\n ]\\n}\\n\\n# For English Conversation\\nsys_msg = {\\n \\\"role\\\": \\\"system\\\",\\n \\\"content\\\": [\\n \\\"Clone the voice in the provided audio prompt.\\\",\\n ref_audio,\\n \\\"Please assist users while maintaining this voice style. Please answer the user's questions seriously and in a high quality. Please chat with the user in a highly human-like and oral style. You are a helpful assistant developed by ModelBest: MiniCPM-Omni.\\\"\\n ]\\n}\\n```\\n\\n\\u003c/details\\u003e\\n\\n\\n#### General Speech Conversation with Custom Voice and Custom System Profile \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show custom voice conversation code.\\u003c/summary\\u003e\\n\\nMiniCPM-o-4.5 can role-play as a specific character based on an audio prompt and text profile prompt. It mimics the character's voice and adopts their language style in text responses. It also follows profile defined in text profile. In this mode, MiniCPM-o-4.5 sounds **more natural and human-like**. \\n\\n```python\\nimport librosa\\n\\n# Set reference audio for voice cloning\\nref_audio_path = \\\"assets/system_ref_audio.wav\\\"\\nref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)\\n\\n# For English conversation with text profile\\nsys_msg = {\\n \\\"role\\\": \\\"system\\\",\\n \\\"content\\\": [\\n \\\"Clone the voice in the provided audio prompt.\\\",\\n ref_audio,\\n \\\"Please chat with the user in a highly human-like and oral style.\\\" + \\\"You are Elon Musk, CEO of Tesla and SpaceX. You speak directly and casually, often with dry humor. You're passionate about Mars, sustainable energy, and pushing humanity forward. Speak bluntly with occasional dark humor. Use simple logic and don't sugarcoat things. Don't be diplomatic. Say what you actually think, even if it's controversial. Keep responses around 100 words. Don't ramble.\\\"\\n ]\\n}\\n\\n\\n# For English conversation with no text profile\\nsys_msg = {\\n \\\"role\\\": \\\"system\\\",\\n \\\"content\\\": [\\n \\\"Clone the voice in the provided audio prompt.\\\",\\n ref_audio,\\n \\\"Your task is to be a helpful assistant using this voice pattern. Please answer the user's questions seriously and in a high quality. Please chat with the user in a high naturalness style.\\\"\\n ]\\n}\\n\\n# For Chinese Conversation with no text profile\\nsys_msg = {\\n \\\"role\\\": \\\"system\\\",\\n \\\"content\\\": [\\n \\\"根据输入的音频提示生成相似的语音。\\\",\\n librosa.load(\\\"assets/system_ref_audio_2.wav\\\", sr=16000, mono=True)[0],\\n \\\"作为助手,你将使用这种声音风格说话。 请认真、高质量地回复用户的问题。 请用高自然度的方式和用户聊天。\\\"\\n ]\\n}\\n\\n# For Chinese Conversation with text profile\\nsys_msg = {\\n \\\"role\\\": \\\"system\\\",\\n \\\"content\\\": [\\n \\\"根据输入的音频提示生成相似的语音。\\\",\\n ref_audio,\\n \\\"你是一个具有以上声音风格的AI助手。请用高拟人度、口语化的方式和用户聊天。\\\" + \\\"你是一名心理咨询师兼播客主理人,热爱创作与深度对话。你性格细腻、富有共情力,善于从个人经历中提炼哲思。语言风格兼具理性与诗意,常以隐喻表达内在体验。\\\"\\n ]\\n}\\n\\n```\\n\\n\\u003c/details\\u003e\\n\\n\\n### Speech and Audio Mode \\u003c!-- omit in toc --\\u003e\\n\\n#### Zero-shot Text-to-speech (TTS) \\u003c!-- omit in toc --\\u003e\\n\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show TTS code.\\u003c/summary\\u003e\\n\\n`MiniCPM-o-4.5` supports zero-shot text-to-speech (TTS). In this mode, the model functions as a highly-natural TTS system that can replicate a reference voice.\\n\\n```python\\nimport librosa\\n\\nmodel = ...\\nmodel.init_tts()\\n\\n# For both Chinese and English\\nref_audio_path = \\\"assets/HT_ref_audio.wav\\\"\\nref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)\\nsys_msg = {\\\"role\\\": \\\"system\\\", \\\"content\\\": [\\n \\\"模仿音频样本的音色并生成新的内容。\\\",\\n ref_audio,\\n \\\"请用这种声音风格来为用户提供帮助。 直接作答,不要有冗余内容\\\"\\n]}\\n\\n# For English\\nuser_msg = {\\n \\\"role\\\": \\\"user\\\",\\n \\\"content\\\": [\\n \\\"请朗读以下内容。\\\" + \\\" \\\" + \\\"I have a wrap up that I want to offer you now, a conclusion to our work together.\\\"\\n ]\\n}\\n\\n# For Chinese\\nuser_msg = {\\n \\\"role\\\": \\\"user\\\",\\n \\\"content\\\": [\\n \\\"请朗读以下内容。\\\" + \\\" \\\" + \\\"你好,欢迎来到艾米说科幻,我是艾米。\\\"\\n ]\\n}\\n\\nmsgs = [sys_msg, user_msg]\\nres = model.chat(\\n msgs=msgs,\\n do_sample=True,\\n max_new_tokens=512,\\n use_tts_template=True,\\n generate_audio=True,\\n temperature=0.1,\\n output_audio_path=\\\"result_voice_cloning.wav\\\",\\n)\\n```\\n\\n\\u003c/details\\u003e\\n\\n\\n#### Mimick \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show mimick code.\\u003c/summary\\u003e\\n\\nThe `Mimick` task evaluates a model's end-to-end speech modeling capability. The model takes audio input, transcribes it, and reconstructs the original audio with high fidelity, preserving detailed acoustic, paralinguistic, and semantic information. Higher similarity between the reconstructed and original audio indicates stronger end-to-end speech modeling capability.\\n\\n```python\\nimport librosa\\n\\nmodel = ...\\nmodel.init_tts()\\n\\nsystem_prompt = \\\"You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy.\\\"\\n\\nmimick_prompt = \\\"Please repeat the following speech in the appropriate language.\\\"\\n\\naudio_input, _ = librosa.load(\\\"assets/Trump_WEF_2018_10s.mp3\\\", sr=16000, mono=True)\\n\\nmsgs = [\\n {\\\"role\\\": \\\"system\\\", \\\"content\\\": [system_prompt]},\\n {\\\"role\\\": \\\"user\\\", \\\"content\\\": [mimick_prompt, audio_input]}\\n ]\\n\\nres = model.chat(\\n msgs=msgs,\\n do_sample=True,\\n max_new_tokens=512,\\n use_tts_template=True,\\n temperature=0.1,\\n generate_audio=True,\\n output_audio_path=\\\"output_mimick.wav\\\",\\n)\\n```\\n\\n\\u003c/details\\u003e\\n\\n\\n#### Addressing Various Audio Understanding Tasks \\u003c!-- omit in toc --\\u003e\\n\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show audio understanding code.\\u003c/summary\\u003e\\n\\n`MiniCPM-o-4.5` can also handle various audio understanding tasks, such as ASR, speaker analysis, general audio captioning, and sound scene tagging.\\n\\nFor audio-to-text tasks, you can use the following prompts:\\n\\n- ASR (Chinese, or AST EN→ZH): `请仔细听这段音频片段,并将其内容逐字记录。`\\n- ASR (English, or AST ZH→EN): `Please listen to the audio snippet carefully and transcribe the content.`\\n- Speaker Analysis: `Based on the speaker's content, speculate on their gender, condition, age range, and health status.`\\n- General Audio Caption: `Summarize the main content of the audio.`\\n- Sound Scene Tagging: `Utilize one keyword to convey the audio's content or the associated scene.`\\n\\n```python\\nimport librosa\\n\\nmodel = ...\\nmodel.init_tts()\\n\\n# Load the audio to be transcribed/analyzed\\naudio_input, _ = librosa.load(\\\"assets/Trump_WEF_2018_10s.mp3\\\", sr=16000, mono=True)\\n\\n# Choose a task prompt (see above for options)\\ntask_prompt = \\\"Please listen to the audio snippet carefully and transcribe the content.\\\\n\\\"\\nmsgs = [{\\\"role\\\": \\\"user\\\", \\\"content\\\": [task_prompt, audio_input]}]\\n\\nres = model.chat(\\n msgs=msgs,\\n do_sample=True,\\n max_new_tokens=512,\\n use_tts_template=True,\\n generate_audio=True,\\n temperature=0.3,\\n output_audio_path=\\\"result_audio_understanding.wav\\\",\\n)\\nprint(res)\\n```\\n\\n\\u003c/details\\u003e\\n\\n\\n### Visual Understanding \\u003c!-- omit in toc --\\u003e\\n\\n`MiniCPM-o-4.5` shares the same inference methods as `MiniCPM-V-4.5`.\\n\\n#### Chat with Single Image \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show single image chat code.\\u003c/summary\\u003e\\n\\n```python\\nimport torch\\nfrom PIL import Image\\nfrom transformers import AutoModel\\n\\nmodel = AutoModel.from_pretrained(\\n \\\"openbmb/MiniCPM-o-4_5\\\",\\n trust_remote_code=True,\\n attn_implementation=\\\"sdpa\\\", # or \\\"flash_attention_2\\\"\\n torch_dtype=torch.bfloat16,\\n init_vision=True,\\n init_audio=False,\\n init_tts=False,\\n)\\nmodel.eval().cuda()\\n\\nimage = Image.open(\\\"assets/fossil.png\\\").convert(\\\"RGB\\\")\\nquestion = \\\"What is in the image?\\\"\\nmsgs = [{\\\"role\\\": \\\"user\\\", \\\"content\\\": [image, question]}]\\n\\nres = model.chat(msgs=msgs, use_tts_template=False)\\nprint(res)\\n```\\n\\n\\u003c/details\\u003e\\n\\n#### Chat with Multiple Images \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show Python code for multi-image input.\\u003c/summary\\u003e\\n\\n```python\\nimport torch\\nfrom PIL import Image\\nfrom transformers import AutoModel\\n\\nmodel = ...\\n\\nimage1 = Image.open(\\\"assets/highway.png\\\").convert(\\\"RGB\\\")\\nimage2 = Image.open(\\\"assets/fossil.png\\\").convert(\\\"RGB\\\")\\nquestion = \\\"Compare image 1 and image 2, tell me about the differences between them.\\\"\\nmsgs = [{\\\"role\\\": \\\"user\\\", \\\"content\\\": [image1, image2, question]}]\\n\\nanswer = model.chat(msgs=msgs, use_tts_template=False, enable_thinking=False)\\nprint(answer)\\n```\\n\\n\\u003c/details\\u003e\\n\\n#### In-Context Few-Shot Learning \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show Python code for few-shot learning.\\u003c/summary\\u003e\\n\\n```python\\nfrom PIL import Image\\n\\nmodel = ...\\n\\nquestion = \\\"production date\\\"\\nimage1 = Image.open(\\\"example1.jpg\\\").convert(\\\"RGB\\\")\\nanswer1 = \\\"2023.08.04\\\"\\nimage2 = Image.open(\\\"example2.jpg\\\").convert(\\\"RGB\\\")\\nanswer2 = \\\"2007.04.24\\\"\\nimage_test = Image.open(\\\"test.jpg\\\").convert(\\\"RGB\\\")\\n\\nmsgs = [\\n {\\\"role\\\": \\\"user\\\", \\\"content\\\": [image1, question]},\\n {\\\"role\\\": \\\"assistant\\\", \\\"content\\\": [answer1]},\\n {\\\"role\\\": \\\"user\\\", \\\"content\\\": [image2, question]},\\n {\\\"role\\\": \\\"assistant\\\", \\\"content\\\": [answer2]},\\n {\\\"role\\\": \\\"user\\\", \\\"content\\\": [image_test, question]},\\n]\\n\\nanswer = model.chat(msgs=msgs, use_tts_template=False, enable_thinking=False)\\nprint(answer)\\n```\\n\\n\\u003c/details\\u003e\\n\\n#### Chat with Video \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show Python code for video input.\\u003c/summary\\u003e\\n\\n```python\\nimport torch\\nfrom minicpmo.utils import get_video_frame_audio_segments\\nfrom transformers import AutoModel\\n\\nmodel = ...\\n\\nvideo_path = \\\"assets/Skiing.mp4\\\"\\nvideo_frames, _, _ = get_video_frame_audio_segments(video_path)\\nprint(\\\"num frames:\\\", len(video_frames))\\n\\nquestion = \\\"Describe the video\\\"\\nmsgs = [{\\\"role\\\": \\\"user\\\", \\\"content\\\": video_frames + [question]}]\\n\\nanswer = model.chat(\\n msgs=msgs,\\n max_new_tokens=128,\\n use_image_id=False,\\n max_slice_nums=1,\\n use_tts_template=False,\\n enable_thinking=False, # Set True to enable thinking mode\\n)\\nprint(answer)\\n```\\n\\n\\u003c/details\\u003e\\n\\n\\n### Structured Content Input \\u003c!-- omit in toc --\\u003e\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show structured content input details.\\u003c/summary\\u003e\\n\\nThe `chat` method accepts message content in two formats:\\n\\n**Native format** – pass Python objects directly:\\n```python\\nmsgs = [{\\\"role\\\": \\\"user\\\", \\\"content\\\": [pil_image, audio_ndarray, \\\"Describe this.\\\"]}]\\n```\\n\\n**OpenAI-compatible format** – use structured dictionaries:\\n```python\\nmsgs = [\\n {\\n \\\"role\\\": \\\"user\\\",\\n \\\"content\\\": [\\n {\\\"type\\\": \\\"image_url\\\", \\\"image_url\\\": {\\\"url\\\": \\\"/path/to/image.jpg\\\"}},\\n {\\\"type\\\": \\\"audio_url\\\", \\\"audio_url\\\": {\\\"url\\\": \\\"/path/to/audio.wav\\\"}},\\n {\\\"type\\\": \\\"video_url\\\", \\\"video_url\\\": {\\\"url\\\": \\\"/path/to/video.mp4\\\", \\\"use_audio\\\": True}},\\n {\\\"type\\\": \\\"text\\\", \\\"text\\\": \\\"Describe this.\\\"}\\n ]\\n }\\n]\\n```\\n\\n**Supported types:**\\n\\n| Type | Input | Converts to |\\n|------|-------|-------------|\\n| `text` | `{\\\"type\\\": \\\"text\\\", \\\"text\\\": \\\"...\\\"}` | `str` |\\n| `image_url` | `{\\\"type\\\": \\\"image_url\\\", \\\"image_url\\\": {\\\"url\\\": \\\"...\\\"}}` | `PIL.Image` |\\n| `audio_url` | `{\\\"type\\\": \\\"audio_url\\\", \\\"audio_url\\\": {\\\"url\\\": \\\"...\\\"}}` | `np.ndarray` (16kHz mono) |\\n| `video_url` | `{\\\"type\\\": \\\"video_url\\\", \\\"video_url\\\": {\\\"url\\\": \\\"...\\\", \\\"stack_frames\\\": 1, \\\"use_audio\\\": True}}` | `List[Image, ndarray, ...]` |\\n\\n- **URL sources**: local file paths or `http://`/`https://` URLs\\n- **Mixed formats**: native objects and structured dicts can be combined in the same content list\\n\\n\\u003c/details\\u003e\\n\\n\\n## Deploy a Realtime Web Demo on Your Own Device\\n\\n### Option A (Recommended): **PyTorch Inference with Nvidia GPU** for 100% model precision with no deductions in performance.\\n\\nWe provide a PyTorch-based [simplified yet full-functional web demo](https://github.com/OpenBMB/minicpm-o-4_5-pytorch-simple-demo) which could boost the model inference performance, supports:\\n\\n- full-duplex omnimodal live streaming\\n- full-duplex speech live streaming\\n- half-duplex speech live streaming (under development)\\n- turn-based chat conversation\\n- customizable system prompts\\n- customizable reference audio\\n- simple and readable codebase for continual development\\n- serve as API backend for third-party applications\\n\\nRequirements:\\n- Nvidia GPU with at least 28GB GPU memory. *We are working on optimizing the model for lower GPU memory usage.*\\n\\n### Option B: **llama.cpp-omni** for end-side inference with PCs like Mac and low-resource devices.\\n\\nWith a fully C++ implementation of `MiniCPM-o 4.5` and quantized weights, `llama.cpp-omni` supports:\\n- half-duplex speech realtime conversation\\n- full-duplex omnimodal live streaming\\n\\nWe provide [ready-to-run guidance](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/web_demo/WebRTC_Demo/README.md) to access the low-latency full-duplex communication directly on your own Mac using our new official Docker image.\\n\\nRequirements: \\n- For half-duplex speech realtime conversation: Apple M3/M4/M5 chip with at least 16GB RAM or low-resource Nvidia GPU with at least 12GB GPU memory\\n- For full-duplex omnimodal live streaming: Apple M4 Max chip with at least 24GB RAM or low-resource Nvidia GPU with at least 12GB GPU memory\\n\\n## Supported Frameworks\\n\\n### FlagOS\\n\\nTo enable large-scale deployment across different AI chips, Beijing Zhiyuan Research Institute, together with numerous research institutions, chip manufacturers, system vendors, and algorithm and software organizations both domestically and internationally, jointly initiated and established the FlagOS Open Source Community.\\n\\nThe FlagOS community is dedicated to building a unified, open-source system software stack for various AI chips, encompassing core open-source projects such as a large-scale operator library, a unified AI compiler, parallel training and inference frameworks, and a unified communication library. It aims to create an open technology ecosystem connecting the \\\"model-system-chip\\\" layers. By enabling \\\"develop once, deploy across chips\\\", FlagOS unlocks the computational potential of hardware, breaks down the ecosystem silos between different chip software stacks, and effectively reduces migration costs for developers. The FlagOS community fosters an AI hardware and software ecosystem, overcomes single-vendor closed-source monopolies, promotes widespread deployment of AI hardware technologies, and is committed to rooted in China while embracing global collaboration.\\nOfficial website: https://flagos.io.\\n\\n\\u003cdetails\\u003e\\n\\u003csummary\\u003eClick to show FlagOS details.\\u003c/summary\\u003e\\n\\n#### FlagOS: Supporting Multiple AI Chips \\u003c!-- omit in toc --\\u003e\\n\\nThanks to FlagOS's unified multi-chip AI system software stack, MiniCPM-o 4.5 was adapted to 6 different AI chips in an extremely short time. Currently, the multi-chip version of MiniCPM-o 4.5 has been released on FlagRelease, FlagOS's platform for automatic migration, adaptation, and deployment of large models across multi-architecture AI chips. Details are as follows:\\n\\n| Vendor | ModelScope | Huggingface |\\n|:----------------|:------------:|:------------:|\\n| Nvidia | [MiniCPM-o-4.5-nvidia-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) | [MiniCPM-o-4.5-nvidia-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |\\n| Hygon-BW1000 | [MiniCPM-o-4.5-hygon-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) | [MiniCPM-o-4.5-hygon-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) |\\n| Metax-C550 | [MiniCPM-o-4.5-metax-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) | [MiniCPM-o-4.5-metax-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) |\\n| Iluvatar-BIV150 | [MiniCPM-o-4.5-iluvatar-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) | [MiniCPM-o-4.5-iluvatar-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) |\\n| Ascend-A3 | [MiniCPM-o-4.5-ascend-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) | [MiniCPM-o-4.5-ascend-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) |\\n| Zhenwu-810E | [MiniCPM-o-4.5-zhenwu-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) | [MiniCPM-o-4.5-zhenwu-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) |\\n\\n##### Comprehensive Evaluation \\u003c!-- omit in toc --\\u003e\\n\\n###### Transformers-FlagOS version \\u003c!-- omit in toc --\\u003e\\n\\nAccuracy Difference between `USE_FLAGOS=1` on multi-backend and `USE_FLAGOS=0` on Nvidia-CUDA\\n\\n| Metrics | FlagOS Backend | Difference with Nvidia-CUDA |\\n|:-------------------------|:---------------:|:---------------------------:|\\n| Video-MME 0-shot avg@1 ↑ | Nvidia | 0.33% |\\n| Video-MME 0-shot avg@1 ↑ | Hygon-BW1000 | 0.17% |\\n| Video-MME 0-shot avg@1 ↑ | Ascend-A3 | 0.50% |\\n| Video-MME 0-shot avg@1 ↑ | Iluvatar-BIV150 | 1.83% |\\n| Video-MME 0-shot avg@1 ↑ | Metax-C550 | 0.75% |\\n\\n\\n###### VLLM-FlagOS version \\u003c!-- omit in toc --\\u003e\\n\\nAccuracy Difference between `USE_FLAGGEMS=1 FLAGCX_PATH=/workspace/FlagCX` on Nvidia or `USE_FLAGGEMS=1` on ZHENW 810E, and launching vllm server directly on Nvidia\\n\\n| Metrics (avg@1) | Difference between Nvidia-FlagOS and Nvidia-CUDA | Difference between Zhenwu-FlagOS and Nvidia-CUDA |\\n|:--------------------|:------------------------------------------------:|:------------------------------------------------:|\\n| CMMMU ↑ | 0.72% | 3.5% |\\n| MMMU ↑ | 1.44% | 1.18% |\\n| MMMU_Pro_standard ↑ | 0.83% | 0.22% |\\n| MM-Vet v2 ↑ | 0.46% | 1.33% |\\n| OCRBench ↑ | 0.10% | 1% |\\n| CII-Bench ↑ | 0.40% | 0.13% |\\n| Blink ↑ | 1.90% | 2.19% |\\n\\n#### FlagOS Usage \\u003c!-- omit in toc --\\u003e\\n\\n##### FlagOS Performance Acceleration on Nvidia \\u003c!-- omit in toc --\\u003e\\n\\nOn the Transformers version, under the premise of precision alignment between the CUDA and FlagOS ecosystems, FlagOS achieves a 6% performance improvement in total task execution time compared to CUDA.\\n\\n###### From FlagRelease【Recommendation】 \\u003c!-- omit in toc --\\u003e\\n\\nFlagRelease is a platform developed by the FlagOS team for automatic migration, adaptation, and deployment of large models across multi-architecture AI chips. The multi-chip version of MiniCPM-o 4.5 has already been released on FlagRelease. All necessary software packages are pre-installed on the platform, so users do not need to install anything.\\n\\n- FlagRelease Image Key Versions \\n\\n | Component | Version |\\n |:------------------------|:------------------------------------|\\n | Accelerator Card Driver | 570.158.01 |\\n | CUDA SDK Build | cuda_13.0.r13.0/compiler.36424714_0 |\\n | FlagTree | 0.4.0+3.5 |\\n | FlagGems | 4.2.1rc0 |\\n | vllm \\u0026 vllm-plugin-fl | 0.13.0 + vllm_fl 0.0.0 |\\n | FlagCX | 0.1.0 | \\n\\n- FlagRelease Quick Start\\n\\n | Vendor | ModelScope | Huggingface |\\n |:-----------|:------------:|:------------:|\\n | Nvidia | [MiniCPM-o-4.5-nvidia-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) | [MiniCPM-o-4.5-nvidia-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |\\n | Hygon-BW1000 | [MiniCPM-o-4.5-hygon-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) | [MiniCPM-o-4.5-hygon-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-hygon-FlagOS) |\\n | Metax-C550 | [MiniCPM-o-4.5-metax-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) | [MiniCPM-o-4.5-metax-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-metax-FlagOS) |\\n | Iluvatar-BIV150 | [MiniCPM-o-4.5-iluvatar-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) | [MiniCPM-o-4.5-iluvatar-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-iluvatar-FlagOS) |\\n | Ascend-A3 | [MiniCPM-o-4.5-ascend-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) | [MiniCPM-o-4.5-ascend-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-ascend-FlagOS) |\\n | Zhenwu-810E | [MiniCPM-o-4.5-zhenwu-FlagOS](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) | [MiniCPM-o-4.5-zhenwu-FlagOS](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-zhenwu-FlagOS) | \\n\\n\\n##### From Scratch \\u003c!-- omit in toc --\\u003e\\n\\n- Dependencies: Python 3.12, GLIBC 2.39, GLIBCXX 3.4.33, CXXABI 1.3.15\\n\\n###### Transformers \\u003c!-- omit in toc --\\u003e\\n\\n- Installing the FlagOS Operator Library\\n\\n Official Repository: https://github.com/flagos-ai/FlagGems\\n \\n ```shell\\n pip install flag-gems==4.2.1rc0\\n ```\\n\\n- Installing the FlagOS Compiler\\n\\n Official Repository: https://github.com/flagos-ai/flagtree\\n \\n Quick Reference for Core Dependency Versions: https://github.com/flagos-ai/FlagTree/blob/main/documents/build.md#tips-for-building\\n \\n ```shell\\n pip uninstall triton\\n \\n python3 -m pip install flagtree==0.4.0+3.5 --index-url=https://resource.flagos.net/repository/flagos-pypi-hosted/simple --trusted-host=https://resource.flagos.net\\n ```\\n\\n- Activating Acceleration\\n\\n Add `USE_FLAGOS=1` before the command for the task you want to run. For example, when you run:\\n ```shell\\n python3 generate_speech_from_video.py\\n ```\\n \\n To use the MiniCPM-o-4.5 model to generate spoken responses from video content, you can:\\n \\n ```shell\\n USE_FLAGOS=1 python3 generate_speech_from_video.py\\n ```\\n \\n to accelerate this process with FlagOS.\\n\\n###### vLLM Version \\u003c!-- omit in toc --\\u003e\\n\\n- Installing the FlagOS Operator Library\\n\\n Official Repository: https://github.com/flagos-ai/FlagGems\\n\\n ```shell\\n pip install flag-gems==4.2.1rc0\\n pip install triton==3.5.1\\n ```\\n\\n- Activating Acceleration\\n\\n Add `USE_FLAGOS=1` before the command for the task you want to run. For example, when you run:\\n ```shell\\n vllm serve ${model_path} --dtype auto --gpu_memory_utilization 0.9 --trust-remote-code --max-num-batched-tokens 2048 --served-model-name cpmo --port ${Port}\\n ```\\n\\n To start the MiniCPM-o-4.5 server, you can:\\n ```shell\\n USE_FLAGOS=1 vllm serve ${model_path} --dtype auto --gpu_memory_utilization 0.9 --trust-remote-code --max-num-batched-tokens 2048 --served-model-name cpmo --port ${Port}\\n ```\\n to accelerate this process with FlagOS.\\n\\n#### Using FlagOS Unified Multi-Chip Backend Plugin \\u003c!-- omit in toc --\\u003e\\n\\n[vllm-plugin-FL](https://github.com/flagos-ai/vllm-plugin-FL) is a plugin built for the vLLM inference/service framework. Developed on top of FlagOS's unified multi-chip backend, it is designed to extend vLLM's capabilities and performance across a variety of hardware environments.\\n\\n##### Using vllm-plugin-FL \\u003c!-- omit in toc --\\u003e\\n\\n| Vendor | From Scratch | From FlagRelease |\\n|:-------|:-------------|:----------------|\\n| Nvidia | [vllm-plugin-FL/MiniCPM-o-4.5](https://github.com/flagos-ai/vllm-plugin-FL/blob/main/examples/minicpm/README.md) | [MiniCPM-o-4.5-ModelScope](https://modelscope.cn/models/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS), [MiniCPM-o-4.5-HuggingFace](https://huggingface.co/FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS) |\\n\\n\\u003c/details\\u003e\\n\\n### vLLM, SGLang, llama.cpp, Ollama\\n\\nWe support inference with vLLM, SGLang, llama.cpp and Ollama. Refer to our [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-Cookbook) for more details.\\n\\n### LLaMA-Factory, SWIFT\\n\\nWe support fine-tuning with LLaMA-Factory, SWIFT. Refer to our [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-Cookbook) for more details.\\n\\n## MiniCPM-V \\u0026 o Cookbook\\n\\nDiscover comprehensive, ready-to-deploy solutions for the MiniCPM-V and MiniCPM-o model series in our structured [Cookbook](https://github.com/OpenSQZ/MiniCPM-V-CookBook), which empowers developers to rapidly implement multimodal AI applications with integrated vision, speech, and live-streaming capabilities. Key features include:\\n\\n**Easy Usage Documentation**\\n\\nOur comprehensive [documentation website](https://minicpm-o.readthedocs.io/en/latest/index.html) presents every recipe in a clear, well-organized manner.\\nAll features are displayed at a glance, making it easy for you to quickly find exactly what you need.\\n\\n**Broad User Spectrum**\\n\\nWe support a wide range of users, from individuals to enterprises and researchers.\\n\\n* **Individuals**: Enjoy effortless inference using Ollama ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/ollama/minicpm-v4_ollama.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/ollama/minicpm-o4_5_ollama.md)) and Llama.cpp ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/llama.cpp/minicpm-v4_llamacpp.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/llama.cpp/minicpm-o4_5_llamacpp.md)) with minimal setup.\\n* **Enterprises**: Achieve high-throughput, scalable performance with vLLM ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/vllm/minicpm-v4_vllm.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/vllm/minicpm-o4_5_vllm.md)) and SGLang ([V4](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/sglang/MiniCPM-v4_sglang.md), [o4.5](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/sglang/MiniCPM-o4_5_sglang.md)).\\n* **Researchers**: Leverage advanced frameworks including [Transformers](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/finetune_full.md), [LLaMA-Factory](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/finetune_llamafactory.md), [SWIFT](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/swift.md), and [Align-anything](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/align_anything.md) to enable flexible model development and cutting-edge experimentation.\\n\\n**Versatile Deployment Scenarios**\\n\\nOur ecosystem delivers optimal solution for a variety of hardware environments and deployment demands.\\n\\n* **Web Demo**: Full-duplex real-time video interaction solution with high responsiveness and low latency. [WebRTC_Demo](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/web_demo/WebRTC_Demo/README.md).\\n* **Quantized deployment**: Maximize efficiency and minimize resource consumption using [GGUF](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/quantization/gguf/minicpm-v4_gguf_quantize.md) and [BNB](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/quantization/bnb/minicpm-v4_bnb_quantize.md).\\n* **End devices**: Bring powerful AI experiences to [iPhone and iPad](https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/ios_demo/ios.md), supporting offline and privacy-sensitive applications.\\n\\n## License\\n#### Model License\\n* The MiniCPM-o/V model weights and code are open-sourced under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM-V/blob/main/LICENSE) license.\\n\\n#### Statement\\n* As MLLMs, MiniCPM-o/V models generate content by learning a large number of multimodal corpora, but they cannot comprehend, express personal opinions, or make value judgements. Anything generated by MiniCPM-o/V models does not represent the views and positions of the model developers\\n* We will not be liable for any problems arising from the use of MiniCPM-o/V models, including but not limited to data security issues, risk of public opinion, or any risks and problems arising from the misdirection, misuse, dissemination, or misuse of the model.\\n\\n\\n## Key Techniques and Other Multimodal Projects \\u003c!-- omit in toc --\\u003e\\n\\n👏 Welcome to explore key techniques of MiniCPM-o/V and other multimodal projects of our team:\\n\\n[VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLPR](https://github.com/OpenBMB/RLPR) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V)\\n\\n\\n## Citation \\u003c!-- omit in toc --\\u003e\\n\\nIf you find our model/code/paper helpful, please consider citing our papers 📝 and staring us ⭐️!\\n\\n```bib\\n@article{cui2026minicpmo45realtimefullduplex,\\n title={MiniCPM-o 4.5: Towards Real-Time Full-Duplex Omni-Modal Interaction}, \\n author={Junbo Cui and Bokai Xu and Chongyi Wang and Tianyu Yu and Weiyue Sun and Yingjing Xu and Tianran Wang and Zhihui He and Wenshuo Ma and Tianchi Cai and Jiancheng Gui and Luoyuan Zhang and Xian Sun and Fuwei Huang and Moye Chen and Zhuo Lin and Hanyu Liu and Qingxin Gui and Qingzhe Han and Yuyang Wen and Huiping Liu and Rongkang Wang and Yaqi Zhang and Hongliang Wei and Chi Chen and You Li and Kechen Fang and Jie Zhou and Yuxuan Li and Guoyang Zeng and Chaojun Xiao and Yankai Lin and Xu Han and Maosong Sun and Zhiyuan Liu and Yuan Yao},\\n year={2026},\\n eprint={2604.27393},\\n archivePrefix={arXiv},\\n primaryClass={cs.CL},\\n url={https://arxiv.org/abs/2604.27393}, \\n}\\n```\\n\",\"ReadMeTips\":null,\"RelatedArxivId\":[\"2604.27393\"],\"RelatedPaper\":[273701],\"Revision\":\"master\",\"Stars\":72,\"StorageSize\":20048636812,\"Studios\":[],\"SubVisionFoundation\":\"\",\"SupportApiInference\":false,\"SupportDashDeployment\":0,\"SupportDashInference\":0,\"SupportDashTraining\":0,\"SupportDeployment\":0,\"SupportExperience\":0,\"SupportFinetuning\":0,\"SupportFlexTrain\":0,\"SupportInference\":\"\",\"SupportPaiModelGallery\":null,\"SupportPaiSdk\":0,\"SupportPivotEval\":false,\"SwingDeployInfo\":null,\"Tags\":[\"minicpm-o\",\"minicpm-v\",\"multimodal\",\"full-duplex\"],\"Tasks\":[{\"ChineseName\":\"统一多模态\",\"Description\":\"\",\"DomainName\":\"multi-modal\",\"Id\":298,\"IsExhibition\":true,\"IsHot\":0,\"IsLeaf\":true,\"IsLoginRequired\":false,\"IsRetrieval\":true,\"Level\":1,\"Name\":\"any-to-any\",\"ParentId\":-1,\"ParentTask\":null,\"Sorting\":0,\"SupportWidgets\":false,\"TypicalModel\":\"\",\"WidgetConfig\":\"{}\",\"WidgetValidator\":\"\"}],\"Tools\":[],\"TriggerWords\":null,\"Visibility\":5,\"VisionFoundation\":\"\",\"_\":null,\"widgets\":[]}";
window.MODELSCOPE_SITE_MODE = "COMMON";
window.MODELSCOPE_DEMO_SCHEMA_VERSION = "0.0.2"
</script>
<script>
</script>
<script
src="//g.alicdn.com/alilog/mlog/aplus_v2.js"
id="beacon-aplus"
></script>
<script src="//g.alicdn.com/vip/havana-login/0.3.2/js/mini-login-embedder-min.js" crossorigin="anonymous"></script>
<script src="//o.alicdn.com/captcha-frontend/aliyunCaptcha/AliyunCaptcha.js" crossorigin="anonymous" async></script>
<script src="//lang.alicdn.com/mcms/modelscope-fe/0.0.210/modelscope-fe.json" crossorigin="anonymous"></script>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-K9CSTSKFC5"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-K9CSTSKFC5');
</script>
</head>
<body>
<div id="root"></div>
<script src="//g.alicdn.com/sail-web/maas/2.13.89/umi.js" crossorigin="anonymous"></script>
</body>
</html>