YummyYum committed
Commit
948433e
·
verified ·
1 Parent(s): dd13327

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +2 -0
  2. MiniCPM-o-4.5-nvidia-FlagOS/.msc +0 -0
  3. MiniCPM-o-4.5-nvidia-FlagOS/.mv +1 -0
  4. MiniCPM-o-4.5-nvidia-FlagOS/README.md +357 -0
  5. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/audio_utils.cpython-312.pyc +0 -0
  6. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/chunk_prefill_generate.cpython-312.pyc +0 -0
  7. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/configuration_minicpmo.cpython-312.pyc +0 -0
  8. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/configuration_minicpmtts.cpython-312.pyc +0 -0
  9. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/modeling_minicpmo.cpython-312.pyc +3 -0
  10. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/modeling_navit_siglip.cpython-312.pyc +0 -0
  11. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/processing_audio_minicpma.cpython-312.pyc +0 -0
  12. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/processing_image_minicpmv.cpython-312.pyc +0 -0
  13. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/processing_minicpmo.cpython-312.pyc +0 -0
  14. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/processing_streaming_mel.cpython-312.pyc +0 -0
  15. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/resampler.cpython-312.pyc +0 -0
  16. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/sliding_utils.cpython-312.pyc +0 -0
  17. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/tts_streaming_generate.cpython-312.pyc +0 -0
  18. MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/utils.cpython-312.pyc +0 -0
  19. MiniCPM-o-4.5-nvidia-FlagOS/added_tokens.json +107 -0
  20. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/config.json +27 -0
  21. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/generation_config.json +14 -0
  22. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/merges.txt +0 -0
  23. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/model.safetensors +3 -0
  24. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/tokenizer_config.json +40 -0
  25. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/vocab.json +0 -0
  26. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/README.md +227 -0
  27. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/asset/dingding.png +0 -0
  28. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/campplus.onnx +3 -0
  29. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/config.json +27 -0
  30. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/configuration.json +1 -0
  31. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/cosyvoice2.yaml +233 -0
  32. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.cache.pt +3 -0
  33. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.decoder.estimator.fp32.onnx +3 -0
  34. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.encoder.fp16.zip +3 -0
  35. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.encoder.fp32.zip +3 -0
  36. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.pt +3 -0
  37. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/hift.pt +3 -0
  38. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/merges.txt +0 -0
  39. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/speech_tokenizer_v2.onnx +3 -0
  40. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/tokenizer_config.json +40 -0
  41. MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/vocab.json +0 -0
  42. MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/campplus.onnx +3 -0
  43. MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/flow.pt +3 -0
  44. MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/flow.yaml +34 -0
  45. MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/hift.pt +3 -0
  46. MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/speech_tokenizer_v2_25hz.onnx +3 -0
  47. MiniCPM-o-4.5-nvidia-FlagOS/audio_utils.py +236 -0
  48. MiniCPM-o-4.5-nvidia-FlagOS/chunk_prefill_generate.py +509 -0
  49. MiniCPM-o-4.5-nvidia-FlagOS/config.json +285 -0
  50. MiniCPM-o-4.5-nvidia-FlagOS/configuration_minicpmo.py +221 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/modeling_minicpmo.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
+ MiniCPM-o-4.5-nvidia-FlagOS/tokenizer.json filter=lfs diff=lfs merge=lfs -text
MiniCPM-o-4.5-nvidia-FlagOS/.msc ADDED
Binary file (6.2 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:master,CreatedAt:1769761691
MiniCPM-o-4.5-nvidia-FlagOS/README.md ADDED
@@ -0,0 +1,357 @@
1
+ ---
2
+ frameworks:
3
+ - ""
4
+ ---
5
+ # Introduction
6
+
7
+ **FlagOS** is a unified heterogeneous computing software stack for large models, co-developed with leading global chip manufacturers. With core technologies such as the **FlagScale** distributed training/inference framework (together with vllm-plugin-fl), the **FlagGems** universal operator library, the **FlagCX** communication library, and the **FlagTree** unified compiler, the **FlagRelease** platform leverages the **FlagOS** stack to automatically produce and release various \<chip + open-source model\> combinations. This enables efficient and automated model migration across diverse chips, opening a new chapter for large model deployment and application.
8
+
9
+ Based on this, the **MiniCPM-o-4.5-nvidia-FlagOS** model is adapted for the Nvidia chip using the FlagOS software stack, enabling:
10
+
11
+ ### Integrated Deployment
12
+
13
+ - Out-of-the-box inference scripts with pre-configured hardware and software parameters
14
+ - Released **FlagOS-Nvidia** container image supporting deployment within minutes
15
+
16
+ ### Consistency Validation
17
+
18
+ - Rigorously evaluated through benchmark testing: performance and results from the FlagOS software stack are compared against native stacks on multiple public benchmarks.
19
+
20
+ # Technical Overview
21
+
22
+ ## FlagGems
23
+
24
+ FlagGems is a high-performance, general-purpose operator library implemented in the [Triton](https://github.com/openai/triton) language. It is built on a collection of backend-neutral kernels that aim to accelerate LLM (Large Language Model) training and inference across diverse hardware platforms.
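+
+ As a rough sketch of how such a Triton-based operator library is typically switched on inside a PyTorch script, the snippet below assumes the `flag_gems` import name and the `flag_gems.enable()` entry point described in the FlagGems repository; the exact API may vary between versions, so treat it as illustrative only. (Within the FlagRelease images, the same effect is driven by the USE_FLAGOS / USE_FLAGGEMS environment variables shown later in this guide.)
+
+ ```python
+ # Illustrative sketch (assumed API): route supported torch ops through FlagGems' Triton kernels.
+ import torch
+ import flag_gems  # assumed package name from the FlagGems project
+
+ flag_gems.enable()  # assumed entry point; eligible ATen ops then dispatch to Triton kernels
+
+ x = torch.randn(1024, 1024, device="cuda")
+ y = torch.nn.functional.gelu(x @ x.T)  # runs on FlagGems kernels where available
+ print(y.shape)
+ ```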
25
+
26
+ ## FlagTree
27
+
28
+ FlagTree is an open-source, unified compiler project for multiple AI chips, dedicated to developing a diverse ecosystem of AI chip compilers and related tooling platforms, thereby fostering and strengthening the upstream and downstream Triton ecosystem. Currently in its initial phase, the project aims to maintain compatibility with existing adaptation solutions while unifying the codebase to rapidly implement single-repository, multi-backend support. For upstream model users, it provides unified compilation capabilities across multiple backends; for downstream chip manufacturers, it offers examples of Triton ecosystem integration.
29
+
30
+ ## FlagScale and vllm-plugin-fl
31
+
32
+ FlagScale is a comprehensive toolkit designed to support the entire lifecycle of large models. It builds on the strengths of several prominent open-source projects, including [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) and [vLLM](https://github.com/vllm-project/vllm), to provide a robust, end-to-end solution for managing and scaling large models.
33
+ vllm-plugin-fl is a vLLM plugin built on the FlagOS unified multi-chip backend; it helps FlagScale support multiple chips within the vLLM framework.
34
+
35
+ ## **FlagCX**
36
+
37
+ FlagCX is a scalable and adaptive cross-chip communication library. It serves as a platform where developers, researchers, and AI engineers can collaborate on various projects, contribute to the development of cutting-edge AI solutions, and share their work with the global community.
38
+
39
+ ## **FlagEval Evaluation Framework**
40
+
41
+ FlagEval is a comprehensive evaluation system and open platform for large models launched in 2023. It aims to establish scientific, fair, and open benchmarks, methodologies, and tools to help researchers assess model and training algorithm performance. It features:
42
+ - **Multi-dimensional Evaluation**: Supports 800+ model evaluations across NLP, CV, Audio, and Multimodal fields, covering 20+ downstream tasks including language understanding and image-text generation.
43
+ - **Industry-Grade Use Cases**: Has completed horizontal evaluations of mainstream large models, providing authoritative benchmarks for chip-model performance validation.
44
+
45
+ # Evaluation Results
46
+
47
+ ## Transformers version
48
+
49
+ Accuracy difference between USE_FLAGOS=1 and USE_FLAGOS=0 on Nvidia-CUDA:
50
+
51
+ | Metrics | Difference with Nvidia-CUDA |
52
+ | ---------------------- | --------------------- |
53
+ | Video-MME 0-shot avg@1 ↑ | 0.33% |
54
+
55
+ ## vLLM version
56
+
57
+ Accuracy difference between launching the vLLM server with USE_FLAGGEMS=1 FLAGCX_PATH=/workspace/FlagCX on Nvidia and launching it directly on Nvidia:
58
+
59
+ | Metrics(avg@1) | Difference with Nvidia-CUDA |
60
+ | ---------------------- | --------------------- |
61
+ | CMMMU ↑ | 0.72% |
62
+ | MMMU ↑ | 1.44% |
63
+ | MMMU_Pro_standard ↑ | 0.83% |
64
+ | MMMU_Pro_vision ↑ | 0.38% |
65
+ | MM-Vet v2 ↑ | 0.46% |
66
+ | OCRBench ↑ | 0.10% |
67
+ | MathVision ↑ | 0.41% |
68
+ | CII-Bench ↑ | 0.40% |
69
+ | Blink ↑ | 1.90% |
70
+ | MathVista ↑ | 0.70% |
71
+
72
+ # User Guide
73
+
74
+ ## Environment Setup
75
+
76
+ | Component | Version |
+ | ------------------------------- | ---------------------------------------- |
+ | Accelerator Card Driver | 570.158.01 |
78
+ | CUDA SDK Build | Build cuda_13.0.r13.0/compiler.36424714_0 |
79
+ | FlagTree | Version: 0.4.0+3.5 |
80
+ | FlagGems | Version: 4.2.1rc0 |
81
+ | vllm & vllm-plugin-fl | Version: 0.13.0 + vllm_fl 0.0.0 |
82
+ | FlagCX | Version: 0.1.0 |
83
+
84
+ ## Transformers version
85
+
86
+ ### Download Open-source Model Weights
87
+
88
+ ```bash
89
+ pip install modelscope
90
+ modelscope download --model FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS --local_dir /share/MiniCPMO45
91
+ ```
92
+
93
+ ### Download FlagOS Image
94
+
95
+ ```bash
96
+ docker pull harbor.baai.ac.cn/flagrelease-public/flagrelease-nvidia-gems_4.2.1rc0-tree_0.4-flagos_1.6-amd64
97
+ ```
98
+
99
+ ### Start the Container
100
+
101
+ ```bash
102
+ #Container Startup
103
+ docker run --init --detach --net=host --user 0 --ipc=host \
104
+ -v /share:/share --security-opt=seccomp=unconfined \
105
+ --privileged --ulimit=stack=67108864 --ulimit=memlock=-1 \
106
+ --shm-size=512G --gpus all \
107
+ --name flagos harbor.baai.ac.cn/flagrelease-public/flagrelease-nvidia-gems_4.2.1rc0-tree_0.4-flagos_1.6-amd64
108
+ docker exec -it flagos bash
109
+ ```
110
+
111
+ ### Use MiniCPM-o-4.5
112
+
113
+ You can refer to OpenBMB/MiniCPM-o-4.5 for how to use the model. With FlagOS, you can follow these steps to get better performance than the native CUDA stack:
114
+
115
+ 1. Write your own task script like generate_speech_from_video.py
116
+ 2. Execute
117
+
118
+ ```bash
119
+ python3 generate_speech_from_video.py
120
+ ```
121
+
122
+ to launch your job, exactly as you would with OpenBMB/MiniCPM-o-4.5.
123
+
124
+ 3. Execute
125
+
126
+ ```bash
127
+ USE_FLAGOS=1 python3 generate_speech_from_video.py
128
+ ```
129
+
130
+ to get better performance!
131
+
132
+ For example, you can write your generate_speech_from_video.py by referring to the following code, which is taken from OpenBMB/MiniCPM-o-4.5's README:
133
+
134
+ ```python
135
+ import json
136
+ import os
137
+
138
+ import librosa
139
+ import torch
140
+ from transformers import AutoTokenizer, AutoProcessor
141
+
142
+ from MiniCPMO45.modeling_minicpmo import MiniCPMO
143
+ from MiniCPMO45.modeling_minicpmo import TTSSamplingParams
144
+ from MiniCPMO45.processing_minicpmo import MiniCPMOProcessor
145
+ from MiniCPMO45.utils import get_video_frame_audio_segments
146
+
147
+
148
+ def gen(stack_frames=1, max_slice_nums=None):
149
+ ref_audio_path = "haitian_ref_audio.wav"
150
+ ref_name = "haitian_ref_audio"
151
+
152
+ ckpt_name = "job_79706_ckpt_2000"
153
+
154
+ save_dir = "./outputs"
155
+ os.makedirs(save_dir, exist_ok=True)
156
+ os.makedirs(os.path.join(save_dir, ckpt_name), exist_ok=True)
157
+
158
+ name_or_path = "./MiniCPMO45"
159
+ model = MiniCPMO.from_pretrained(name_or_path, trust_remote_code=True, _attn_implementation="flash_attention_2")
160
+
161
+ model.bfloat16()
162
+ model.eval().cuda()
163
+
164
+ model.init_tts(streaming=False)
165
+
166
+ filenames = {
167
+ "record_cases_info": "record_cases_info.jsonl",
168
+ }
169
+
170
+ use_sys_modes = {
171
+ "omni",
172
+ "audio_assistant",
173
+ "audio_roleplay",
174
+ "voice_cloning",
175
+ "voice_cloning_new",
176
+ }
177
+ use_sys_mode = os.environ.get("USE_SYS_MODE", "default")
178
+ if use_sys_mode not in use_sys_modes:
179
+ use_sys_mode = None
180
+
181
+ sys_msg = None
182
+ if use_sys_mode:
183
+ ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
184
+ sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode=use_sys_mode, language="en")
185
+
186
+ error_stat = {}
187
+ for name, filename in filenames.items():
188
+ datas = [json.loads(line) for line in open(filename, encoding="utf-8")]
189
+ n_datas = len(datas)
190
+ error_stat[name] = {}
191
+
192
+ results = []
193
+ identity_prefix = name
194
+ if use_sys_mode:
195
+ identity_prefix = f"{identity_prefix}_{use_sys_mode}_{ref_name}"
196
+
197
+ for id_item, item in enumerate(datas):
198
+ print(f"{id_item}/{n_datas}: {filename}")
199
+ try:
200
+ video_path = item["video_path"]
201
+ if item["source"] == "record_cases":
202
+ last_vad_timestamp = item["last_vad_timestamp"]
203
+ video_segments, audio_segments, stack_segments = get_video_frame_audio_segments(
204
+ video_path, last_vad_timestamp=last_vad_timestamp, stack_frames=stack_frames
205
+ )
206
+ else:
207
+ mix_origin_question = item["mix_origin_question"]
208
+ video_segments, audio_segments, stack_segments = get_video_frame_audio_segments(
209
+ video_path, audio_path=mix_origin_question, stack_frames=stack_frames
210
+ )
211
+ except:
212
+ import traceback
213
+
214
+ traceback.print_exc()
215
+ print(f"video get frame error, item={item}")
216
+ error_items = error_stat.get(name, {}).get("get_video_frame_audio_segments", [])
217
+ error_items.append(id_item)
218
+ error_stat[name]["get_video_frame_audio_segments"] = error_items
219
+ continue
220
+
221
+ omni_segments = []
222
+ for i in range(len(video_segments)):
223
+ omni_segments.append(video_segments[i])
224
+ omni_segments.append(audio_segments[i])
225
+ if stack_segments is not None and stack_segments[i] is not None:
226
+ omni_segments.append(stack_segments[i])
227
+
228
+ msgs = []
229
+ if sys_msg:
230
+ msgs.append(sys_msg)
231
+
232
+ msgs.append({"role": "user", "content": omni_segments})
233
+
234
+ try:
235
+ identity = f"{identity_prefix}_{id_item}"
236
+ output_audio_path = f"{save_dir}/{ckpt_name}/{identity}___generated.wav"
237
+
238
+ with torch.no_grad():
239
+ res, prompt = model.chat(
240
+ image=None,
241
+ msgs=msgs,
242
+ do_sample=True,
243
+ max_new_tokens=512,
244
+ max_inp_length=8192,
245
+ stream=False,
246
+ stream_input=False,
247
+ use_tts_template=True,
248
+ enable_thinking=False,
249
+ generate_audio=False,
250
+ output_audio_path=output_audio_path,
251
+ output_tts_inputs_embeds_path=None,
252
+ omni_mode=True,
253
+ max_slice_nums=max_slice_nums,
254
+ use_image_id=False,
255
+ teacher_forcing=False,
256
+ return_prompt=True,
257
+ tts_proj_layer=-1,
258
+ )
259
+ print(f"prompt: {prompt}")
260
+ result = {
261
+ "idx": id_item,
262
+ "item": item,
263
+ "prompt": prompt,
264
+ "answer": res,
265
+ "gen_audio_path": output_audio_path,
266
+ }
267
+
268
+ if use_sys_mode:
269
+ result["ref_audio_path"] = ref_audio_path
270
+
271
+ results.append(result)
272
+ except:
273
+ import traceback
274
+
275
+ traceback.print_exc()
276
+ print(f"error: msgs={msgs}")
277
+ error_items = error_stat.get("items", [])
278
+ error_items.append(id_item)
279
+ error_stat[name]["items"] = error_items
280
+
281
+ if results:
282
+ print(f"save into: {save_dir}/{identity_prefix}_{ckpt_name}.jsonl")
283
+ with open(f"{save_dir}/{identity_prefix}_{ckpt_name}.jsonl", "w") as fd:
284
+ for line in results:
285
+ fd.write(json.dumps(line, ensure_ascii=False) + "\n")
286
+ else:
287
+ print("no data")
288
+
289
+ print(error_stat)
290
+
291
+
292
+ if __name__ == "__main__":
293
+ stack_frames = 5 # 1 = normal; >1 = high frame rate (with 5, an extra 4 frames are stacked into one image)
294
+ max_slice_nums = 1 # 1 = normal; >1 = high resolution
295
+
296
+ gen(stack_frames=stack_frames, max_slice_nums=max_slice_nums)
297
+ ```
298
+
299
+ ## vLLM version
300
+
301
+ ### Download Open-source Model Weights
302
+
303
+ ```bash
304
+ pip install modelscope
305
+ modelscope download --model FlagRelease/MiniCPM-o-4.5-nvidia-FlagOS --local_dir /share/MiniCPMO45
306
+ ```
307
+
308
+ ### Download FlagOS Image
309
+
310
+ ```bash
311
+ docker pull harbor.baai.ac.cn/flagrelease-public/flagrelease-nvidia-gems_4.2.1rc0-tree_0.4-flagos_1.6-vllmpluginfl_0.0.0-flagcx_0.1.0-vllm_0.13.0-amd64
312
+ ```
313
+
314
+ ### Start the Container
315
+
316
+ ```bash
317
+ #Container Startup
318
+ docker run --init --detach --net=host --user 0 --ipc=host \
319
+ -v /share:/share --security-opt=seccomp=unconfined \
320
+ --privileged --ulimit=stack=67108864 --ulimit=memlock=-1 \
321
+ --shm-size=512G --gpus all \
322
+ --name flagos harbor.baai.ac.cn/flagrelease-public/flagrelease-nvidia-gems_4.2.1rc0-tree_0.4-flagos_1.6-vllmpluginfl_0.0.0-flagcx_0.1.0-vllm_0.13.0-amd64
323
+ docker exec -it flagos bash
324
+ ```
325
+
326
+ ### Serve and use MiniCPM-o-4.5 with vLLM
327
+
328
+ Note: you can refer to https://github.com/vllm-project/vllm to learn how to use vLLM.
329
+
330
+ You can use
331
+
332
+ ```bash
333
+ vllm serve /share/MiniCPMO45 --trust-remote-code
334
+ ```
335
+ to launch the server without FlagOS, and use
336
+
337
+ ```bash
338
+ USE_FLAGGEMS=1 FLAGCX_PATH=/workspace/FlagCX vllm serve /share/MiniCPMO45 --trust-remote-code
339
+ ```
340
+ to launch the server with FlagOS.
341
+
342
+ After that, you can interact with the vLLM server at 0.0.0.0:8000 like any other OpenAI-compatible endpoint, as sketched below.
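+
+ For instance, the following is a minimal sketch of querying the server through vLLM's OpenAI-compatible API. The port (8000) is vLLM's default, the model name mirrors the path passed to `vllm serve`, and the `openai` Python client call and prompt are illustrative assumptions rather than part of this release:
+
+ ```python
+ # Minimal sketch: talk to the OpenAI-compatible endpoint exposed by `vllm serve` above.
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="EMPTY")  # vLLM does not check the key by default
+ resp = client.chat.completions.create(
+     model="/share/MiniCPMO45",  # model name = the path given to `vllm serve`
+     messages=[{"role": "user", "content": "Describe MiniCPM-o 4.5 in one sentence."}],
+     max_tokens=64,
+ )
+ print(resp.choices[0].message.content)
+ ```
+
+ Multimodal requests follow the same API shape, with image or audio parts attached to the message content where the served model supports them.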
343
+
344
+
345
+ # Contributing
346
+
347
+ We warmly welcome global developers to join us:
348
+
349
+ 1. Submit Issues to report problems
350
+ 2. Create Pull Requests to contribute code
351
+ 3. Improve technical documentation
352
+ 4. Expand hardware adaptation support
353
+
354
+
355
+ # License
356
+
357
+ The weight files are from OpenBMB's MiniCPM-o-4.5, open-sourced under the Apache 2.0 license (https://www.apache.org/licenses/LICENSE-2.0.txt).
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/audio_utils.cpython-312.pyc ADDED
Binary file (8.69 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/chunk_prefill_generate.cpython-312.pyc ADDED
Binary file (17.9 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/configuration_minicpmo.cpython-312.pyc ADDED
Binary file (8.81 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/configuration_minicpmtts.cpython-312.pyc ADDED
Binary file (4.82 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/modeling_minicpmo.cpython-312.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c83b258a64e214443752591b19dbed4c8ce37974aaa9ae735c48e3de1a96df5d
3
+ size 216265
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/modeling_navit_siglip.cpython-312.pyc ADDED
Binary file (46.5 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/processing_audio_minicpma.cpython-312.pyc ADDED
Binary file (7.6 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/processing_image_minicpmv.cpython-312.pyc ADDED
Binary file (19.9 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/processing_minicpmo.cpython-312.pyc ADDED
Binary file (29.7 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/processing_streaming_mel.cpython-312.pyc ADDED
Binary file (15.6 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/resampler.cpython-312.pyc ADDED
Binary file (27.2 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/sliding_utils.cpython-312.pyc ADDED
Binary file (10.1 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/tts_streaming_generate.cpython-312.pyc ADDED
Binary file (9.08 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/__pycache__/utils.cpython-312.pyc ADDED
Binary file (16.7 kB). View file
 
MiniCPM-o-4.5-nvidia-FlagOS/added_tokens.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "</answer>": 151686,
3
+ "</box>": 151674,
4
+ "</focus>": 151688,
5
+ "</image>": 151670,
6
+ "</image_id>": 151682,
7
+ "</image_save_to>": 151696,
8
+ "</line>": 151690,
9
+ "</perception>": 151692,
10
+ "</point>": 151678,
11
+ "</quad>": 151676,
12
+ "</ref>": 151672,
13
+ "</slice>": 151680,
14
+ "</source_image>": 151694,
15
+ "</think>": 151668,
16
+ "</tool_call>": 151658,
17
+ "</tool_response>": 151666,
18
+ "</unit>": 151684,
19
+ "<answer>": 151685,
20
+ "<box>": 151673,
21
+ "<focus>": 151687,
22
+ "<image>": 151669,
23
+ "<image_id>": 151681,
24
+ "<image_save_to>": 151695,
25
+ "<line>": 151689,
26
+ "<perception>": 151691,
27
+ "<point>": 151677,
28
+ "<quad>": 151675,
29
+ "<ref>": 151671,
30
+ "<slice>": 151679,
31
+ "<source_image>": 151693,
32
+ "<think>": 151667,
33
+ "<tool_call>": 151657,
34
+ "<tool_response>": 151665,
35
+ "<unit>": 151683,
36
+ "<|audio_end|>": 151699,
37
+ "<|audio_start|>": 151697,
38
+ "<|audio|>": 151698,
39
+ "<|box_end|>": 151649,
40
+ "<|box_start|>": 151648,
41
+ "<|emotion_end|>": 151711,
42
+ "<|emotion_start|>": 151710,
43
+ "<|endoftext|>": 151643,
44
+ "<|file_sep|>": 151664,
45
+ "<|fim_middle|>": 151660,
46
+ "<|fim_pad|>": 151662,
47
+ "<|fim_prefix|>": 151659,
48
+ "<|fim_suffix|>": 151661,
49
+ "<|im_end|>": 151645,
50
+ "<|im_start|>": 151644,
51
+ "<|image_pad|>": 151655,
52
+ "<|interrupt|>": 151707,
53
+ "<|listen|>": 151705,
54
+ "<|object_ref_end|>": 151647,
55
+ "<|object_ref_start|>": 151646,
56
+ "<|pitch_end|>": 151715,
57
+ "<|pitch_start|>": 151714,
58
+ "<|quad_end|>": 151651,
59
+ "<|quad_start|>": 151650,
60
+ "<|repo_name|>": 151663,
61
+ "<|speak|>": 151706,
62
+ "<|speed_end|>": 151713,
63
+ "<|speed_start|>": 151712,
64
+ "<|spk_bos|>": 151700,
65
+ "<|spk_eos|>": 151702,
66
+ "<|spk|>": 151701,
67
+ "<|turn_bos|>": 151716,
68
+ "<|timbre_10|>": 151726,
69
+ "<|timbre_11|>": 151727,
70
+ "<|timbre_12|>": 151728,
71
+ "<|timbre_13|>": 151729,
72
+ "<|timbre_14|>": 151730,
73
+ "<|timbre_15|>": 151731,
74
+ "<|timbre_16|>": 151732,
75
+ "<|timbre_17|>": 151733,
76
+ "<|timbre_18|>": 151734,
77
+ "<|timbre_19|>": 151735,
78
+ "<|turn_eos|>": 151717,
79
+ "<|timbre_20|>": 151736,
80
+ "<|timbre_21|>": 151737,
81
+ "<|timbre_22|>": 151738,
82
+ "<|timbre_23|>": 151739,
83
+ "<|timbre_24|>": 151740,
84
+ "<|timbre_25|>": 151741,
85
+ "<|timbre_26|>": 151742,
86
+ "<|timbre_27|>": 151743,
87
+ "<|timbre_28|>": 151744,
88
+ "<|timbre_29|>": 151745,
89
+ "<|chunk_eos|>": 151718,
90
+ "<|timbre_30|>": 151746,
91
+ "<|timbre_31|>": 151747,
92
+ "<|chunk_bos|>": 151719,
93
+ "<|chunk_tts_bos|>": 151720,
94
+ "<|chunk_tts_eos|>": 151721,
95
+ "<|tts_pad|>": 151722,
96
+ "<|timbre_7|>": 151723,
97
+ "<|timbre_8|>": 151724,
98
+ "<|timbre_9|>": 151725,
99
+ "<|tts_bos|>": 151703,
100
+ "<|tts_eos|>": 151704,
101
+ "<|vad_end|>": 151709,
102
+ "<|vad_start|>": 151708,
103
+ "<|video_pad|>": 151656,
104
+ "<|vision_end|>": 151653,
105
+ "<|vision_pad|>": 151654,
106
+ "<|vision_start|>": 151652
107
+ }
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/config.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 896,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 4864,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 24,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 14,
16
+ "num_hidden_layers": 24,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_theta": 1000000.0,
20
+ "sliding_window": 32768,
21
+ "tie_word_embeddings": true,
22
+ "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.40.1",
24
+ "use_cache": true,
25
+ "use_sliding_window": false,
26
+ "vocab_size": 151936
27
+ }
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "pad_token_id": 151643,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 151645,
7
+ 151643
8
+ ],
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_p": 0.8,
12
+ "top_k": 20,
13
+ "transformers_version": "4.37.0"
14
+ }
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:130282af0dfa9fe5840737cc49a0d339d06075f83c5a315c3372c9a0740d0b96
3
+ size 988097824
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/CosyVoice-BlankEN/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/README.md ADDED
@@ -0,0 +1,227 @@
1
+ [![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners)
2
+
3
+ ## 👉🏻 CosyVoice 👈🏻
4
+ **CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
5
+
6
+ **CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M)
7
+
8
+ ## Highlight🔥
9
+
10
+ **CosyVoice 2.0** has been released! Compared to version 1.0, the new version offers more accurate, more stable, faster, and better speech generation capabilities.
11
+ ### Multilingual
12
+ - **Supported Languages**: Chinese, English, Japanese, Korean, and Chinese dialects (Cantonese, Sichuanese, Shanghainese, Tianjinese, Wuhanese, etc.)
13
+ - **Crosslingual & Mixlingual**: Supports zero-shot voice cloning for cross-lingual and code-switching scenarios.
14
+ ### Ultra-Low Latency
15
+ - **Bidirectional Streaming Support**: CosyVoice 2.0 integrates offline and streaming modeling technologies.
16
+ - **Rapid First Packet Synthesis**: Achieves latency as low as 150ms while maintaining high-quality audio output.
17
+ ### High Accuracy
18
+ - **Improved Pronunciation**: Reduces pronunciation errors by 30% to 50% compared to CosyVoice 1.0.
19
+ - **Benchmark Achievements**: Attains the lowest character error rate on the hard test set of the Seed-TTS evaluation set.
20
+ ### Strong Stability
21
+ - **Consistency in Timbre**: Ensures reliable voice consistency for zero-shot and cross-language speech synthesis.
22
+ - **Cross-language Synthesis**: Marked improvements compared to version 1.0.
23
+ ### Natural Experience
24
+ - **Enhanced Prosody and Sound Quality**: Improved alignment of synthesized audio, raising MOS evaluation scores from 5.4 to 5.53.
25
+ - **Emotional and Dialectal Flexibility**: Now supports more granular emotional controls and accent adjustments.
26
+
27
+ ## Roadmap
28
+
29
+ - [x] 2024/12
30
+
31
+ - [x] 25hz cosyvoice 2.0 released
32
+
33
+ - [x] 2024/09
34
+
35
+ - [x] 25hz cosyvoice base model
36
+ - [x] 25hz cosyvoice voice conversion model
37
+
38
+ - [x] 2024/08
39
+
40
+ - [x] Repetition Aware Sampling(RAS) inference for llm stability
41
+ - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization
42
+
43
+ - [x] 2024/07
44
+
45
+ - [x] Flow matching training support
46
+ - [x] WeTextProcessing support when ttsfrd is not available
47
+ - [x] Fastapi server and client
48
+
49
+
50
+ ## Install
51
+
52
+ **Clone and install**
53
+
54
+ - Clone the repo
55
+ ``` sh
56
+ git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
57
+ # If cloning the submodule failed due to network problems, run the following command until it succeeds
58
+ cd CosyVoice
59
+ git submodule update --init --recursive
60
+ ```
61
+
62
+ - Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
63
+ - Create Conda env:
64
+
65
+ ``` sh
66
+ conda create -n cosyvoice python=3.10
67
+ conda activate cosyvoice
68
+ # pynini is required by WeTextProcessing; install it with conda since it can run on all platforms.
69
+ conda install -y -c conda-forge pynini==2.1.5
70
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
71
+
72
+ # If you encounter sox compatibility issues
73
+ # ubuntu
74
+ sudo apt-get install sox libsox-dev
75
+ # centos
76
+ sudo yum install sox sox-devel
77
+ ```
78
+
79
+ **Model download**
80
+
81
+ We strongly recommend that you download our pretrained `CosyVoice2-0.5B` `CosyVoice-300M` `CosyVoice-300M-SFT` `CosyVoice-300M-Instruct` model and `CosyVoice-ttsfrd` resource.
82
+
83
+ ``` python
84
+ # Download models via the ModelScope SDK
85
+ from modelscope import snapshot_download
86
+ snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
87
+ snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
88
+ snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
89
+ snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
90
+ snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
91
+ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
92
+ ```
93
+
94
+ ``` sh
95
+ # Download models via git; make sure git lfs is installed
96
+ mkdir -p pretrained_models
97
+ git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B
98
+ git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
99
+ git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
100
+ git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
101
+ git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
102
+ git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
103
+ ```
104
+
105
+ Optionally, you can unzip the `ttsfrd` resource and install the `ttsfrd` package for better text normalization performance.
106
+
107
+ Note that this step is not necessary. If you do not install the `ttsfrd` package, WeTextProcessing is used by default.
108
+
109
+ ``` sh
110
+ cd pretrained_models/CosyVoice-ttsfrd/
111
+ unzip resource.zip -d .
112
+ pip install ttsfrd_dependency-0.1-py3-none-any.whl
113
+ pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
114
+ ```
115
+
116
+ **Basic Usage**
117
+
118
+ We strongly recommend using `CosyVoice2-0.5B` for better performance.
119
+ Follow code below for detailed usage of each model.
120
+
121
+ ``` python
122
+ import sys
123
+ sys.path.append('third_party/Matcha-TTS')
124
+ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
125
+ from cosyvoice.utils.file_utils import load_wav
126
+ import torchaudio
127
+ ```
128
+
129
+ **CosyVoice2 Usage**
130
+ ```python
131
+ cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
132
+
133
+ # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
134
+ # zero_shot usage
135
+ prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
136
+ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
137
+ torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
138
+
139
+ # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
140
+ for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
141
+ torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
142
+
143
+ # instruct usage
144
+ for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
145
+ torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
146
+ ```
147
+
148
+ **CosyVoice Usage**
149
+ ```python
150
+ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
151
+ # sft usage
152
+ print(cosyvoice.list_available_spks())
153
+ # change stream=True for chunk stream inference
154
+ for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
155
+ torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
156
+
157
+ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M') # or change to pretrained_models/CosyVoice-300M-25Hz for 25Hz inference
158
+ # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
159
+ prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
160
+ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
161
+ torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
162
+ # cross_lingual usage
163
+ prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
164
+ for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
165
+ torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
166
+ # vc usage
167
+ prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
168
+ source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
169
+ for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
170
+ torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
171
+
172
+ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
173
+ # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
174
+ for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
175
+ torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
176
+ ```
177
+
178
+ **Start web demo**
179
+
180
+ You can use our web demo page to get familiar with CosyVoice quickly.
181
+
182
+ Please see the demo website for details.
183
+
184
+ ``` python
185
+ # change iic/CosyVoice-300M-SFT for sft inference, or iic/CosyVoice-300M-Instruct for instruct inference
186
+ python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M
187
+ ```
188
+
189
+ **Advanced Usage**
190
+
191
+ For advanced users, training and inference scripts are provided in `examples/libritts/cosyvoice/run.sh`.
192
+
193
+ **Build for deployment**
194
+
195
+ Optionally, if you want service deployment,
196
+ you can run the following steps.
197
+
198
+ ``` sh
199
+ cd runtime/python
200
+ docker build -t cosyvoice:v1.0 .
201
+ # change iic/CosyVoice-300M to iic/CosyVoice-300M-Instruct if you want to use instruct inference
202
+ # for grpc usage
203
+ docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/grpc && python3 server.py --port 50000 --max_conc 4 --model_dir iic/CosyVoice-300M && sleep infinity"
204
+ cd grpc && python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
205
+ # for fastapi usage
206
+ docker run -d --runtime=nvidia -p 50000:50000 cosyvoice:v1.0 /bin/bash -c "cd /opt/CosyVoice/CosyVoice/runtime/python/fastapi && python3 server.py --port 50000 --model_dir iic/CosyVoice-300M && sleep infinity"
207
+ cd fastapi && python3 client.py --port 50000 --mode <sft|zero_shot|cross_lingual|instruct>
208
+ ```
209
+
210
+ ## Discussion & Communication
211
+
212
+ You can directly discuss on [Github Issues](https://github.com/FunAudioLLM/CosyVoice/issues).
213
+
214
+ You can also scan the QR code to join our official Dingding chat group.
215
+
216
+ <img src="./asset/dingding.png" width="250px">
217
+
218
+ ## Acknowledge
219
+
220
+ 1. We borrowed a lot of code from [FunASR](https://github.com/modelscope/FunASR).
221
+ 2. We borrowed a lot of code from [FunCodec](https://github.com/modelscope/FunCodec).
222
+ 3. We borrowed a lot of code from [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
223
+ 4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec).
224
+ 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
225
+
226
+ ## Disclaimer
227
+ The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/asset/dingding.png ADDED
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/config.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "architectures": [
3
+ "CosyVoice2Model"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 896,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 4864,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 24,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 14,
16
+ "num_hidden_layers": 24,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_theta": 1000000.0,
20
+ "sliding_window": 32768,
21
+ "tie_word_embeddings": false,
22
+ "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.40.1",
24
+ "use_cache": true,
25
+ "use_sliding_window": false,
26
+ "vocab_size": 151936
27
+ }
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework":"Pytorch","task":"text-to-speech"}
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/cosyvoice2.yaml ADDED
@@ -0,0 +1,233 @@
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1986]
3
+ __set_seed2: !apply:numpy.random.seed [1986]
4
+ __set_seed3: !apply:torch.manual_seed [1986]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
+
7
+ # fixed params
8
+ sample_rate: 24000
9
+ llm_input_size: 896
10
+ llm_output_size: 896
11
+ spk_embed_dim: 192
12
+ qwen_pretrain_path: ''
13
+ token_frame_rate: 25
14
+ token_mel_ratio: 2
15
+
16
+ # stream related params
17
+ chunk_size: 25 # streaming inference chunk size, in token
18
+ num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
19
+
20
+ # model params
21
+ # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
22
+ # for system/third_party class/function, we do not require this.
23
+ llm: !new:cosyvoice.llm.llm.Qwen2LM
24
+ llm_input_size: !ref <llm_input_size>
25
+ llm_output_size: !ref <llm_output_size>
26
+ speech_token_size: 6561
27
+ length_normalized_loss: True
28
+ lsm_weight: 0
29
+ mix_ratio: [5, 15]
30
+ llm: !new:cosyvoice.llm.llm.Qwen2Encoder
31
+ pretrain_path: !ref <qwen_pretrain_path>
32
+ sampling: !name:cosyvoice.utils.common.ras_sampling
33
+ top_p: 0.8
34
+ top_k: 25
35
+ win_size: 10
36
+ tau_r: 0.1
37
+
38
+ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
39
+ input_size: 512
40
+ output_size: 80
41
+ spk_embed_dim: !ref <spk_embed_dim>
42
+ output_type: 'mel'
43
+ vocab_size: 6561
44
+ input_frame_rate: !ref <token_frame_rate>
45
+ only_mask_loss: True
46
+ token_mel_ratio: !ref <token_mel_ratio>
47
+ pre_lookahead_len: 3
48
+ encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
49
+ output_size: 512
50
+ attention_heads: 8
51
+ linear_units: 2048
52
+ num_blocks: 6
53
+ dropout_rate: 0.1
54
+ positional_dropout_rate: 0.1
55
+ attention_dropout_rate: 0.1
56
+ normalize_before: True
57
+ input_layer: 'linear'
58
+ pos_enc_layer_type: 'rel_pos_espnet'
59
+ selfattention_layer_type: 'rel_selfattn'
60
+ input_size: 512
61
+ use_cnn_module: False
62
+ macaron_style: False
63
+ static_chunk_size: !ref <chunk_size>
64
+ decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
65
+ in_channels: 240
66
+ n_spks: 1
67
+ spk_emb_dim: 80
68
+ cfm_params: !new:omegaconf.DictConfig
69
+ content:
70
+ sigma_min: 1e-06
71
+ solver: 'euler'
72
+ t_scheduler: 'cosine'
73
+ training_cfg_rate: 0.2
74
+ inference_cfg_rate: 0.7
75
+ reg_loss_type: 'l1'
76
+ estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
77
+ in_channels: 320
78
+ out_channels: 80
79
+ channels: [256]
80
+ dropout: 0.0
81
+ attention_head_dim: 64
82
+ n_blocks: 4
83
+ num_mid_blocks: 12
84
+ num_heads: 8
85
+ act_fn: 'gelu'
86
+ static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
87
+ num_decoding_left_chunks: !ref <num_decoding_left_chunks>
88
+
89
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
90
+ in_channels: 80
91
+ base_channels: 512
92
+ nb_harmonics: 8
93
+ sampling_rate: !ref <sample_rate>
94
+ nsf_alpha: 0.1
95
+ nsf_sigma: 0.003
96
+ nsf_voiced_threshold: 10
97
+ upsample_rates: [8, 5, 3]
98
+ upsample_kernel_sizes: [16, 11, 7]
99
+ istft_params:
100
+ n_fft: 16
101
+ hop_len: 4
102
+ resblock_kernel_sizes: [3, 7, 11]
103
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
104
+ source_resblock_kernel_sizes: [7, 7, 11]
105
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
106
+ lrelu_slope: 0.1
107
+ audio_limit: 0.99
108
+ f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
109
+ num_class: 1
110
+ in_channels: 80
111
+ cond_channels: 512
112
+
113
+ # gan related module
114
+ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
115
+ n_fft: 1920
116
+ num_mels: 80
117
+ sampling_rate: !ref <sample_rate>
118
+ hop_size: 480
119
+ win_size: 1920
120
+ fmin: 0
121
+ fmax: null
122
+ center: False
123
+ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
124
+ generator: !ref <hift>
125
+ discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
126
+ mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
127
+ mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
128
+ mel_spec_transform: [
129
+ !ref <mel_spec_transform1>
130
+ ]
131
+
132
+ # processor functions
133
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
134
+ get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
135
+ token_path: !ref <qwen_pretrain_path>
136
+ skip_special_tokens: True
137
+ allowed_special: 'all'
138
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
139
+ get_tokenizer: !ref <get_tokenizer>
140
+ allowed_special: !ref <allowed_special>
141
+ filter: !name:cosyvoice.dataset.processor.filter
142
+ max_length: 40960
143
+ min_length: 100
144
+ token_max_length: 200
145
+ token_min_length: 1
146
+ resample: !name:cosyvoice.dataset.processor.resample
147
+ resample_rate: !ref <sample_rate>
148
+ truncate: !name:cosyvoice.dataset.processor.truncate
149
+ truncate_length: 24480 # must be a multiplier of hop_size
150
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
151
+ n_fft: 1920
152
+ num_mels: 80
153
+ sampling_rate: !ref <sample_rate>
154
+ hop_size: 480
155
+ win_size: 1920
156
+ fmin: 0
157
+ fmax: 8000
158
+ center: False
159
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
160
+ feat_extractor: !ref <feat_extractor>
161
+ compute_f0: !name:cosyvoice.dataset.processor.compute_f0
162
+ sample_rate: !ref <sample_rate>
163
+ hop_size: 480
164
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
165
+ normalize: True
166
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
167
+ shuffle_size: 1000
168
+ sort: !name:cosyvoice.dataset.processor.sort
169
+ sort_size: 500 # sort_size should be less than shuffle_size
170
+ batch: !name:cosyvoice.dataset.processor.batch
171
+ batch_type: 'dynamic'
172
+ max_frames_in_batch: 2000
173
+ padding: !name:cosyvoice.dataset.processor.padding
174
+ use_spk_embedding: False # change to True during sft
175
+
176
+
177
+ # dataset processor pipeline
178
+ data_pipeline: [
179
+ !ref <parquet_opener>,
180
+ !ref <tokenize>,
181
+ !ref <filter>,
182
+ !ref <resample>,
183
+ !ref <compute_fbank>,
184
+ !ref <parse_embedding>,
185
+ !ref <shuffle>,
186
+ !ref <sort>,
187
+ !ref <batch>,
188
+ !ref <padding>,
189
+ ]
190
+ data_pipeline_gan: [
191
+ !ref <parquet_opener>,
192
+ !ref <tokenize>,
193
+ !ref <filter>,
194
+ !ref <resample>,
195
+ !ref <truncate>,
196
+ !ref <compute_fbank>,
197
+ !ref <compute_f0>,
198
+ !ref <parse_embedding>,
199
+ !ref <shuffle>,
200
+ !ref <sort>,
201
+ !ref <batch>,
202
+ !ref <padding>,
203
+ ]
204
+
205
+ # llm flow train conf
206
+ train_conf:
207
+ optim: adam
208
+ optim_conf:
209
+ lr: 1e-5 # change to 1e-5 during sft
210
+ scheduler: constantlr # change to constantlr during sft
211
+ scheduler_conf:
212
+ warmup_steps: 2500
213
+ max_epoch: 200
214
+ grad_clip: 5
215
+ accum_grad: 2
216
+ log_interval: 100
217
+ save_per_step: -1
218
+
219
+ # gan train conf
220
+ train_conf_gan:
221
+ optim: adam
222
+ optim_conf:
223
+ lr: 0.0002 # use small lr for gan training
224
+ scheduler: constantlr
225
+ optim_d: adam
226
+ optim_conf_d:
227
+ lr: 0.0002 # use small lr for gan training
228
+ scheduler_d: constantlr
229
+ max_epoch: 200
230
+ grad_clip: 5
231
+ accum_grad: 1 # in gan training, accum_grad must be 1
232
+ log_interval: 100
233
+ save_per_step: -1
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.cache.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ebde248652c6eed855e08bb6a263af3847039a1361f8019bdb27f5f680a1dc4
3
+ size 450496991
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.decoder.estimator.fp32.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e5b37b9c065b41d88d04678300f788b30bf621ead115d5a6c2bd7f05cd7a99e
3
+ size 286644900
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.encoder.fp16.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dab6eeb31aeaf88b443a0fb44ee75b74a0937c32f4bf64ee3a2830dfc5fbf507
3
+ size 185950580
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.encoder.fp32.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:541f5bb298ac03c3b37b2ec54389b07350d148ffd2f94b490a9ce4de6e31f66f
3
+ size 330818868
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/flow.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff4c2f867674411e0a08cee702996df13fa67c1cd864c06108da88d16d088541
3
+ size 450575567
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/hift.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
3
+ size 83390254
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/speech_tokenizer_v2.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71
3
+ size 496082973
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
MiniCPM-o-4.5-nvidia-FlagOS/assets/CosyVoice2-0.5B/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/flow.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15ccff24256ff61537c7f8b51e025116b83405f3fb017b54b008fc97da115446
3
+ size 623466603
MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/flow.yaml ADDED
@@ -0,0 +1,34 @@
1
+ flow: !new:cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
2
+ input_size: 512
3
+ output_size: 80
4
+ spk_embed_dim: 192
5
+ output_type: 'mel'
6
+ vocab_size: 6561
7
+ encoder: !new:cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
8
+ input_size: 512
9
+ output_size: 512
10
+ input_layer: 'linear'
11
+ pre_lookahead_len: 3
12
+ num_blocks: 6
13
+ num_up_blocks: 4
14
+ up_stride: 2
15
+ up_scale_factor: 2
16
+ attention_heads: 8
17
+ pos_enc_layer_type: 'rel_pos_espnet'
18
+ selfattention_layer_type: 'rel_selfattn'
19
+ key_bias: true
20
+ linear_units: 2048
21
+ dropout_rate: 0.1
22
+ positional_dropout_rate: 0.1
23
+ attention_dropout_rate: 0.1
24
+ normalize_before: True
25
+ decoder: !new:cosyvoice2.flow.flow_matching.CausalConditionalCFM
26
+ inference_cfg_rate: 0.7
27
+ estimator: !new:cosyvoice2.flow.decoder_dit.DiT
28
+ in_channels: 320
29
+ out_channels: 80
30
+ mlp_ratio: 4.0
31
+ depth: 16
32
+ num_heads: 8
33
+ head_dim: 64
34
+ hidden_size: 512
MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/hift.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
3
+ size 83390254
MiniCPM-o-4.5-nvidia-FlagOS/assets/token2wav/speech_tokenizer_v2_25hz.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71
3
+ size 496082973
MiniCPM-o-4.5-nvidia-FlagOS/audio_utils.py ADDED
@@ -0,0 +1,236 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from typing import List
5
+ from typing import Optional
6
+ from typing import Tuple
7
+ from typing import Union
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+
13
+ def chunk_audio(audio: np.ndarray, max_duration_seconds: int = 30, sample_rate: int = 16000) -> List[np.ndarray]:
14
+ """split long audio into chunks
15
+
16
+ Args:
17
+ audio:
18
+ max_duration_seconds:
19
+ sample_rate:
20
+
21
+ Returns:
22
+ chunks
23
+ """
24
+ max_len = int(max_duration_seconds * sample_rate)
25
+
26
+ if len(audio) <= max_len:
27
+ return [audio]
28
+
29
+ chunks = []
30
+ for i in range(0, len(audio), max_len):
31
+ chunk = audio[i : i + max_len]
32
+ chunks.append(chunk)
33
+
34
+ return chunks
35
+
36
+
37
+ def process_audio_batch(
38
+ audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
39
+ feature_extractor,
40
+ sampling_rate: int = 16000,
41
+ max_duration_seconds: int = 30,
42
+ return_attention_mask: bool = True,
43
+ ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
44
+ """extract audio mel features
45
+
46
+ Args:
47
+ audios:
48
+ feature_extractor: WhisperFeatureExtractor
49
+ sampling_rate:
50
+ max_duration_seconds:
51
+ return_attention_mask:
52
+
53
+ Returns:
54
+ (audio_features, audio_feature_lens)
55
+ audio_features: [batch_size, n_mels, max_frames]
56
+ audio_feature_lens:
57
+ """
58
+ if isinstance(audios, np.ndarray):
59
+ audios_list = [[audios]]
60
+ elif len(audios) > 0 and isinstance(audios[0], np.ndarray):
61
+ audios_list = [audios]
62
+ else:
63
+ audios_list = audios
64
+
65
+ audio_features_all = []
66
+ audio_feature_lens_list = []
67
+
68
+ for batch_audios in audios_list:
69
+ batch_lens = []
70
+
71
+ for audio in batch_audios:
72
+ chunks = chunk_audio(audio, max_duration_seconds, sampling_rate)
73
+
74
+ for chunk in chunks:
75
+ audio_input = feature_extractor(
76
+ chunk,
77
+ sampling_rate=sampling_rate,
78
+ return_tensors="pt",
79
+ padding="max_length",
80
+ return_attention_mask=return_attention_mask,
81
+ )
82
+
83
+ audio_feature = audio_input["input_features"] # [1, 80, frames]
84
+
85
+ if return_attention_mask:
86
+ actual_len = audio_input["attention_mask"].sum(dim=1) # Tensor([frames])
87
+ audio_feature = audio_feature[:, :, : actual_len[0]]
88
+ batch_lens.append(actual_len[0])
89
+ else:
90
+ batch_lens.append(torch.tensor(audio_feature.shape[2]))
91
+
92
+ audio_features_all.append(audio_feature.squeeze(0)) # [80, frames]
93
+
94
+ if len(batch_lens) > 0:
95
+ audio_feature_lens_list.append(torch.hstack(batch_lens))
96
+ else:
97
+ audio_feature_lens_list.append(torch.tensor([]))
98
+
99
+ # pad to same length
100
+ if audio_features_all:
101
+ audio_features = torch.nn.utils.rnn.pad_sequence(
102
+ [feat.transpose(0, 1) for feat in audio_features_all], batch_first=True, padding_value=0.0
103
+ ).transpose(
104
+ 1, 2
105
+ ) # [batch, 80, max_frames]
106
+ else:
107
+ audio_features = torch.tensor([])
108
+
109
+ return audio_features, audio_feature_lens_list
110
+
111
+
112
+ def regroup_audio_features(
113
+ audio_features: torch.Tensor, audio_feature_lens: List[torch.Tensor], regroup_seconds: int, fps: int = 100
114
+ ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
115
+ """regroup audio features to fixed duration
116
+
117
+ Args:
118
+ audio_features: [batch, n_mels, frames]
119
+ audio_feature_lens: each batch's actual length
120
+ regroup_seconds: regroup duration (seconds)
121
+ fps: frames per second
122
+
123
+ Returns:
124
+ (regrouped_features, regrouped_lens)
125
+ """
126
+ # flatten to continuous frames sequence
127
+ all_lens = []
128
+ for lens in audio_feature_lens:
129
+ if isinstance(lens, torch.Tensor):
130
+ all_lens.extend(lens.tolist())
131
+ elif isinstance(lens, list):
132
+ all_lens.extend([int(x) for x in lens])
133
+
134
+ if len(all_lens) == 0:
135
+ return torch.tensor([]), []
136
+
137
+ # concatenate all valid features
138
+ flat_slices = [audio_features[i, :, :L] for i, L in enumerate(all_lens)] # [n_mels, L]
139
+
140
+ if len(flat_slices) == 1:
141
+ full_feat = flat_slices[0]
142
+ else:
143
+ full_feat = torch.cat(flat_slices, dim=1) # [n_mels, total_frames]
144
+
145
+ # split to fixed frames
146
+ frames_per_seg = int(regroup_seconds * fps)
147
+ segments = []
148
+
149
+ for start in range(0, full_feat.size(1), frames_per_seg):
150
+ seg = full_feat[:, start : start + frames_per_seg]
151
+ if seg.size(1) > 0:
152
+ segments.append(seg)
153
+
154
+ if len(segments) == 0:
155
+ return torch.tensor([]), []
156
+
157
+ # pad and convert to batch
158
+ seg_lens = [s.size(1) for s in segments]
159
+ segs_transposed = [s.transpose(0, 1) for s in segments]
160
+
161
+ padded = torch.nn.utils.rnn.pad_sequence(segs_transposed, batch_first=True, padding_value=0.0) # [N, max_T, n_mels]
162
+
163
+ padded = padded.transpose(1, 2) # [N, n_mels, max_T]
164
+ lens_tensor = torch.tensor(seg_lens, dtype=torch.int32, device=padded.device)
165
+
166
+ return padded, [lens_tensor]
167
+
168
+
169
+ def calculate_mel_frames(audio_samples: int, n_fft: int = 400, hop_length: int = 160) -> int:
170
+ """calculate mel frames
171
+
172
+ Args:
173
+ audio_samples:
174
+ n_fft:
175
+ hop_length:
176
+
177
+ Returns:
178
+ mel frames
179
+ """
180
+ if audio_samples < n_fft:
181
+ return 0
182
+ return 1 + (audio_samples - n_fft) // hop_length
183
+
184
+
185
+ def samples_to_ms(samples: int, sample_rate: int = 16000) -> float:
186
+ """convert samples to milliseconds
187
+
188
+ Args:
189
+ samples:
190
+ sample_rate:
191
+
192
+ Returns:
193
+ milliseconds
194
+ """
195
+ return samples / sample_rate * 1000
196
+
197
+
198
+ def ms_to_samples(ms: float, sample_rate: int = 16000) -> int:
199
+ """convert milliseconds to samples
200
+
201
+ Args:
202
+ ms: milliseconds
203
+ sample_rate:
204
+
205
+ Returns:
206
+ samples
207
+ """
208
+ return int(ms * sample_rate / 1000)
209
+
210
+
211
+ def validate_audio_format(audio: np.ndarray, expected_rate: int = 16000, max_duration: Optional[float] = None) -> bool:
212
+ """validate audio format
213
+
214
+ Args:
215
+ audio: audio data
216
+ expected_rate:
217
+ max_duration:
218
+
219
+ Returns:
220
+ True if the audio format is valid, otherwise False
221
+ """
222
+ if not isinstance(audio, np.ndarray):
223
+ return False
224
+
225
+ if audio.ndim != 1:
226
+ return False
227
+
228
+ if audio.dtype not in [np.float32, np.float64, np.int16, np.int32]:
229
+ return False
230
+
231
+ if max_duration is not None:
232
+ max_samples = int(max_duration * expected_rate)
233
+ if len(audio) > max_samples:
234
+ return False
235
+
236
+ return True
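A minimal usage sketch for the helpers above. The feature-extractor checkpoint follows the `openai/whisper-medium` reference in config.json; everything else (the dummy waveform and the shapes in the comments) is illustrative.

```python
import numpy as np
from transformers import WhisperFeatureExtractor

from audio_utils import chunk_audio, process_audio_batch

fe = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")
wav = np.random.randn(16000 * 45).astype(np.float32)  # 45 s of dummy 16 kHz audio

chunks = chunk_audio(wav, max_duration_seconds=30, sample_rate=16000)
print(len(chunks))                      # 2 chunks: 30 s + 15 s

feats, feat_lens = process_audio_batch(wav, fe, sampling_rate=16000)
print(feats.shape)                      # [2, 80, max_frames] mel features
print(feat_lens[0])                     # actual frame count per chunk
```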
MiniCPM-o-4.5-nvidia-FlagOS/chunk_prefill_generate.py ADDED
@@ -0,0 +1,509 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import logging
5
+ from dataclasses import dataclass
6
+ from typing import Optional
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ @dataclass
15
+ class GenerateChunkOutput:
16
+ chunk_token_ids: torch.Tensor
17
+ current_inputs_embeds: torch.Tensor
18
+ input_last_hidden_states: Optional[torch.Tensor] # for tts use_speaker_embedding
19
+ last_hidden_states: Optional[torch.Tensor] # for tts input feature (projector_semantic)
20
+ past_key_values: Optional[torch.Tensor]
21
+ finished: bool
22
+
23
+
24
+ class ChunkPrefillChunkGenerate:
25
+ def __init__(self, model, tokenizer, terminators):
26
+ self.tokenizer = tokenizer
27
+ self.model = model
28
+ self.terminators = terminators
29
+ self.terminators_ids = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
30
+ self.embedding_layer = self.model.get_input_embeddings()
31
+
32
+ self.forbidden_tokens = [
33
+ ":",
34
+ ":",
35
+ ";",
36
+ "#",
37
+ "“",
38
+ "”",
39
+ "‘",
40
+ "’",
41
+ "@",
42
+ "*",
43
+ "【",
44
+ "】",
45
+ "「",
46
+ "」",
47
+ "(",
48
+ ")",
49
+ "(",
50
+ ")",
51
+ "[",
52
+ "]",
53
+ "&",
54
+ "/",
55
+ "$",
56
+ ]
57
+
58
+ self.forbidden_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in self.forbidden_tokens]
59
+ bad_token_ids = getattr(tokenizer, "bad_token_ids", [])
60
+ if bad_token_ids:
61
+ self.forbidden_token_ids.extend(bad_token_ids)
62
+
63
+ @staticmethod
64
+ def prepare_generation_config(do_sample, max_new_tokens=50, min_new_tokens=0, **kwargs):
65
+ num_beams = kwargs.get("num_beams", 1)
66
+ generation_config = {
67
+ "num_beams": num_beams,
68
+ "top_p": 0.8,
69
+ "top_k": 100,
70
+ "temperature": 0.7,
71
+ "do_sample": True,
72
+ "repetition_penalty": 1.05,
73
+ }
74
+
75
+ if num_beams > 1:
76
+ generation_config.update({"num_beams": 3, "repetition_penalty": 1.2, "do_sample": False})
77
+ elif do_sample:
78
+ generation_config.update(
79
+ {
80
+ "top_p": 0.8,
81
+ "top_k": 100,
82
+ "temperature": 0.7,
83
+ "do_sample": True,
84
+ "repetition_penalty": 1.05,
85
+ }
86
+ )
87
+ else:
88
+ generation_config.update({"do_sample": False, "repetition_penalty": 1.05})
89
+
90
+ generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys())
91
+ generation_config["min_new_tokens"] = min_new_tokens
92
+ generation_config["max_new_tokens"] = max_new_tokens
93
+
94
+ return generation_config
95
+
96
+ @staticmethod
97
+ def _get_cache_length(past_key_values) -> int:
98
+ if past_key_values is None:
99
+ return 0
100
+
101
+ if hasattr(past_key_values, "get_seq_length"):
102
+ return past_key_values.get_seq_length()
103
+
104
+ if isinstance(past_key_values, (tuple, list)) and len(past_key_values) > 0:
105
+ first_layer = past_key_values[0]
106
+ if isinstance(first_layer, (tuple, list)) and len(first_layer) > 0:
107
+ return first_layer[0].shape[2]
108
+ return 0
109
+
110
+ def non_chunk_generate(
111
+ self,
112
+ input_ids=None,
113
+ inputs_embeds=None,
114
+ attention_mask=None,
115
+ max_new_tokens=30,
116
+ min_new_tokens=0,
117
+ do_sample=True,
118
+ **kwargs,
119
+ ):
120
+ assert (input_ids is not None and inputs_embeds is None) or (input_ids is None and inputs_embeds is not None)
121
+
122
+ generation_config = self.prepare_generation_config(
123
+ do_sample=do_sample, max_new_tokens=max_new_tokens, min_new_tokens=min_new_tokens, **kwargs
124
+ )
125
+
126
+ input_ids = input_ids.to(self.model.device) if input_ids is not None else None
127
+ inputs_embeds = inputs_embeds.to(self.model.device) if inputs_embeds is not None else None
128
+
129
+ if attention_mask is not None:
130
+ attention_mask = attention_mask.to(self.model.device)
131
+
132
+ model_inputs = {
133
+ "attention_mask": attention_mask,
134
+ "pad_token_id": self.tokenizer.eos_token_id,
135
+ "suppress_tokens": self.forbidden_token_ids,
136
+ "eos_token_id": self.terminators_ids,
137
+ "output_hidden_states": True,
138
+ "return_dict_in_generate": True,
139
+ }
140
+ if input_ids is not None:
141
+ model_inputs["input_ids"] = input_ids
142
+ if inputs_embeds is not None:
143
+ model_inputs["inputs_embeds"] = inputs_embeds
144
+
145
+ with torch.no_grad():
146
+ outputs = self.model.generate(**model_inputs, **generation_config)
147
+
148
+ return outputs
149
+
150
+ def chunk_generate(
151
+ self,
152
+ inputs_embeds: torch.Tensor,
153
+ past_key_values,
154
+ is_first_generate_chunk: bool,
155
+ chunk_size: int,
156
+ return_hidden_states: bool,
157
+ do_sample: bool,
158
+ temperature: float,
159
+ top_p: float,
160
+ top_k: int,
161
+ repetition_penalty: float = 1.05,
162
+ all_input_ids: Optional[torch.Tensor] = None,
163
+ ) -> GenerateChunkOutput:
164
+ finished = False
165
+ current_inputs_embeds = inputs_embeds.clone()
166
+ input_last_hidden_states = []
167
+ last_hidden_states = []
168
+ generated_tokens = []
169
+
170
+ for token_idx in range(chunk_size):
171
+ if is_first_generate_chunk and token_idx == 0:
172
+ # first generate chunk, prefill inputs_embeds
173
+ model_inputs = {
174
+ "inputs_embeds": current_inputs_embeds,
175
+ "past_key_values": past_key_values,
176
+ "use_cache": True,
177
+ "output_hidden_states": return_hidden_states,
178
+ }
179
+ else: # for all other cases: prefill the latest generated token
180
+ model_inputs = {
181
+ "inputs_embeds": current_inputs_embeds[:, -1:, :],
182
+ "past_key_values": past_key_values,
183
+ "use_cache": True,
184
+ "output_hidden_states": return_hidden_states,
185
+ }
186
+
187
+ with torch.no_grad():
188
+ outputs = self.model(**model_inputs)
189
+
190
+ # last token's logits
191
+ logits = outputs.logits[:, -1, :].to(copy=True, dtype=torch.float32, device=inputs_embeds.device)
192
+
193
+ # forbid specific tokens decoding = model.generate@suppress_tokens
194
+ if self.forbidden_token_ids:
195
+ logits[:, self.forbidden_token_ids] = float("-inf")
196
+
197
+ past_key_values = outputs.past_key_values
198
+
199
+ PENALTY_WINDOW_SIZE = 128
200
+
201
+ # apply repetition penalty
202
+ if repetition_penalty != 1.0:
203
+ # get token ids for repetition penalty
204
+ if all_input_ids is not None:
205
+ # use global input ids (including original input and generated part)
206
+ if len(generated_tokens) > 0:
207
+ generated_token_ids = torch.cat(generated_tokens, dim=1)
208
+ current_sequence = torch.cat(
209
+ [
210
+ all_input_ids[:, -PENALTY_WINDOW_SIZE:],
211
+ generated_token_ids,
212
+ ],
213
+ dim=1,
214
+ )
215
+ else:
216
+ current_sequence = all_input_ids[:, -PENALTY_WINDOW_SIZE:]
217
+ unique_token_ids = torch.unique(current_sequence.squeeze(0))
218
+ elif len(generated_tokens) > 0:
219
+ # revert to original logic: only use generated tokens
220
+ generated_token_ids = torch.cat(generated_tokens, dim=1).squeeze(0)
221
+ unique_token_ids = torch.unique(generated_token_ids)
222
+ else:
223
+ unique_token_ids = torch.tensor([], dtype=torch.long, device=logits.device)
224
+
225
+ # apply repetition penalty
226
+ for token_id in unique_token_ids:
227
+ if logits[0, token_id] > 0:
228
+ logits[0, token_id] = logits[0, token_id] / repetition_penalty
229
+ else:
230
+ logits[0, token_id] = logits[0, token_id] * repetition_penalty
231
+
232
+ # apply temperature
233
+ if temperature != 1.0:
234
+ logits = logits / temperature
235
+
236
+ if do_sample:
237
+ # Top-k filtering
238
+ if top_k > 0:
239
+ top_k_logits, top_k_indices = torch.topk(logits, min(top_k, logits.size(-1)))
240
+ logits_filtered = torch.full_like(logits, float("-inf"))
241
+ logits_filtered.scatter_(1, top_k_indices, top_k_logits)
242
+ logits = logits_filtered
243
+
244
+ # Top-p filtering
245
+ if top_p < 1.0:
246
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
247
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
248
+
249
+ # remove tokens with cumulative probability greater than top_p
250
+ sorted_indices_to_remove = cumulative_probs > top_p
251
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
252
+ sorted_indices_to_remove[..., 0] = 0
253
+
254
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
255
+ logits[indices_to_remove] = float("-inf")
256
+
257
+ # sampling
258
+ probs = F.softmax(logits, dim=-1)
259
+ next_token = torch.multinomial(probs, num_samples=1)
260
+ else:
261
+ next_token = torch.argmax(logits, dim=-1, keepdim=True)
262
+
263
+ if return_hidden_states:
264
+ if is_first_generate_chunk and token_idx == 0:
265
+ input_last_hidden_states.append(outputs.hidden_states[-1])
266
+ else:
267
+ last_hidden_states.append(outputs.hidden_states[-1])
268
+
269
+ # if terminator token, stop generating
270
+ if next_token.item() in self.terminators_ids:
271
+ finished = True
272
+ break
273
+
274
+ generated_tokens.append(next_token)
275
+
276
+ # convert new token to embeddings and concatenate
277
+ next_token_embed = self.embedding_layer(next_token)
278
+
279
+ # update inputs_embeds, add one
280
+ current_inputs_embeds = torch.cat([current_inputs_embeds, next_token_embed], dim=1)
281
+
282
+ if len(generated_tokens) > 0:
283
+ chunk_token_ids = torch.cat(generated_tokens, dim=1)
284
+ else:
285
+ # special case: the first sampled token in this chunk was a terminator, so nothing new was generated; return an empty tensor of shape (1, 0)
286
+ if finished:
287
+ chunk_token_ids = torch.zeros((1, 0), dtype=torch.long, device=current_inputs_embeds.device)
288
+ else:
289
+ raise Exception("this should not happen")
290
+
291
+ if len(last_hidden_states) > 0:
292
+ last_hidden_states = torch.cat(last_hidden_states, dim=1)
293
+ else:
294
+ # no decoder hidden states were collected in this chunk (generation ended immediately or return_hidden_states is False)
295
+ if finished:
296
+ last_hidden_states = None
297
+ else:
298
+ raise Exception("this should not happen")
299
+
300
+ if len(input_last_hidden_states) > 0:
301
+ input_last_hidden_states = torch.cat(input_last_hidden_states, dim=1)
302
+ else:
303
+ input_last_hidden_states = None
304
+
305
+ return GenerateChunkOutput(
306
+ chunk_token_ids=chunk_token_ids,
307
+ current_inputs_embeds=current_inputs_embeds,
308
+ input_last_hidden_states=input_last_hidden_states,
309
+ last_hidden_states=last_hidden_states,
310
+ past_key_values=past_key_values,
311
+ finished=finished,
312
+ )
313
+
314
+ def chunk_generate_hf(
315
+ self,
316
+ inputs_embeds,
317
+ past_key_values,
318
+ is_first_generate_chunk,
319
+ chunk_size=30,
320
+ return_hidden_states=True,
321
+ do_sample=False,
322
+ **kwargs,
323
+ ) -> GenerateChunkOutput:
324
+ if not do_sample and kwargs.get("num_beams", None) is not None and kwargs.get("num_beams", None) > 1:
325
+ logger.warning("chunk generate does not support beam search, fail to greedy search")
326
+ kwargs["num_beams"] = 1
327
+
328
+ finished = False
329
+ current_inputs_embeds = inputs_embeds.clone()
330
+ input_last_hidden_states = None
331
+ last_hidden_states_list = []
332
+ generated_tokens = []
333
+
334
+ cache_length = self._get_cache_length(past_key_values)
335
+
336
+ for token_idx in range(chunk_size):
337
+ if is_first_generate_chunk and token_idx == 0:
338
+ gen_inputs_embeds = current_inputs_embeds
339
+ input_seq_len = current_inputs_embeds.shape[1]
340
+ else:
341
+ gen_inputs_embeds = current_inputs_embeds[:, -1:, :]
342
+ input_seq_len = 1
343
+
344
+ # construct attention_mask and cache_position
345
+ total_length = cache_length + input_seq_len
346
+ attention_mask = torch.ones((1, total_length), dtype=torch.long, device=self.model.device)
347
+ cache_position = torch.arange(cache_length, total_length, dtype=torch.long, device=self.model.device)
348
+
349
+ gen_config = self.prepare_generation_config(do_sample=do_sample, max_new_tokens=1, **kwargs)
350
+ outputs = self.model.generate(
351
+ inputs_embeds=gen_inputs_embeds,
352
+ past_key_values=past_key_values,
353
+ attention_mask=attention_mask,
354
+ cache_position=cache_position,
355
+ use_cache=True,
356
+ pad_token_id=self.tokenizer.eos_token_id,
357
+ suppress_tokens=self.forbidden_token_ids,
358
+ eos_token_id=self.terminators_ids,
359
+ output_hidden_states=return_hidden_states,
360
+ return_dict_in_generate=True,
361
+ **gen_config,
362
+ )
363
+
364
+ next_token = outputs.sequences[:, -1:]
365
+
366
+ # update past_key_values and cache_length
367
+ past_key_values = outputs.past_key_values
368
+ cache_length = self._get_cache_length(past_key_values)
369
+
370
+ # get hidden states
371
+ if return_hidden_states and hasattr(outputs, "hidden_states") and outputs.hidden_states is not None:
372
+ if is_first_generate_chunk and token_idx == 0:
373
+ if len(outputs.hidden_states) > 0:
374
+ input_last_hidden_states = outputs.hidden_states[0][-1]
375
+ if len(outputs.hidden_states) > 1:
376
+ last_hidden_states_list.append(outputs.hidden_states[1][-1])
377
+ else:
378
+ if len(outputs.hidden_states) > 0:
379
+ if len(outputs.hidden_states) > 1:
380
+ last_hidden_states_list.append(outputs.hidden_states[1][-1])
381
+ else:
382
+ last_hidden_states_list.append(outputs.hidden_states[0][-1])
383
+
384
+ if next_token.item() in self.terminators_ids:
385
+ finished = True
386
+ break
387
+
388
+ generated_tokens.append(next_token)
389
+
390
+ next_token_embed = self.embedding_layer(next_token)
391
+ current_inputs_embeds = torch.cat([current_inputs_embeds, next_token_embed], dim=1)
392
+
393
+ if len(generated_tokens) > 0:
394
+ chunk_token_ids = torch.cat(generated_tokens, dim=1)
395
+ else:
396
+ chunk_token_ids = torch.zeros((1, 0), dtype=torch.long, device=self.model.device)
397
+
398
+ if len(last_hidden_states_list) > 0:
399
+ last_hidden_states = torch.cat(last_hidden_states_list, dim=1)
400
+ else:
401
+ hidden_dim = self.model.config.hidden_size
402
+ last_hidden_states = torch.empty((1, 0, hidden_dim), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
403
+
404
+ return GenerateChunkOutput(
405
+ chunk_token_ids=chunk_token_ids,
406
+ current_inputs_embeds=current_inputs_embeds,
407
+ input_last_hidden_states=input_last_hidden_states,
408
+ last_hidden_states=last_hidden_states,
409
+ past_key_values=past_key_values,
410
+ finished=finished,
411
+ )
412
+
413
+ def chunk_prefill_and_generate(
414
+ self,
415
+ inputs_embeds,
416
+ prefill_chunk_size=5,
417
+ generate_chunk_size=10,
418
+ return_hidden_states=True,
419
+ max_new_tokens=30,
420
+ min_new_tokens=0,
421
+ do_sample=True,
422
+ chunk_fn="chunk_generate",
423
+ **kwargs,
424
+ ):
425
+ assert inputs_embeds is not None
426
+ generation_config = self.prepare_generation_config(
427
+ do_sample=do_sample, max_new_tokens=max_new_tokens, min_new_tokens=min_new_tokens, **kwargs
428
+ )
429
+ print(f"chunk_prefill_and_generate - generation config: {generation_config}")
430
+
431
+ inputs_embeds = inputs_embeds.to(self.model.device)
432
+ bs, seq_len = inputs_embeds.shape[:2]
433
+ assert bs == 1, "batch should be 1"
434
+
435
+ past_key_values = None
436
+ with torch.no_grad():
437
+ last_prefill_chunk_embeds = None
438
+ # prefill
439
+ for start_idx in range(0, seq_len, prefill_chunk_size):
440
+ end_idx = min(start_idx + prefill_chunk_size, seq_len)
441
+
442
+ chunk_embeds = inputs_embeds[:, start_idx:end_idx, :]
443
+ is_last_prefill_chunk = end_idx == seq_len
444
+
445
+ if not is_last_prefill_chunk:
446
+ model_inputs = {
447
+ "inputs_embeds": chunk_embeds,
448
+ "past_key_values": past_key_values,
449
+ "use_cache": True,
450
+ "output_hidden_states": return_hidden_states,
451
+ }
452
+
453
+ outputs = self.model(**model_inputs)
454
+ past_key_values = outputs.past_key_values
455
+ else:
456
+ last_prefill_chunk_embeds = chunk_embeds
457
+ break
458
+
459
+ # decode
460
+ if last_prefill_chunk_embeds is None:
461
+ raise ValueError("last prefill chunk not found")
462
+
463
+ generation_inputs_embeds = last_prefill_chunk_embeds.clone()
464
+ generated_ids = torch.empty((bs, 0), dtype=torch.long, device=self.model.device)
465
+ all_hidden_states = []
466
+
467
+ num_chunks_decode = (max_new_tokens + generate_chunk_size - 1) // generate_chunk_size
468
+ for chunk_idx in range(num_chunks_decode):
469
+ is_first_generate_chunk = chunk_idx == 0
470
+
471
+ if chunk_fn == "chunk_generate":
472
+ output = self.chunk_generate(
473
+ inputs_embeds=generation_inputs_embeds,
474
+ past_key_values=past_key_values,
475
+ is_first_generate_chunk=is_first_generate_chunk,
476
+ chunk_size=generate_chunk_size + 1 * is_first_generate_chunk,
477
+ return_hidden_states=return_hidden_states,
478
+ do_sample=do_sample,
479
+ temperature=generation_config.get("temperature", 0.7),
480
+ top_p=generation_config.get("top_p", 0.8),
481
+ top_k=generation_config.get("top_k", 20),
482
+ repetition_penalty=generation_config.get("repetition_penalty", 1.05),
483
+ all_input_ids=None,
484
+ )
485
+ elif chunk_fn == "chunk_generate_hf":
486
+ output = self.chunk_generate_hf(
487
+ inputs_embeds=generation_inputs_embeds,
488
+ past_key_values=past_key_values,
489
+ is_first_generate_chunk=is_first_generate_chunk,
490
+ chunk_size=generate_chunk_size + 1 * is_first_generate_chunk,
491
+ return_hidden_states=return_hidden_states,
492
+ min_new_tokens=min_new_tokens,
493
+ do_sample=do_sample,
494
+ **kwargs,
495
+ )
496
+ else:
497
+ raise NotImplementedError(f"not supported chunk_fn: {chunk_fn}")
498
+
499
+ generated_ids = torch.cat([generated_ids, output.chunk_token_ids], dim=1)
500
+ generation_inputs_embeds = output.current_inputs_embeds
501
+ past_key_values = output.past_key_values
502
+
503
+ if return_hidden_states and output.last_hidden_states is not None:
504
+ all_hidden_states.append(output.last_hidden_states)
505
+
506
+ if output.finished:
507
+ break
508
+
509
+ return generated_ids, all_hidden_states
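A minimal sketch of driving `ChunkPrefillChunkGenerate` end to end. The checkpoint path, dtype, and terminator token are assumptions (the `<|im_end|>` terminator matches the ChatML tokens used elsewhere in this repo).

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from chunk_prefill_generate import ChunkPrefillChunkGenerate

model_dir = "path/to/causal-lm"  # hypothetical local checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16, device_map="auto")

gen = ChunkPrefillChunkGenerate(model, tokenizer, terminators=["<|im_end|>"])

input_ids = tokenizer("Describe the weather today.", return_tensors="pt").input_ids.to(model.device)
inputs_embeds = model.get_input_embeddings()(input_ids)

token_ids, hidden_states = gen.chunk_prefill_and_generate(
    inputs_embeds,
    prefill_chunk_size=128,   # prefill the prompt in 128-token chunks
    generate_chunk_size=10,   # then decode 10 tokens per chunk
    max_new_tokens=64,
    do_sample=True,
)
print(tokenizer.decode(token_ids[0], skip_special_tokens=True))
```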
MiniCPM-o-4.5-nvidia-FlagOS/config.json ADDED
@@ -0,0 +1,285 @@
1
+ {
2
+ "architectures": [
3
+ "MiniCPMO"
4
+ ],
5
+ "version": "4.5",
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "audio_chunk_length": 1.0,
9
+ "audio_config": {
10
+ "_attn_implementation_autoset": true,
11
+ "_name_or_path": "openai/whisper-medium",
12
+ "activation_dropout": 0.0,
13
+ "activation_function": "gelu",
14
+ "apply_spec_augment": false,
15
+ "architectures": [
16
+ "MiniCPMWhisperEncoder"
17
+ ],
18
+ "attention_dropout": 0.0,
19
+ "begin_suppress_tokens": [
20
+ 220,
21
+ 50257
22
+ ],
23
+ "bos_token_id": 50257,
24
+ "classifier_proj_size": 256,
25
+ "d_model": 1024,
26
+ "decoder_attention_heads": 16,
27
+ "decoder_ffn_dim": 4096,
28
+ "decoder_layerdrop": 0.0,
29
+ "decoder_layers": 24,
30
+ "decoder_start_token_id": 50258,
31
+ "dropout": 0.0,
32
+ "encoder_attention_heads": 16,
33
+ "encoder_ffn_dim": 4096,
34
+ "encoder_layerdrop": 0.0,
35
+ "encoder_layers": 24,
36
+ "eos_token_id": 50257,
37
+ "forced_decoder_ids": [
38
+ [
39
+ 1,
40
+ 50259
41
+ ],
42
+ [
43
+ 2,
44
+ 50359
45
+ ],
46
+ [
47
+ 3,
48
+ 50363
49
+ ]
50
+ ],
51
+ "init_std": 0.02,
52
+ "mask_feature_length": 10,
53
+ "mask_feature_min_masks": 0,
54
+ "mask_feature_prob": 0.0,
55
+ "mask_time_length": 10,
56
+ "mask_time_min_masks": 2,
57
+ "mask_time_prob": 0.05,
58
+ "max_length": 448,
59
+ "max_source_positions": 1500,
60
+ "max_target_positions": 448,
61
+ "median_filter_width": 7,
62
+ "model_type": "whisper",
63
+ "num_hidden_layers": 24,
64
+ "num_mel_bins": 80,
65
+ "pad_token_id": 50257,
66
+ "scale_embedding": false,
67
+ "suppress_tokens": [
68
+ 1,
69
+ 2,
70
+ 7,
71
+ 8,
72
+ 9,
73
+ 10,
74
+ 14,
75
+ 25,
76
+ 26,
77
+ 27,
78
+ 28,
79
+ 29,
80
+ 31,
81
+ 58,
82
+ 59,
83
+ 60,
84
+ 61,
85
+ 62,
86
+ 63,
87
+ 90,
88
+ 91,
89
+ 92,
90
+ 93,
91
+ 359,
92
+ 503,
93
+ 522,
94
+ 542,
95
+ 873,
96
+ 893,
97
+ 902,
98
+ 918,
99
+ 922,
100
+ 931,
101
+ 1350,
102
+ 1853,
103
+ 1982,
104
+ 2460,
105
+ 2627,
106
+ 3246,
107
+ 3253,
108
+ 3268,
109
+ 3536,
110
+ 3846,
111
+ 3961,
112
+ 4183,
113
+ 4667,
114
+ 6585,
115
+ 6647,
116
+ 7273,
117
+ 9061,
118
+ 9383,
119
+ 10428,
120
+ 10929,
121
+ 11938,
122
+ 12033,
123
+ 12331,
124
+ 12562,
125
+ 13793,
126
+ 14157,
127
+ 14635,
128
+ 15265,
129
+ 15618,
130
+ 16553,
131
+ 16604,
132
+ 18362,
133
+ 18956,
134
+ 20075,
135
+ 21675,
136
+ 22520,
137
+ 26130,
138
+ 26161,
139
+ 26435,
140
+ 28279,
141
+ 29464,
142
+ 31650,
143
+ 32302,
144
+ 32470,
145
+ 36865,
146
+ 42863,
147
+ 47425,
148
+ 49870,
149
+ 50254,
150
+ 50258,
151
+ 50358,
152
+ 50359,
153
+ 50360,
154
+ 50361,
155
+ 50362
156
+ ],
157
+ "torch_dtype": "float32",
158
+ "use_cache": true,
159
+ "use_weighted_layer_sum": false,
160
+ "vocab_size": 51865
161
+ },
162
+ "audio_pool_step": 5,
163
+ "auto_map": {
164
+ "AutoConfig": "configuration_minicpmo.MiniCPMOConfig",
165
+ "AutoModel": "modeling_minicpmo.MiniCPMO",
166
+ "AutoModelForCausalLM": "modeling_minicpmo.MiniCPMO"
167
+ },
168
+ "batch_vision_input": true,
169
+ "bos_token_id": 151643,
170
+ "drop_vision_last_layer": false,
171
+ "eos_token_id": 151645,
172
+ "head_dim": 128,
173
+ "hidden_act": "silu",
174
+ "hidden_size": 4096,
175
+ "image_size": 448,
176
+ "init_audio": true,
177
+ "init_tts": true,
178
+ "init_vision": true,
179
+ "initializer_range": 0.02,
180
+ "intermediate_size": 12288,
181
+ "listen_speak_type": "asr",
182
+ "max_position_embeddings": 40960,
183
+ "max_window_layers": 36,
184
+ "model_type": "minicpmo",
185
+ "num_attention_heads": 32,
186
+ "num_hidden_layers": 36,
187
+ "num_key_value_heads": 8,
188
+ "patch_size": 14,
189
+ "query_num": 64,
190
+ "rms_norm_eps": 1e-06,
191
+ "rope_scaling": null,
192
+ "rope_theta": 1000000,
193
+ "slice_config": {
194
+ "max_slice_nums": 1,
195
+ "model_type": "minicpmv",
196
+ "patch_size": 14,
197
+ "scale_resolution": 448
198
+ },
199
+ "slice_mode": true,
200
+ "sliding_window": null,
201
+ "stream_input": true,
202
+ "tie_word_embeddings": false,
203
+ "torch_dtype": "bfloat16",
204
+ "transformers_version": "4.51.0",
205
+ "tts_config": {
206
+ "_attn_implementation_autoset": true,
207
+ "attention_type": "full_attention",
208
+ "attn_implementation": "sdpa",
209
+ "audio_bos_token_id": 151687,
210
+ "audio_tokenizer_sample_rate": 16000,
211
+ "audio_tokenizer_type": "s3tokenizer",
212
+ "aug_layer_loss_weight": false,
213
+ "aug_loss_weight": false,
214
+ "backbone_model": "llama",
215
+ "condition_type": "hidden_text_merge",
216
+ "cosyvoice_config_path": null,
217
+ "cosyvoice_model_dir": null,
218
+ "filter_tts_loss": false,
219
+ "hidden_act": "silu",
220
+ "hidden_size": 768,
221
+ "interleaved": false,
222
+ "intermediate_size": 3072,
223
+ "llm_dim": 4096,
224
+ "llm_dim_model_base": 256,
225
+ "llm_down_scale": false,
226
+ "llm_hidden_size": 4096,
227
+ "llm_intermediate_size": 768,
228
+ "long_weight": 0.1,
229
+ "max_position_embeddings": 4096,
230
+ "model_type": "minicpmtts",
231
+ "normalize_projected_hidden": true,
232
+ "num_attention_heads": 12,
233
+ "num_audio_tokens": 6562,
234
+ "num_hidden_layers": 20,
235
+ "num_key_value_heads": 12,
236
+ "num_mel_bins": 100,
237
+ "num_text_tokens": 152064,
238
+ "num_vq": 1,
239
+ "projector_type": "mlp",
240
+ "recomputed_chunks": 1,
241
+ "s3_stream_chunk_size": 25,
242
+ "s3_stream_generate": false,
243
+ "s3_stream_n_timesteps": 10,
244
+ "s3_stream_prelook_size": 3,
245
+ "short_weight": 0.1,
246
+ "streaming": false,
247
+ "streaming_audio_chunk_size": 50,
248
+ "streaming_sliding_window": false,
249
+ "streaming_sliding_window_audio_frame_rate": 50,
250
+ "streaming_sliding_window_audio_init_text_length": 10,
251
+ "streaming_sliding_window_audio_window_size": 300,
252
+ "streaming_sliding_window_average_speed": 5,
253
+ "streaming_sliding_window_fast_speed": 7,
254
+ "streaming_sliding_window_max_text_len": 500,
255
+ "streaming_sliding_window_slow_speed": 3,
256
+ "streaming_sliding_window_text_window_size": 50,
257
+ "streaming_text_chunk_max": 7,
258
+ "streaming_text_chunk_min": 3,
259
+ "streaming_text_reserved_len": 300,
260
+ "text_eos_token_id": 151692,
261
+ "tts_filter_loss_fix": false,
262
+ "use_llm_hidden_state": false,
263
+ "use_text": true,
264
+ "window_size": 2
265
+ },
266
+ "use_cache": true,
267
+ "use_image_id": true,
268
+ "use_sliding_window": false,
269
+ "vision_batch_size": 16,
270
+ "vision_config": {
271
+ "_attn_implementation_autoset": true,
272
+ "attention_dropout": 0.0,
273
+ "hidden_act": "gelu_pytorch_tanh",
274
+ "hidden_size": 1152,
275
+ "image_size": 980,
276
+ "intermediate_size": 4304,
277
+ "layer_norm_eps": 1e-06,
278
+ "model_type": "siglip_vision_model",
279
+ "num_attention_heads": 16,
280
+ "num_channels": 3,
281
+ "num_hidden_layers": 27,
282
+ "patch_size": 14
283
+ },
284
+ "vocab_size": 151748
285
+ }
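Since config.json registers the custom classes via `auto_map`, the model is meant to be loaded with `trust_remote_code=True`. A minimal sketch; the local path is an assumption, and bfloat16 follows the `torch_dtype` entry above.

```python
import torch
from transformers import AutoModel, AutoTokenizer

path = "./MiniCPM-o-4.5-nvidia-FlagOS"  # local copy of this folder
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,      # resolves modeling_minicpmo.MiniCPMO via auto_map
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16"
)
model.eval()
```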
MiniCPM-o-4.5-nvidia-FlagOS/configuration_minicpmo.py ADDED
@@ -0,0 +1,221 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import json
5
+ import os
6
+ from typing import Union
7
+
8
+ from transformers import PretrainedConfig
9
+ from transformers import Qwen3Config
10
+ from transformers import WhisperConfig
11
+ from transformers.utils import logging
12
+
13
+ from .configuration_minicpmtts import MiniCPMTTSConfig
14
+ from .modeling_navit_siglip import SiglipVisionConfig
15
+
16
+ logger = logging.get_logger(__name__)
17
+
18
+
19
+ class MiniCPMVSliceConfig(PretrainedConfig):
20
+ model_type = "minicpmv"
21
+
22
+ def __init__(
23
+ self,
24
+ patch_size=14,
25
+ max_slice_nums=9,
26
+ scale_resolution=448,
27
+ **kwargs,
28
+ ):
29
+ super().__init__(**kwargs)
30
+ self.patch_size = patch_size
31
+ self.max_slice_nums = max_slice_nums
32
+ self.scale_resolution = scale_resolution
33
+
34
+ @classmethod
35
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
36
+ cls._set_token_in_kwargs(kwargs)
37
+
38
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
39
+
40
+ if config_dict.get("model_type") == "minicpmv":
41
+ config_dict = config_dict["slice_config"]
42
+
43
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
44
+ logger.warning(
45
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
46
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
47
+ )
48
+
49
+ return cls.from_dict(config_dict, **kwargs)
50
+
51
+
52
+ class MiniCPMODuplexConfig(PretrainedConfig):
53
+ """Configuration class for MiniCPMODuplex."""
54
+
55
+ model_type = "minicpmo_duplex"
56
+
57
+ def __init__(
58
+ self,
59
+ # duplex init params
60
+ generate_audio: bool = True,
61
+ ls_mode: str = "explicit",
62
+ # llm generation config
63
+ max_new_speak_tokens_per_chunk: int = 20,
64
+ text_repetition_penalty: float = 1.05,
65
+ temperature: float = 0.7,
66
+ top_k: int = 20,
67
+ top_p: float = 0.8,
68
+ text_repetition_window_size: int = 512,
69
+ listen_prob_scale: float = 1.0,
70
+ # tts generation config
71
+ tts_temperature: float = 0.8,
72
+ tts_repetition_penalty: float = 1.05,
73
+ # stream config
74
+ chunk_ms: int = 1000,
75
+ first_chunk_ms: int = 1035,
76
+ cnn_redundancy_ms: int = 20,
77
+ sample_rate: int = 16000,
78
+ # attn implementation
79
+ attn_implementation: str = "flash_attention_2",
80
+ # sliding window config
81
+ sliding_window_mode: str = "off", # "off" / "basic" / "context"
82
+ basic_window_high_tokens: int = 8000,
83
+ basic_window_low_tokens: int = 4000,
84
+ context_previous_max_tokens: int = 500,
85
+ context_max_units: int = 24,
86
+ **kwargs,
87
+ ):
88
+ super().__init__(**kwargs)
89
+ self.generate_audio = generate_audio
90
+ self.ls_mode = ls_mode
91
+ self.max_new_speak_tokens_per_chunk = max_new_speak_tokens_per_chunk
92
+ self.text_repetition_penalty = text_repetition_penalty
93
+ self.temperature = temperature
94
+ self.top_k = top_k
95
+ self.top_p = top_p
96
+ self.text_repetition_window_size = text_repetition_window_size
97
+ self.listen_prob_scale = listen_prob_scale
98
+ self.tts_temperature = tts_temperature
99
+ self.tts_repetition_penalty = tts_repetition_penalty
100
+ self.chunk_ms = chunk_ms
101
+ self.first_chunk_ms = first_chunk_ms
102
+ self.cnn_redundancy_ms = cnn_redundancy_ms
103
+ self.sample_rate = sample_rate
104
+ self.attn_implementation = attn_implementation
105
+ # sliding window
106
+ self.sliding_window_mode = sliding_window_mode
107
+ self.basic_window_high_tokens = basic_window_high_tokens
108
+ self.basic_window_low_tokens = basic_window_low_tokens
109
+ self.context_previous_max_tokens = context_previous_max_tokens
110
+ self.context_max_units = context_max_units
111
+
112
+ @classmethod
113
+ def from_pretrained(
114
+ cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
115
+ ) -> "MiniCPMODuplexConfig":
116
+ config_file = os.path.join(pretrained_model_name_or_path, "duplex_config.json")
117
+ if os.path.exists(config_file):
118
+ with open(config_file, "r", encoding="utf-8") as f:
119
+ config_dict = json.load(f)
120
+ # Override with any kwargs provided
121
+ config_dict.update(kwargs)
122
+ return cls(**config_dict)
123
+ else:
124
+ # Return default config if duplex_config.json doesn't exist
125
+ logger.info(
126
+ f"duplex_config.json not found at {pretrained_model_name_or_path}, using default MiniCPMODuplexConfig"
127
+ )
128
+ return cls(**kwargs)
129
+
130
+ def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
131
+ os.makedirs(save_directory, exist_ok=True)
132
+ config_file = os.path.join(save_directory, "duplex_config.json")
133
+ with open(config_file, "w", encoding="utf-8") as f:
134
+ json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
135
+ logger.info(f"Duplex configuration saved to {config_file}")
136
+
137
+
138
+ class MiniCPMOConfig(Qwen3Config):
139
+ model_type = "minicpmo"
140
+ keys_to_ignore_at_inference = ["past_key_values"]
141
+
142
+ default_vision_config = {
143
+ "hidden_size": 1152,
144
+ "image_size": 980,
145
+ "intermediate_size": 4304,
146
+ "model_type": "siglip",
147
+ "num_attention_heads": 16,
148
+ "num_hidden_layers": 27,
149
+ "patch_size": 14,
150
+ }
151
+
152
+ def __init__(
153
+ self,
154
+ use_cache=True,
155
+ query_num=64,
156
+ image_size=448,
157
+ drop_vision_last_layer=True,
158
+ batch_vision_input=True,
159
+ slice_config=None,
160
+ vision_config=None,
161
+ audio_config=None,
162
+ tts_config=None,
163
+ use_image_id=True,
164
+ vision_batch_size=16,
165
+ audio_pool_step=5,
166
+ audio_chunk_length=1.0,
167
+ stream_input=False,
168
+ listen_speak_type="asr",
169
+ init_vision=True,
170
+ init_audio=True,
171
+ init_tts=True,
172
+ **kwargs,
173
+ ):
174
+ self.use_cache = use_cache
175
+ self.query_num = query_num
176
+ self.image_size = image_size
177
+ self.drop_vision_last_layer = drop_vision_last_layer
178
+ self.batch_vision_input = batch_vision_input
179
+ self.use_image_id = use_image_id
180
+ self.vision_batch_size = vision_batch_size
181
+ self.audio_pool_step = audio_pool_step
182
+ self.audio_chunk_length = audio_chunk_length
183
+ self.stream_input = stream_input
184
+ self.listen_speak_type = listen_speak_type
185
+
186
+ self.init_vision = init_vision
187
+ self.init_audio = init_audio
188
+ self.init_tts = init_tts
189
+
190
+ if slice_config is None:
191
+ self.slice_config = MiniCPMVSliceConfig(max_slice_nums=1)
192
+ else:
193
+ self.slice_config = MiniCPMVSliceConfig(**slice_config)
194
+ self.slice_mode = True
195
+
196
+ # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
197
+ if vision_config is None:
198
+ self.vision_config = SiglipVisionConfig(**self.default_vision_config)
199
+ logger.info("vision_config is None, using default vision config")
200
+ elif isinstance(vision_config, dict):
201
+ self.vision_config = SiglipVisionConfig(**vision_config)
202
+ elif isinstance(vision_config, SiglipVisionConfig):
203
+ self.vision_config = vision_config
204
+
205
+ if audio_config is None:
206
+ self.audio_config = WhisperConfig()
207
+ elif isinstance(audio_config, dict):
208
+ self.audio_config = WhisperConfig(**audio_config)
209
+ elif isinstance(audio_config, WhisperConfig):
210
+ self.audio_config = audio_config
211
+
212
+ if tts_config is None:
213
+ self.tts_config = MiniCPMTTSConfig()
214
+ elif isinstance(tts_config, dict):
215
+ self.tts_config = MiniCPMTTSConfig(**tts_config)
216
+ elif isinstance(tts_config, MiniCPMTTSConfig):
217
+ self.tts_config = tts_config
218
+
219
+ self.patch_size = self.vision_config.patch_size
220
+
221
+ super().__init__(**kwargs)
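A minimal sketch of the duplex-config round trip implemented above; the directory name and override values are arbitrary.

```python
from configuration_minicpmo import MiniCPMODuplexConfig

cfg = MiniCPMODuplexConfig(chunk_ms=1000, sliding_window_mode="basic")
cfg.save_pretrained("./duplex_cfg")                 # writes ./duplex_cfg/duplex_config.json

reloaded = MiniCPMODuplexConfig.from_pretrained("./duplex_cfg", temperature=0.6)
print(reloaded.chunk_ms, reloaded.sliding_window_mode, reloaded.temperature)
# 1000 basic 0.6  (kwargs override values read from duplex_config.json)
```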