fix: update README examples broken by using token2wav in non-streaming TTS (close unbalanced `model.init_tts(` calls)
#8
by
airlsyn - opened
README.md
CHANGED
|
@@ -1054,8 +1054,8 @@ model = AutoModel.from_pretrained(
|
|
| 1054 |
)
|
| 1055 |
model.eval().cuda()
|
| 1056 |
|
| 1057 |
-
# Initialize TTS for audio output
|
| 1058 |
-
model.init_tts(
|
| 1059 |
|
| 1060 |
# Convert simplex model to duplex mode
|
| 1061 |
duplex_model = model.as_duplex()
|
|
@@ -1170,7 +1170,7 @@ We provide two inference modes: chat and streaming.
|
|
| 1170 |
from minicpmo.utils import get_video_frame_audio_segments
|
| 1171 |
|
| 1172 |
model = ...
|
| 1173 |
-
model.init_tts(
|
| 1174 |
|
| 1175 |
video_path = "assets/Skiing.mp4"
|
| 1176 |
|
|
@@ -1228,7 +1228,7 @@ import torch
|
|
| 1228 |
from minicpmo.utils import get_video_frame_audio_segments
|
| 1229 |
|
| 1230 |
model = ...
|
| 1231 |
-
model.init_tts(
|
| 1232 |
|
| 1233 |
# Reset session for a new conversation (clears KV cache)
|
| 1234 |
model.reset_session()
|
|
@@ -1344,7 +1344,7 @@ sys_msg = {
|
|
| 1344 |
# You can use each type of system prompt mentioned above in streaming speech conversation
|
| 1345 |
|
| 1346 |
# Reset state
|
| 1347 |
-
model.init_tts(
|
| 1348 |
model.reset_session(reset_token2wav_cache=True)
|
| 1349 |
model.init_token2wav_cache(prompt_speech_16k=ref_audio)
|
| 1350 |
|
|
@@ -1536,7 +1536,7 @@ sys_msg = {
|
|
| 1536 |
import librosa
|
| 1537 |
|
| 1538 |
model = ...
|
| 1539 |
-
model.init_tts(
|
| 1540 |
|
| 1541 |
# For both Chinese and English
|
| 1542 |
ref_audio_path = "assets/HT_ref_audio.wav"
|
|
@@ -1589,7 +1589,7 @@ The `Mimick` task evaluates a model's end-to-end speech modeling capability. The
|
|
| 1589 |
import librosa
|
| 1590 |
|
| 1591 |
model = ...
|
| 1592 |
-
model.init_tts(
|
| 1593 |
|
| 1594 |
system_prompt = "You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy."
|
| 1595 |
|
|
@@ -1635,7 +1635,7 @@ For audio-to-text tasks, you can use the following prompts:
|
|
| 1635 |
import librosa
|
| 1636 |
|
| 1637 |
model = ...
|
| 1638 |
-
model.init_tts(
|
| 1639 |
|
| 1640 |
# Load the audio to be transcribed/analyzed
|
| 1641 |
audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
|
|
|
|
| 1054 |
)
|
| 1055 |
model.eval().cuda()
|
| 1056 |
|
| 1057 |
+
# Initialize TTS for audio output
|
| 1058 |
+
model.init_tts()
|
| 1059 |
|
| 1060 |
# Convert simplex model to duplex mode
|
| 1061 |
duplex_model = model.as_duplex()
|
|
|
|
| 1170 |
from minicpmo.utils import get_video_frame_audio_segments
|
| 1171 |
|
| 1172 |
model = ...
|
| 1173 |
+
model.init_tts()
|
| 1174 |
|
| 1175 |
video_path = "assets/Skiing.mp4"
|
| 1176 |
|
|
|
|
| 1228 |
from minicpmo.utils import get_video_frame_audio_segments
|
| 1229 |
|
| 1230 |
model = ...
|
| 1231 |
+
model.init_tts()
|
| 1232 |
|
| 1233 |
# Reset session for a new conversation (clears KV cache)
|
| 1234 |
model.reset_session()
|
|
|
|
| 1344 |
# You can use each type of system prompt mentioned above in streaming speech conversation
|
| 1345 |
|
| 1346 |
# Reset state
|
| 1347 |
+
model.init_tts()
|
| 1348 |
model.reset_session(reset_token2wav_cache=True)
|
| 1349 |
model.init_token2wav_cache(prompt_speech_16k=ref_audio)
|
| 1350 |
|
|
|
|
| 1536 |
import librosa
|
| 1537 |
|
| 1538 |
model = ...
|
| 1539 |
+
model.init_tts()
|
| 1540 |
|
| 1541 |
# For both Chinese and English
|
| 1542 |
ref_audio_path = "assets/HT_ref_audio.wav"
|
|
|
|
| 1589 |
import librosa
|
| 1590 |
|
| 1591 |
model = ...
|
| 1592 |
+
model.init_tts()
|
| 1593 |
|
| 1594 |
system_prompt = "You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy."
|
| 1595 |
|
|
|
|
| 1635 |
import librosa
|
| 1636 |
|
| 1637 |
model = ...
|
| 1638 |
+
model.init_tts()
|
| 1639 |
|
| 1640 |
# Load the audio to be transcribed/analyzed
|
| 1641 |
audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
|