fix: update README examples broken by using token2wav in non-streaming TTS (close unbalanced `model.init_tts(` calls)
#8
by
airlsyn - opened
README.md
CHANGED
|
@@ -1054,8 +1054,8 @@ model = AutoModel.from_pretrained(
|
|
| 1054 |
)
|
| 1055 |
model.eval().cuda()
|
| 1056 |
|
| 1057 |
-
# Initialize TTS for audio output
|
| 1058 |
-
model.init_tts(
|
| 1059 |
|
| 1060 |
# Convert simplex model to duplex mode
|
| 1061 |
duplex_model = model.as_duplex()
|
|
@@ -1170,7 +1170,7 @@ We provide two inference modes: chat and streaming.
|
|
| 1170 |
from minicpmo.utils import get_video_frame_audio_segments
|
| 1171 |
|
| 1172 |
model = ...
|
| 1173 |
-
model.init_tts(
|
| 1174 |
|
| 1175 |
video_path = "assets/Skiing.mp4"
|
| 1176 |
|
|
@@ -1228,7 +1228,7 @@ import torch
|
|
| 1228 |
from minicpmo.utils import get_video_frame_audio_segments
|
| 1229 |
|
| 1230 |
model = ...
|
| 1231 |
-
model.init_tts(
|
| 1232 |
|
| 1233 |
# Reset session for a new conversation (clears KV cache)
|
| 1234 |
model.reset_session()
|
|
@@ -1344,7 +1344,7 @@ sys_msg = {
|
|
| 1344 |
# You can use each type of system prompt mentioned above in streaming speech conversation
|
| 1345 |
|
| 1346 |
# Reset state
|
| 1347 |
-
model.init_tts(
|
| 1348 |
model.reset_session(reset_token2wav_cache=True)
|
| 1349 |
model.init_token2wav_cache(prompt_speech_16k=ref_audio)
|
| 1350 |
|
|
@@ -1536,7 +1536,7 @@ sys_msg = {
|
|
| 1536 |
import librosa
|
| 1537 |
|
| 1538 |
model = ...
|
| 1539 |
-
model.init_tts(
|
| 1540 |
|
| 1541 |
# For both Chinese and English
|
| 1542 |
ref_audio_path = "assets/HT_ref_audio.wav"
|
|
@@ -1589,7 +1589,7 @@ The `Mimick` task evaluates a model's end-to-end speech modeling capability. The
|
|
| 1589 |
import librosa
|
| 1590 |
|
| 1591 |
model = ...
|
| 1592 |
-
model.init_tts(
|
| 1593 |
|
| 1594 |
system_prompt = "You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy."
|
| 1595 |
|
|
@@ -1635,7 +1635,7 @@ For audio-to-text tasks, you can use the following prompts:
|
|
| 1635 |
import librosa
|
| 1636 |
|
| 1637 |
model = ...
|
| 1638 |
-
model.init_tts(
|
| 1639 |
|
| 1640 |
# Load the audio to be transcribed/analyzed
|
| 1641 |
audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
|
|
|
|
| 1054 |
)
|
| 1055 |
model.eval().cuda()
|
| 1056 |
|
| 1057 |
+
# Initialize TTS for audio output
|
| 1058 |
+
model.init_tts()
|
| 1059 |
|
| 1060 |
# Convert simplex model to duplex mode
|
| 1061 |
duplex_model = model.as_duplex()
|
|
|
|
| 1170 |
from minicpmo.utils import get_video_frame_audio_segments
|
| 1171 |
|
| 1172 |
model = ...
|
| 1173 |
+
model.init_tts()
|
| 1174 |
|
| 1175 |
video_path = "assets/Skiing.mp4"
|
| 1176 |
|
|
|
|
| 1228 |
from minicpmo.utils import get_video_frame_audio_segments
|
| 1229 |
|
| 1230 |
model = ...
|
| 1231 |
+
model.init_tts()
|
| 1232 |
|
| 1233 |
# Reset session for a new conversation (clears KV cache)
|
| 1234 |
model.reset_session()
|
|
|
|
| 1344 |
# You can use each type of system prompt mentioned above in streaming speech conversation
|
| 1345 |
|
| 1346 |
# Reset state
|
| 1347 |
+
model.init_tts()
|
| 1348 |
model.reset_session(reset_token2wav_cache=True)
|
| 1349 |
model.init_token2wav_cache(prompt_speech_16k=ref_audio)
|
| 1350 |
|
|
|
|
| 1536 |
import librosa
|
| 1537 |
|
| 1538 |
model = ...
|
| 1539 |
+
model.init_tts()
|
| 1540 |
|
| 1541 |
# For both Chinese and English
|
| 1542 |
ref_audio_path = "assets/HT_ref_audio.wav"
|
|
|
|
| 1589 |
import librosa
|
| 1590 |
|
| 1591 |
model = ...
|
| 1592 |
+
model.init_tts()
|
| 1593 |
|
| 1594 |
system_prompt = "You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy."
|
| 1595 |
|
|
|
|
| 1635 |
import librosa
|
| 1636 |
|
| 1637 |
model = ...
|
| 1638 |
+
model.init_tts()
|
| 1639 |
|
| 1640 |
# Load the audio to be transcribed/analyzed
|
| 1641 |
audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
|