feat: update README to reflect token2wav usage in non-streaming TTS

#8
Files changed (1) hide show
  1. README.md +8 -8
README.md CHANGED
@@ -1054,8 +1054,8 @@ model = AutoModel.from_pretrained(
1054
  )
1055
  model.eval().cuda()
1056
 
1057
- # Initialize TTS for audio output in chat or streaming mode
1058
- model.init_tts(streaming=False) # or streaming=True
1059
 
1060
  # Convert simplex model to duplex mode
1061
  duplex_model = model.as_duplex()
@@ -1170,7 +1170,7 @@ We provide two inference modes: chat and streaming.
1170
  from minicpmo.utils import get_video_frame_audio_segments
1171
 
1172
  model = ...
1173
- model.init_tts(streaming=False)
1174
 
1175
  video_path = "assets/Skiing.mp4"
1176
 
@@ -1228,7 +1228,7 @@ import torch
1228
  from minicpmo.utils import get_video_frame_audio_segments
1229
 
1230
  model = ...
1231
- model.init_tts(streaming=True)
1232
 
1233
  # Reset session for a new conversation (clears KV cache)
1234
  model.reset_session()
@@ -1344,7 +1344,7 @@ sys_msg = {
1344
  # You can use each type of system prompt mentioned above in streaming speech conversation
1345
 
1346
  # Reset state
1347
- model.init_tts(streaming=True)
1348
  model.reset_session(reset_token2wav_cache=True)
1349
  model.init_token2wav_cache(prompt_speech_16k=ref_audio)
1350
 
@@ -1536,7 +1536,7 @@ sys_msg = {
1536
  import librosa
1537
 
1538
  model = ...
1539
- model.init_tts(streaming=False)
1540
 
1541
  # For both Chinese and English
1542
  ref_audio_path = "assets/HT_ref_audio.wav"
@@ -1589,7 +1589,7 @@ The `Mimick` task evaluates a model's end-to-end speech modeling capability. The
1589
  import librosa
1590
 
1591
  model = ...
1592
- model.init_tts(streaming=False)
1593
 
1594
  system_prompt = "You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy."
1595
 
@@ -1635,7 +1635,7 @@ For audio-to-text tasks, you can use the following prompts:
1635
  import librosa
1636
 
1637
  model = ...
1638
- model.init_tts(streaming=False)
1639
 
1640
  # Load the audio to be transcribed/analyzed
1641
  audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)
 
1054
  )
1055
  model.eval().cuda()
1056
 
1057
+ # Initialize TTS for audio output
1058
+ model.init_tts()
1059
 
1060
  # Convert simplex model to duplex mode
1061
  duplex_model = model.as_duplex()
 
1170
  from minicpmo.utils import get_video_frame_audio_segments
1171
 
1172
  model = ...
1173
+ model.init_tts()
1174
 
1175
  video_path = "assets/Skiing.mp4"
1176
 
 
1228
  from minicpmo.utils import get_video_frame_audio_segments
1229
 
1230
  model = ...
1231
+ model.init_tts()
1232
 
1233
  # Reset session for a new conversation (clears KV cache)
1234
  model.reset_session()
 
1344
  # You can use each type of system prompt mentioned above in streaming speech conversation
1345
 
1346
  # Reset state
1347
+ model.init_tts()
1348
  model.reset_session(reset_token2wav_cache=True)
1349
  model.init_token2wav_cache(prompt_speech_16k=ref_audio)
1350
 
 
1536
  import librosa
1537
 
1538
  model = ...
1539
+ model.init_tts()
1540
 
1541
  # For both Chinese and English
1542
  ref_audio_path = "assets/HT_ref_audio.wav"
 
1589
  import librosa
1590
 
1591
  model = ...
1592
+ model.init_tts()
1593
 
1594
  system_prompt = "You are a helpful assistant. You can accept video, audio, and text input and output voice and text. Respond with just the answer, no redundancy."
1595
 
 
1635
  import librosa
1636
 
1637
  model = ...
1638
+ model.init_tts()
1639
 
1640
  # Load the audio to be transcribed/analyzed
1641
  audio_input, _ = librosa.load("assets/Trump_WEF_2018_10s.mp3", sr=16000, mono=True)