YWMditto committed on
Commit
3dedaec
·
1 Parent(s): a130021

update readme

Browse files
Files changed (1) hide show
  1. README.md +6 -12
README.md CHANGED
@@ -152,11 +152,10 @@ MOSS-TTSD uses a **continuation** workflow: provide reference audio for each spe
152
  import os
153
  from pathlib import Path
154
  import torch
155
- import soundfile as sf
156
  import torchaudio
157
  from transformers import AutoModel, AutoProcessor
158
 
159
- pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTSD"
160
  audio_tokenizer_name_or_path = "OpenMOSS-Team/MOSS-Audio-Tokenizer"
161
  device = "cuda" if torch.cuda.is_available() else "cpu"
162
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
@@ -189,10 +188,8 @@ text_to_generate = "[S1] Listen, let's talk business. China. I'm hearing things.
189
  # --- Load & resample audio ---
190
 
191
  target_sr = int(processor.model_config.sampling_rate)
192
- audio1, sr1 = sf.read(prompt_audio_speaker1, dtype="float32", always_2d=True)
193
- audio2, sr2 = sf.read(prompt_audio_speaker2, dtype="float32", always_2d=True)
194
- wav1 = torch.from_numpy(audio1).transpose(0, 1).contiguous()
195
- wav2 = torch.from_numpy(audio2).transpose(0, 1).contiguous()
196
 
197
  if wav1.shape[0] > 1:
198
  wav1 = wav1.mean(dim=0, keepdim=True)
@@ -244,13 +241,10 @@ with torch.no_grad():
244
  )
245
 
246
  for message in processor.decode(outputs):
247
- for seg_idx, audio in enumerate(message.audio_codes_list):
248
- sf.write(
249
- save_dir / f"{sample_idx}_{seg_idx}.wav",
250
- audio.detach().cpu().to(torch.float32).numpy(),
251
- int(processor.model_config.sampling_rate),
252
- )
253
  sample_idx += 1
 
254
 
255
  ```
256
 
 
152
  import os
153
  from pathlib import Path
154
  import torch
 
155
  import torchaudio
156
  from transformers import AutoModel, AutoProcessor
157
 
158
+ pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTSD-v1.0"
159
  audio_tokenizer_name_or_path = "OpenMOSS-Team/MOSS-Audio-Tokenizer"
160
  device = "cuda" if torch.cuda.is_available() else "cpu"
161
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
 
188
  # --- Load & resample audio ---
189
 
190
  target_sr = int(processor.model_config.sampling_rate)
191
+ wav1, sr1 = torchaudio.load(prompt_audio_speaker1)
192
+ wav2, sr2 = torchaudio.load(prompt_audio_speaker2)
 
 
193
 
194
  if wav1.shape[0] > 1:
195
  wav1 = wav1.mean(dim=0, keepdim=True)
 
241
  )
242
 
243
  for message in processor.decode(outputs):
244
+ audio = message.audio_codes_list[0]
245
+ out_path = save_dir / f"sample{sample_idx}.wav"
 
 
 
 
246
  sample_idx += 1
247
+ torchaudio.save(out_path, audio.unsqueeze(0), processor.model_config.sampling_rate)
248
 
249
  ```
250