update readme
Browse files
README.md
CHANGED
|
@@ -152,11 +152,10 @@ MOSS-TTSD uses a **continuation** workflow: provide reference audio for each spe
|
|
| 152 |
import os
|
| 153 |
from pathlib import Path
|
| 154 |
import torch
|
| 155 |
-
import soundfile as sf
|
| 156 |
import torchaudio
|
| 157 |
from transformers import AutoModel, AutoProcessor
|
| 158 |
|
| 159 |
-
pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTSD"
|
| 160 |
audio_tokenizer_name_or_path = "OpenMOSS-Team/MOSS-Audio-Tokenizer"
|
| 161 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 162 |
dtype = torch.bfloat16 if device == "cuda" else torch.float32
|
|
@@ -189,10 +188,8 @@ text_to_generate = "[S1] Listen, let's talk business. China. I'm hearing things.
|
|
| 189 |
# --- Load & resample audio ---
|
| 190 |
|
| 191 |
target_sr = int(processor.model_config.sampling_rate)
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
wav1 = torch.from_numpy(audio1).transpose(0, 1).contiguous()
|
| 195 |
-
wav2 = torch.from_numpy(audio2).transpose(0, 1).contiguous()
|
| 196 |
|
| 197 |
if wav1.shape[0] > 1:
|
| 198 |
wav1 = wav1.mean(dim=0, keepdim=True)
|
|
@@ -244,13 +241,10 @@ with torch.no_grad():
|
|
| 244 |
)
|
| 245 |
|
| 246 |
for message in processor.decode(outputs):
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
save_dir / f"{sample_idx}_{seg_idx}.wav",
|
| 250 |
-
audio.detach().cpu().to(torch.float32).numpy(),
|
| 251 |
-
int(processor.model_config.sampling_rate),
|
| 252 |
-
)
|
| 253 |
sample_idx += 1
|
|
|
|
| 254 |
|
| 255 |
```
|
| 256 |
|
|
|
|
| 152 |
import os
|
| 153 |
from pathlib import Path
|
| 154 |
import torch
|
|
|
|
| 155 |
import torchaudio
|
| 156 |
from transformers import AutoModel, AutoProcessor
|
| 157 |
|
| 158 |
+
pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTSD-v1.0"
|
| 159 |
audio_tokenizer_name_or_path = "OpenMOSS-Team/MOSS-Audio-Tokenizer"
|
| 160 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 161 |
dtype = torch.bfloat16 if device == "cuda" else torch.float32
|
|
|
|
| 188 |
# --- Load & resample audio ---
|
| 189 |
|
| 190 |
target_sr = int(processor.model_config.sampling_rate)
|
| 191 |
+
wav1, sr1 = torchaudio.load(prompt_audio_speaker1)
|
| 192 |
+
wav2, sr2 = torchaudio.load(prompt_audio_speaker2)
|
|
|
|
|
|
|
| 193 |
|
| 194 |
if wav1.shape[0] > 1:
|
| 195 |
wav1 = wav1.mean(dim=0, keepdim=True)
|
|
|
|
| 241 |
)
|
| 242 |
|
| 243 |
for message in processor.decode(outputs):
|
| 244 |
+
audio = message.audio_codes_list[0]
|
| 245 |
+
out_path = save_dir / f"sample{sample_idx}.wav"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
sample_idx += 1
|
| 247 |
+
torchaudio.save(out_path, audio.unsqueeze(0), processor.model_config.sampling_rate)
|
| 248 |
|
| 249 |
```
|
| 250 |
|