OpenMOSS-Team
/

MOSS-TTSD-v1.0

feature-extraction

Model card Files Files and versions

YWMditto committed 19 days ago

Commit

a25e3ca

·

1 Parent(s): c5ea929

update readme

Files changed (1) hide show

README.md +9 -6

README.md CHANGED Viewed

@@ -151,29 +151,32 @@ Notes:
 MOSS-TTSD uses a **continuation** workflow: provide reference audio for each speaker, their transcripts as a prefix, and the dialogue text to generate. The model continues in each speaker's identity.
 ```python
-import os
 from pathlib import Path
 import torch
 import torchaudio
 from transformers import AutoModel, AutoProcessor
 pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTSD-v1.0"
-audio_tokenizer_name_or_path = "OpenMOSS-Team/MOSS-Audio-Tokenizer"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if device == "cuda" else torch.float32
 processor = AutoProcessor.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
-    codec_path=audio_tokenizer_name_or_path,
 )
 processor.audio_tokenizer = processor.audio_tokenizer.to(device)
-processor.audio_tokenizer.eval()
 model = AutoModel.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
-    attn_implementation="flash_attention_2",
     torch_dtype=dtype,
 ).to(device)
 model.eval()
@@ -226,7 +229,7 @@ conversations = [
 batch_size = 1
-save_dir = Path("output")
 save_dir.mkdir(exist_ok=True, parents=True)
 sample_idx = 0
 with torch.no_grad():

 MOSS-TTSD uses a **continuation** workflow: provide reference audio for each speaker, their transcripts as a prefix, and the dialogue text to generate. The model continues in each speaker's identity.
 ```python
 from pathlib import Path
 import torch
 import torchaudio
 from transformers import AutoModel, AutoProcessor
+# Disable the broken cuDNN SDPA backend
+torch.backends.cuda.enable_cudnn_sdp(False)
+# Keep these enabled as fallbacks
+torch.backends.cuda.enable_flash_sdp(True)
+torch.backends.cuda.enable_mem_efficient_sdp(True)
+torch.backends.cuda.enable_math_sdp(True)
 pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTSD-v1.0"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if device == "cuda" else torch.float32
 processor = AutoProcessor.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
 )
 processor.audio_tokenizer = processor.audio_tokenizer.to(device)
 model = AutoModel.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
+    # If FlashAttention 2 is installed, you can set attn_implementation="flash_attention_2"
+    attn_implementation="sdpa",
     torch_dtype=dtype,
 ).to(device)
 model.eval()
 batch_size = 1
+save_dir = Path("inference_root")
 save_dir.mkdir(exist_ok=True, parents=True)
 sample_idx = 0
 with torch.no_grad():