OpenMOSS-Team
/

MOSS-TTS-Local-Transformer

Text-to-Speech

Safetensors

moss_tts_delay

custom_code

Model card | Files and versions

xet

Community

YWMditto committed on Feb 11

Commit

3fa84fb

1 Parent(s): ea29ade

update readme

Browse files

Files changed (1) hide show

README.md +56 -16

README.md CHANGED Viewed

@@ -183,7 +183,7 @@ MOSS-TTS provides a convenient `generate` interface for rapid usage. The example
 3. Duration control
 ```python
-import os
 from pathlib import Path
 import torch
 import torchaudio
@@ -222,6 +222,28 @@ pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTS"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if device == "cuda" else torch.float32
 processor = AutoProcessor.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
@@ -286,7 +308,7 @@ conversations = [
 model = AutoModel.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
-    attn_implementation="sdpa",
     torch_dtype=dtype,
 ).to(device)
 model.eval()
@@ -312,7 +334,6 @@ generation_config.layers = [
 batch_size = 1
-messages = []
 save_dir = Path(f"inference_root_moss_tts_local_transformer_generation")
 save_dir.mkdir(exist_ok=True, parents=True)
 sample_idx = 0
@@ -330,11 +351,10 @@ with torch.no_grad():
         )
         for message in processor.decode(outputs):
-            for seg_idx, audio in enumerate(message.audio_codes_list):
-                # audio is a waveform tensor after decode_audio_codes
-                out_path = save_dir / f"sample{sample_idx}_seg{seg_idx}.wav"
-                sample_idx += 1
-                torchaudio.save(out_path, audio.unsqueeze(0), processor.model_config.sampling_rate)
 ```
@@ -343,7 +363,7 @@ with torch.no_grad():
 MOSS-TTS supports continuation-based cloning: provide a prefix audio clip in the assistant message, and make sure the **prefix transcript** is included in the text. The model continues in the same speaker identity and style.
 ```python
-import os
 from pathlib import Path
 import torch
 import torchaudio
@@ -380,6 +400,28 @@ pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTS"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if device == "cuda" else torch.float32
 processor = AutoProcessor.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
@@ -414,7 +456,7 @@ conversations = [
 model = AutoModel.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
-    attn_implementation="sdpa",
     torch_dtype=dtype,
 ).to(device)
 model.eval()
@@ -441,7 +483,6 @@ generation_config.layers = [
 batch_size = 1
-messages = []
 save_dir = Path("inference_root_moss_tts_local_transformer_continuation")
 save_dir.mkdir(exist_ok=True, parents=True)
 sample_idx = 0
@@ -459,11 +500,10 @@ with torch.no_grad():
         )
         for message in processor.decode(outputs):
-            for seg_idx, audio in enumerate(message.audio_codes_list):
-                # audio is a waveform tensor after decode_audio_codes
-                out_path = save_dir / f"sample{sample_idx}_seg{seg_idx}.wav"
-                sample_idx += 1
-                torchaudio.save(out_path, audio.unsqueeze(0), processor.model_config.sampling_rate)
 ```

 3. Duration control
 ```python
+import importlib.util
 from pathlib import Path
 import torch
 import torchaudio
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if device == "cuda" else torch.float32
+def resolve_attn_implementation() -> str:
+    # Prefer FlashAttention 2 when package + device conditions are met.
+    if (
+        device == "cuda"
+        and importlib.util.find_spec("flash_attn") is not None
+        and dtype in {torch.float16, torch.bfloat16}
+    ):
+        major, _ = torch.cuda.get_device_capability()
+        if major >= 8:
+            return "flash_attention_2"
+    # CUDA fallback: use PyTorch SDPA kernels.
+    if device == "cuda":
+        return "sdpa"
+    # CPU fallback.
+    return "eager"
+attn_implementation = resolve_attn_implementation()
+print(f"[INFO] Using attn_implementation={attn_implementation}")
 processor = AutoProcessor.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
 model = AutoModel.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
+    attn_implementation=attn_implementation,
     torch_dtype=dtype,
 ).to(device)
 model.eval()
 batch_size = 1
 save_dir = Path(f"inference_root_moss_tts_local_transformer_generation")
 save_dir.mkdir(exist_ok=True, parents=True)
 sample_idx = 0
         )
         for message in processor.decode(outputs):
+            audio = message.audio_codes_list[0]
+            out_path = save_dir / f"sample{sample_idx}.wav"
+            sample_idx += 1
+            torchaudio.save(out_path, audio.unsqueeze(0), processor.model_config.sampling_rate)
 ```
 MOSS-TTS supports continuation-based cloning: provide a prefix audio clip in the assistant message, and make sure the **prefix transcript** is included in the text. The model continues in the same speaker identity and style.
 ```python
+import importlib.util
 from pathlib import Path
 import torch
 import torchaudio
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.bfloat16 if device == "cuda" else torch.float32
+def resolve_attn_implementation() -> str:
+    # Prefer FlashAttention 2 when package + device conditions are met.
+    if (
+        device == "cuda"
+        and importlib.util.find_spec("flash_attn") is not None
+        and dtype in {torch.float16, torch.bfloat16}
+    ):
+        major, _ = torch.cuda.get_device_capability()
+        if major >= 8:
+            return "flash_attention_2"
+    # CUDA fallback: use PyTorch SDPA kernels.
+    if device == "cuda":
+        return "sdpa"
+    # CPU fallback.
+    return "eager"
+attn_implementation = resolve_attn_implementation()
+print(f"[INFO] Using attn_implementation={attn_implementation}")
 processor = AutoProcessor.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
 model = AutoModel.from_pretrained(
     pretrained_model_name_or_path,
     trust_remote_code=True,
+    attn_implementation=attn_implementation,
     torch_dtype=dtype,
 ).to(device)
 model.eval()
 batch_size = 1
 save_dir = Path("inference_root_moss_tts_local_transformer_continuation")
 save_dir.mkdir(exist_ok=True, parents=True)
 sample_idx = 0
         )
         for message in processor.decode(outputs):
+            audio = message.audio_codes_list[0]
+            out_path = save_dir / f"sample{sample_idx}.wav"
+            sample_idx += 1
+            torchaudio.save(out_path, audio.unsqueeze(0), processor.model_config.sampling_rate)
 ```