Kuangwei Chen commited on
Commit
16ec632
·
1 Parent(s): c828a64

try to fix the "没有可用的音频 I/O backend" error ("no available audio I/O backend").

Browse files
requirements.txt CHANGED
@@ -4,4 +4,5 @@ torch==2.7.0
4
  torchaudio==2.7.0
5
  transformers==4.57.1
6
  safetensors>=0.4.3
 
7
  gradio==6.5.1
 
4
  torchaudio==2.7.0
5
  transformers==4.57.1
6
  safetensors>=0.4.3
7
+ soundfile>=0.13.1
8
  gradio==6.5.1
weights/tts/modeling_nanotts_global_local.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
11
  from typing import Any, Iterator, Optional, Sequence, Union
12
 
13
  import numpy as np
 
14
  import torch
15
  import torch.nn as nn
16
  import torchaudio
@@ -1573,7 +1574,17 @@ class NanoTTSGlobalLocalForCausalLM(NanoTTSPreTrainedModel):
1573
  target_sample_rate: int,
1574
  target_channels: int,
1575
  ) -> tuple[torch.FloatTensor, int]:
1576
- waveform, sample_rate = torchaudio.load(str(reference_audio_path))
 
 
 
 
 
 
 
 
 
 
1577
  waveform = waveform.to(torch.float32)
1578
  if sample_rate != target_sample_rate:
1579
  waveform = torchaudio.functional.resample(waveform, sample_rate, target_sample_rate)
@@ -1587,6 +1598,25 @@ class NanoTTSGlobalLocalForCausalLM(NanoTTSPreTrainedModel):
1587
  return waveform.mean(dim=0, keepdim=True), sample_rate
1588
  raise ValueError(f"Unsupported reference audio channel conversion: {current_channels} -> {target_channels}")
1589
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1590
  def _decode_local_last_hidden_state(
1591
  self,
1592
  local_inputs_embeds: torch.FloatTensor,
@@ -2193,8 +2223,7 @@ class NanoTTSGlobalLocalForCausalLM(NanoTTSPreTrainedModel):
2193
 
2194
  decoded_sample_rate = decoded_sample_rate or target_sample_rate
2195
  output_path = Path(output_audio_path)
2196
- output_path.parent.mkdir(parents=True, exist_ok=True)
2197
- torchaudio.save(str(output_path), waveform, decoded_sample_rate)
2198
 
2199
  yield {
2200
  "type": "result",
@@ -2428,8 +2457,7 @@ class NanoTTSGlobalLocalForCausalLM(NanoTTSPreTrainedModel):
2428
  assert decoded_sample_rate is not None
2429
 
2430
  output_path = Path(output_audio_path)
2431
- output_path.parent.mkdir(parents=True, exist_ok=True)
2432
- torchaudio.save(str(output_path), waveform, decoded_sample_rate)
2433
 
2434
  if was_training:
2435
  self.train()
 
11
  from typing import Any, Iterator, Optional, Sequence, Union
12
 
13
  import numpy as np
14
+ import soundfile as sf
15
  import torch
16
  import torch.nn as nn
17
  import torchaudio
 
1574
  target_sample_rate: int,
1575
  target_channels: int,
1576
  ) -> tuple[torch.FloatTensor, int]:
1577
+ try:
1578
+ waveform, sample_rate = torchaudio.load(str(reference_audio_path))
1579
+ except RuntimeError as exc:
1580
+ logging.warning(
1581
+ "torchaudio.load failed for %s; falling back to soundfile",
1582
+ reference_audio_path,
1583
+ exc_info=True,
1584
+ )
1585
+ audio_array, sample_rate = sf.read(str(reference_audio_path), dtype="float32", always_2d=True)
1586
+ waveform = torch.from_numpy(audio_array.T).contiguous()
1587
+
1588
  waveform = waveform.to(torch.float32)
1589
  if sample_rate != target_sample_rate:
1590
  waveform = torchaudio.functional.resample(waveform, sample_rate, target_sample_rate)
 
1598
  return waveform.mean(dim=0, keepdim=True), sample_rate
1599
  raise ValueError(f"Unsupported reference audio channel conversion: {current_channels} -> {target_channels}")
1600
 
1601
+ @staticmethod
1602
+ def _save_audio(
1603
+ output_path: Union[str, Path],
1604
+ waveform: torch.Tensor,
1605
+ sample_rate: int,
1606
+ ) -> None:
1607
+ path = Path(output_path)
1608
+ path.parent.mkdir(parents=True, exist_ok=True)
1609
+ try:
1610
+ torchaudio.save(str(path), waveform, sample_rate)
1611
+ except RuntimeError:
1612
+ logging.warning(
1613
+ "torchaudio.save failed for %s; falling back to soundfile",
1614
+ path,
1615
+ exc_info=True,
1616
+ )
1617
+ waveform_np = waveform.detach().cpu().to(torch.float32).numpy().T
1618
+ sf.write(str(path), waveform_np, sample_rate)
1619
+
1620
  def _decode_local_last_hidden_state(
1621
  self,
1622
  local_inputs_embeds: torch.FloatTensor,
 
2223
 
2224
  decoded_sample_rate = decoded_sample_rate or target_sample_rate
2225
  output_path = Path(output_audio_path)
2226
+ self._save_audio(output_path, waveform, decoded_sample_rate)
 
2227
 
2228
  yield {
2229
  "type": "result",
 
2457
  assert decoded_sample_rate is not None
2458
 
2459
  output_path = Path(output_audio_path)
2460
+ self._save_audio(output_path, waveform, decoded_sample_rate)
 
2461
 
2462
  if was_training:
2463
  self.train()