Yng314 committed on
Commit
212dda8
·
1 Parent(s): 5e8b75e

feat: Implement robust audio file loading with `torchaudio` and `soundfile` fallback.

Browse files
Files changed (2) hide show
  1. acestep/handler.py +19 -2
  2. requirements.txt +2 -0
acestep/handler.py CHANGED
@@ -131,6 +131,23 @@ class AceStepHandler:
131
  if self.config is None:
132
  return False
133
  return getattr(self.config, 'is_turbo', False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  def load_lora(self, lora_path: str) -> str:
136
  """Load LoRA adapter into the decoder.
@@ -1235,7 +1252,7 @@ class AceStepHandler:
1235
 
1236
  try:
1237
  # Load audio file
1238
- audio, sr = torchaudio.load(audio_file)
1239
 
1240
  logger.debug(f"[process_reference_audio] Reference audio shape: {audio.shape}")
1241
  logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
@@ -1290,7 +1307,7 @@ class AceStepHandler:
1290
 
1291
  try:
1292
  # Load audio file
1293
- audio, sr = torchaudio.load(audio_file)
1294
 
1295
  # Normalize to stereo 48kHz
1296
  audio = self._normalize_audio_to_stereo_48k(audio, sr)
 
131
  if self.config is None:
132
  return False
133
  return getattr(self.config, 'is_turbo', False)
134
+
135
def _load_audio_file(self, audio_file: str) -> Tuple[torch.Tensor, int]:
    """Load an audio file, trying torchaudio first and soundfile second.

    Args:
        audio_file: Path of the audio file to decode.

    Returns:
        A ``(audio, sample_rate)`` pair. When torchaudio succeeds this is
        exactly what ``torchaudio.load`` returned. On the soundfile path,
        ``audio`` is a float32 tensor transposed to ``[channels, frames]``
        and ``sample_rate`` is coerced to ``int``.

    Raises:
        Exception: The original ``torchaudio.load`` error when both loaders
            fail. The soundfile error is logged and attached as the
            ``__cause__`` so neither failure is lost.
    """
    try:
        return torchaudio.load(audio_file)
    except Exception as torchaudio_exc:
        logger.warning(
            f"[_load_audio_file] torchaudio.load failed for {audio_file}, "
            f"fallback to soundfile: {torchaudio_exc}"
        )
        try:
            # soundfile returns [frames, channels]; convert to [channels, frames]
            audio_np, sr = sf.read(audio_file, dtype="float32", always_2d=True)
            # .T yields a non-contiguous view; copy() before wrapping it in a tensor.
            audio = torch.from_numpy(audio_np.T.copy())
            return audio, int(sr)
        except Exception as soundfile_exc:
            # Record why the fallback failed instead of silently dropping it;
            # callers still receive the primary (torchaudio) error.
            logger.warning(
                f"[_load_audio_file] soundfile fallback also failed for "
                f"{audio_file}: {soundfile_exc}"
            )
            raise torchaudio_exc from soundfile_exc
151
 
152
  def load_lora(self, lora_path: str) -> str:
153
  """Load LoRA adapter into the decoder.
 
1252
 
1253
  try:
1254
  # Load audio file
1255
+ audio, sr = self._load_audio_file(audio_file)
1256
 
1257
  logger.debug(f"[process_reference_audio] Reference audio shape: {audio.shape}")
1258
  logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
 
1307
 
1308
  try:
1309
  # Load audio file
1310
+ audio, sr = self._load_audio_file(audio_file)
1311
 
1312
  # Normalize to stereo 48kHz
1313
  audio = self._normalize_audio_to_stereo_48k(audio, sr)
requirements.txt CHANGED
@@ -2,6 +2,8 @@ gradio==6.7.0
2
  spaces==0.47.0
3
  torch
4
  torchaudio
 
 
5
  transformers>=4.51.0,<4.58.0
6
  diffusers
7
  accelerate
 
2
  spaces==0.47.0
3
  torch
4
  torchaudio
5
+ # torchaudio>=2.9 may require torchcodec backend for torchaudio.load
6
+ torchcodec
7
  transformers>=4.51.0,<4.58.0
8
  diffusers
9
  accelerate