ChuxiJ committed on
Commit
96b5f27
·
1 Parent(s): d5b1200

fix torchcodec ffmpeg

Browse files
Files changed (3) hide show
  1. Dockerfile +3 -1
  2. acestep/audio_utils.py +12 -26
  3. acestep/handler.py +11 -26
Dockerfile CHANGED
@@ -5,10 +5,12 @@ FROM python:3.11-slim
5
  # Set environment variables
6
  ENV PYTHONDONTWRITEBYTECODE=1 \
7
  PYTHONUNBUFFERED=1 \
8
- DEBIAN_FRONTEND=noninteractive
 
9
 
10
  # Install system dependencies
11
  # build-essential is required for triton to compile CUDA kernels
 
12
  RUN apt-get update && apt-get install -y --no-install-recommends \
13
  git \
14
  ffmpeg \
 
5
  # Set environment variables
6
  ENV PYTHONDONTWRITEBYTECODE=1 \
7
  PYTHONUNBUFFERED=1 \
8
+ DEBIAN_FRONTEND=noninteractive \
9
+ TORCHAUDIO_USE_TORCHCODEC=0
10
 
11
  # Install system dependencies
12
  # build-essential is required for triton to compile CUDA kernels
13
+ # ffmpeg is required for torchaudio ffmpeg backend (audio loading/saving)
14
  RUN apt-get update && apt-get install -y --no-install-recommends \
15
  git \
16
  ffmpeg \
acestep/audio_utils.py CHANGED
@@ -8,6 +8,11 @@ Independent audio file operations outside of handler, supporting:
8
  """
9
 
10
  import os
 
 
 
 
 
11
  import hashlib
12
  import json
13
  from pathlib import Path
@@ -130,11 +135,11 @@ class AudioSaver:
130
 
131
  def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
132
  """
133
- Load audio file with fallback backends for compatibility.
134
 
135
- In HuggingFace Space environment, the default torchcodec backend may fail
136
- due to missing CUDA dependencies (libnppicc.so.12). This method tries
137
- ffmpeg backend first (fast), then sox, then soundfile as fallbacks.
138
 
139
  Args:
140
  audio_file: Path to the audio file
@@ -144,7 +149,6 @@ class AudioSaver:
144
 
145
  Raises:
146
  FileNotFoundError: If the audio file doesn't exist
147
- Exception: If all backends fail to load the audio
148
  """
149
  audio_file = str(audio_file)
150
 
@@ -152,27 +156,9 @@ class AudioSaver:
152
  if not Path(audio_file).exists():
153
  raise FileNotFoundError(f"Audio file not found: {audio_file}")
154
 
155
- # Try ffmpeg backend first (fast and compatible)
156
- try:
157
- audio, sr = torchaudio.load(audio_file, backend="ffmpeg")
158
- return audio, sr
159
- except Exception as e:
160
- logger.debug(f"[AudioSaver._load_audio_file] ffmpeg backend failed: {e}, trying sox backend")
161
-
162
- # Try sox backend as second option
163
- try:
164
- audio, sr = torchaudio.load(audio_file, backend="sox")
165
- return audio, sr
166
- except Exception as e:
167
- logger.debug(f"[AudioSaver._load_audio_file] sox backend failed: {e}, trying soundfile backend")
168
-
169
- # Try soundfile backend as last resort
170
- try:
171
- audio, sr = torchaudio.load(audio_file, backend="soundfile")
172
- return audio, sr
173
- except Exception as e:
174
- logger.error(f"[AudioSaver._load_audio_file] All backends failed to load audio: {audio_file}")
175
- raise
176
 
177
  def convert_audio(
178
  self,
 
8
  """
9
 
10
  import os
11
+
12
+ # Disable torchcodec backend to avoid CUDA dependency issues on HuggingFace Space
13
+ # This forces torchaudio to use ffmpeg/sox/soundfile backends instead
14
+ os.environ["TORCHAUDIO_USE_TORCHCODEC"] = "0"
15
+
16
  import hashlib
17
  import json
18
  from pathlib import Path
 
135
 
136
  def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
137
  """
138
+ Load audio file using torchaudio.
139
 
140
+ Note: TORCHAUDIO_USE_TORCHCODEC=0 is set at module level to disable
141
+ torchcodec backend and avoid CUDA dependency issues on HuggingFace Space.
142
+ This makes torchaudio use ffmpeg backend by default.
143
 
144
  Args:
145
  audio_file: Path to the audio file
 
149
 
150
  Raises:
151
  FileNotFoundError: If the audio file doesn't exist
 
152
  """
153
  audio_file = str(audio_file)
154
 
 
156
  if not Path(audio_file).exists():
157
  raise FileNotFoundError(f"Audio file not found: {audio_file}")
158
 
159
+ # Load audio using default backend (ffmpeg, since torchcodec is disabled)
160
+ audio, sr = torchaudio.load(audio_file)
161
+ return audio, sr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  def convert_audio(
164
  self,
acestep/handler.py CHANGED
@@ -7,6 +7,10 @@ import os
7
  # Disable tokenizers parallelism to avoid fork warning
8
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
9
 
 
 
 
 
10
  import math
11
  from copy import deepcopy
12
  import tempfile
@@ -1064,11 +1068,11 @@ class AceStepHandler:
1064
 
1065
  def _load_audio_file(self, audio_file) -> Tuple[torch.Tensor, int]:
1066
  """
1067
- Load audio file with fallback backends for compatibility.
1068
 
1069
- In HuggingFace Space environment, the default torchcodec backend may fail
1070
- due to missing CUDA dependencies (libnppicc.so.12). This method tries
1071
- ffmpeg backend first (fast), then sox, then soundfile as fallbacks.
1072
 
1073
  Args:
1074
  audio_file: Path to the audio file
@@ -1078,33 +1082,14 @@ class AceStepHandler:
1078
 
1079
  Raises:
1080
  FileNotFoundError: If the audio file doesn't exist
1081
- Exception: If all backends fail to load the audio
1082
  """
1083
  # Check if file exists first
1084
  if not os.path.exists(audio_file):
1085
  raise FileNotFoundError(f"Audio file not found: {audio_file}")
1086
 
1087
- # Try ffmpeg backend first (fast and compatible)
1088
- try:
1089
- audio, sr = torchaudio.load(audio_file, backend="ffmpeg")
1090
- return audio, sr
1091
- except Exception as e:
1092
- logger.debug(f"[_load_audio_file] ffmpeg backend failed: {e}, trying sox backend")
1093
-
1094
- # Try sox backend as second option
1095
- try:
1096
- audio, sr = torchaudio.load(audio_file, backend="sox")
1097
- return audio, sr
1098
- except Exception as e:
1099
- logger.debug(f"[_load_audio_file] sox backend failed: {e}, trying soundfile backend")
1100
-
1101
- # Try soundfile backend as last resort
1102
- try:
1103
- audio, sr = torchaudio.load(audio_file, backend="soundfile")
1104
- return audio, sr
1105
- except Exception as e:
1106
- logger.error(f"[_load_audio_file] All backends failed to load audio: {audio_file}")
1107
- raise
1108
 
1109
  def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
1110
  """
 
7
  # Disable tokenizers parallelism to avoid fork warning
8
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
9
 
10
+ # Disable torchcodec backend to avoid CUDA dependency issues on HuggingFace Space
11
+ # This forces torchaudio to use ffmpeg/sox/soundfile backends instead
12
+ os.environ["TORCHAUDIO_USE_TORCHCODEC"] = "0"
13
+
14
  import math
15
  from copy import deepcopy
16
  import tempfile
 
1068
 
1069
  def _load_audio_file(self, audio_file) -> Tuple[torch.Tensor, int]:
1070
  """
1071
+ Load audio file using torchaudio.
1072
 
1073
+ Note: TORCHAUDIO_USE_TORCHCODEC=0 is set at module level to disable
1074
+ torchcodec backend and avoid CUDA dependency issues on HuggingFace Space.
1075
+ This makes torchaudio use ffmpeg backend by default.
1076
 
1077
  Args:
1078
  audio_file: Path to the audio file
 
1082
 
1083
  Raises:
1084
  FileNotFoundError: If the audio file doesn't exist
 
1085
  """
1086
  # Check if file exists first
1087
  if not os.path.exists(audio_file):
1088
  raise FileNotFoundError(f"Audio file not found: {audio_file}")
1089
 
1090
+ # Load audio using default backend (ffmpeg, since torchcodec is disabled)
1091
+ audio, sr = torchaudio.load(audio_file)
1092
+ return audio, sr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1093
 
1094
  def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
1095
  """