File size: 3,711 Bytes
"""Audio download and preprocessing for D9c inference."""
import tempfile
from pathlib import Path
from typing import Tuple
import librosa
import numpy as np
import requests
# Default sample rate for MERT/MuQ (hardcoded to avoid import issues)
TARGET_SR = 24000
class AudioDownloadError(Exception):
    """Error signalling that the audio could not be downloaded."""
class AudioProcessingError(Exception):
    """Error signalling that the audio could not be decoded or validated."""
def _guess_audio_suffix(content_type: str, audio_url: str) -> str:
    """Pick a temp-file extension from the Content-Type header or the URL path."""
    from urllib.parse import urlsplit

    kind = content_type.lower()
    try:
        # Look at the path only, so "?token=..." or "#frag" cannot hide the extension.
        path = urlsplit(audio_url).path.lower()
    except ValueError:
        # Unparseable URL: fall back to matching against the raw string.
        path = audio_url.lower()

    # Same precedence as the original if/elif chain: mp3, then wav, then flac.
    for marker, suffix in (("mpeg", ".mp3"), ("wav", ".wav"), ("flac", ".flac")):
        if marker in kind or path.endswith(suffix):
            return suffix
    # Unknown formats keep the historical default.
    return ".mp3"


def download_and_preprocess_audio(
    audio_url: str,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
    timeout: int = 60,
) -> Tuple[np.ndarray, float]:
    """Download audio from URL and preprocess for MERT/MuQ.

    The response body is streamed to a temporary file, decoded with librosa
    as mono audio at ``target_sr``, and checked against the duration limits.
    The temporary file is removed on every exit path.

    Args:
        audio_url: URL to download audio from
        target_sr: Target sample rate (24kHz for MERT/MuQ)
        max_duration: Maximum audio duration in seconds
        timeout: Download timeout in seconds

    Returns:
        Tuple of (audio_array, duration_seconds)

    Raises:
        AudioDownloadError: If the request fails, returns an error status,
            or the connection breaks while the body is being streamed
        AudioProcessingError: If decoding fails or the audio is longer than
            ``max_duration`` or shorter than 1.0 second
    """
    temp_path = None
    try:
        try:
            # With stream=True the connection stays checked out until the body
            # is consumed or the response is closed; the context manager
            # guarantees it is released even when we bail out early.
            with requests.get(audio_url, timeout=timeout, stream=True) as response:
                response.raise_for_status()
                suffix = _guess_audio_suffix(
                    response.headers.get("content-type", ""), audio_url
                )
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
                    # Record the path before writing so the outer ``finally``
                    # can delete a partially written file.
                    temp_path = Path(f.name)
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.RequestException as e:
            raise AudioDownloadError(f"Failed to download audio: {e}") from e

        try:
            audio, sr = librosa.load(temp_path, sr=target_sr, mono=True)
            duration = len(audio) / sr
            if duration > max_duration:
                raise AudioProcessingError(
                    f"Audio too long: {duration:.1f}s > {max_duration}s limit"
                )
            if duration < 1.0:
                raise AudioProcessingError(
                    f"Audio too short: {duration:.1f}s < 1.0s minimum"
                )
            return audio, duration
        except AudioProcessingError:
            # Our own validation errors pass through untouched.
            raise
        except Exception as e:
            raise AudioProcessingError(f"Failed to process audio: {e}") from e
    finally:
        if temp_path is not None:
            temp_path.unlink(missing_ok=True)
def load_audio_from_file(
    audio_path: Path,
    target_sr: int = TARGET_SR,
) -> Tuple[np.ndarray, float]:
    """Read a local audio file as mono samples at ``target_sr``.

    No duration limits are applied here, and any error raised by
    ``librosa.load`` propagates to the caller unchanged.

    Args:
        audio_path: Path of the audio file to load
        target_sr: Sample rate passed to ``librosa.load``

    Returns:
        Tuple of (audio_array, duration_seconds)
    """
    waveform, rate = librosa.load(audio_path, sr=target_sr, mono=True)
    return waveform, len(waveform) / rate
def preprocess_audio_from_bytes(
    audio_bytes: bytes,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
) -> Tuple[np.ndarray, float]:
    """Preprocess audio from raw bytes (e.g., base64 decoded).

    The bytes are decoded in memory with librosa as mono audio at
    ``target_sr`` and checked against the duration limits.

    Args:
        audio_bytes: Encoded audio file contents
        target_sr: Target sample rate passed to ``librosa.load``
        max_duration: Maximum audio duration in seconds

    Returns:
        Tuple of (audio_array, duration_seconds)

    Raises:
        AudioProcessingError: If the payload is empty, decoding fails, or the
            audio is longer than ``max_duration`` or shorter than 1.0 second
    """
    import io

    # An empty payload can never decode; fail fast with a clear message
    # instead of surfacing an opaque decoder error.
    if not audio_bytes:
        raise AudioProcessingError("Failed to process audio bytes: empty input")

    try:
        audio, sr = librosa.load(io.BytesIO(audio_bytes), sr=target_sr, mono=True)
        duration = len(audio) / sr
        if duration > max_duration:
            raise AudioProcessingError(
                f"Audio too long: {duration:.1f}s > {max_duration}s limit"
            )
        if duration < 1.0:
            raise AudioProcessingError(
                f"Audio too short: {duration:.1f}s < 1.0s minimum"
            )
        return audio, duration
    except AudioProcessingError:
        # Our own validation errors pass through untouched.
        raise
    except Exception as e:
        # Keep the decoder failure attached as the explicit cause.
        raise AudioProcessingError(f"Failed to process audio bytes: {e}") from e
|