File size: 3,711 Bytes
bfc6d2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""Audio download and preprocessing for D9c inference."""

import tempfile
from pathlib import Path
from typing import Tuple

import librosa
import numpy as np
import requests

# Default sample rate (24 kHz) used when loading audio for MERT/MuQ.
# Hardcoded here rather than imported, to avoid import issues.
TARGET_SR = 24000


class AudioDownloadError(Exception):
    """Error signalling that the audio could not be downloaded."""


class AudioProcessingError(Exception):
    """Error signalling that the audio could not be processed."""


def _guess_audio_suffix(content_type: str, audio_url: str) -> str:
    """Pick a temp-file extension from the Content-Type header or the URL path.

    Falls back to ".mp3" when neither source gives a recognised hint.
    """
    from urllib.parse import urlparse

    content_type = content_type.lower()
    # Look at the URL *path* only, so a query string or fragment
    # (e.g. ".../clip.wav?token=...") does not hide the extension.
    url_path = urlparse(audio_url).path.lower()

    # Order matters: it mirrors the mp3 -> wav -> flac precedence.
    for marker, suffix in (("mpeg", ".mp3"), ("wav", ".wav"), ("flac", ".flac")):
        if marker in content_type or url_path.endswith(suffix):
            return suffix
    return ".mp3"


def download_and_preprocess_audio(
    audio_url: str,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
    timeout: int = 60,
) -> Tuple[np.ndarray, float]:
    """Download audio from URL and preprocess for MERT/MuQ.

    The response body is streamed into a temporary file, decoded with
    ``librosa.load`` (resampled to ``target_sr`` and mixed down to mono),
    and the temporary file is removed before returning, whether the call
    succeeds or fails.

    Args:
        audio_url: URL to download audio from
        target_sr: Target sample rate (24kHz for MERT/MuQ)
        max_duration: Maximum audio duration in seconds
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds)

    Returns:
        Tuple of (audio_array, duration_seconds)

    Raises:
        AudioDownloadError: If the request fails, the server returns an
            HTTP error status, or the connection breaks while the body is
            being streamed
        AudioProcessingError: If decoding fails, or the decoded audio is
            longer than ``max_duration`` or shorter than 1.0 second
    """
    temp_path = None
    try:
        try:
            # The context manager releases the streamed connection even if
            # writing the body to disk fails part-way through.
            with requests.get(audio_url, timeout=timeout, stream=True) as response:
                response.raise_for_status()
                suffix = _guess_audio_suffix(
                    response.headers.get("content-type", ""), audio_url
                )

                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
                    # Record the path before writing so the outer ``finally``
                    # can clean up if the download is interrupted.
                    temp_path = Path(f.name)
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.RequestException as e:
            raise AudioDownloadError(f"Failed to download audio: {e}") from e

        try:
            audio, sr = librosa.load(temp_path, sr=target_sr, mono=True)
            duration = len(audio) / sr

            if duration > max_duration:
                raise AudioProcessingError(
                    f"Audio too long: {duration:.1f}s > {max_duration}s limit"
                )

            if duration < 1.0:
                raise AudioProcessingError(
                    f"Audio too short: {duration:.1f}s < 1.0s minimum"
                )

            return audio, duration

        except AudioProcessingError:
            # Validation errors raised above already carry the right message.
            raise
        except Exception as e:
            raise AudioProcessingError(f"Failed to process audio: {e}") from e

    finally:
        if temp_path is not None:
            temp_path.unlink(missing_ok=True)


def load_audio_from_file(
    audio_path: Path,
    target_sr: int = TARGET_SR,
) -> Tuple[np.ndarray, float]:
    """Load a local audio file as mono samples at ``target_sr``.

    Args:
        audio_path: Path of the audio file to read
        target_sr: Sample rate passed to ``librosa.load``

    Returns:
        Tuple of (audio_array, duration_seconds)
    """
    samples, sample_rate = librosa.load(audio_path, sr=target_sr, mono=True)
    return samples, len(samples) / sample_rate


def preprocess_audio_from_bytes(
    audio_bytes: bytes,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
) -> Tuple[np.ndarray, float]:
    """Preprocess audio from raw bytes (e.g., base64 decoded).

    The bytes are wrapped in an in-memory buffer and decoded with
    ``librosa.load`` (resampled to ``target_sr`` and mixed down to mono).

    Args:
        audio_bytes: Encoded audio file contents
        target_sr: Target sample rate passed to ``librosa.load``
        max_duration: Maximum audio duration in seconds

    Returns:
        Tuple of (audio_array, duration_seconds)

    Raises:
        AudioProcessingError: If the input is empty, decoding fails, or the
            decoded audio is longer than ``max_duration`` or shorter than
            1.0 second
    """
    import io

    # Fail fast with a clear message rather than a decoder-specific error.
    if audio_bytes is None or len(audio_bytes) == 0:
        raise AudioProcessingError("Failed to process audio bytes: empty input")

    try:
        audio, sr = librosa.load(io.BytesIO(audio_bytes), sr=target_sr, mono=True)
        duration = len(audio) / sr

        if duration > max_duration:
            raise AudioProcessingError(
                f"Audio too long: {duration:.1f}s > {max_duration}s limit"
            )

        if duration < 1.0:
            raise AudioProcessingError(
                f"Audio too short: {duration:.1f}s < 1.0s minimum"
            )

        return audio, duration

    except AudioProcessingError:
        # Validation errors raised above already carry the right message.
        raise
    except Exception as e:
        # Chain the cause so the underlying decoder error stays in the traceback.
        raise AudioProcessingError(f"Failed to process audio bytes: {e}") from e