File size: 2,833 Bytes
a0fcd39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""Audio utility functions for format conversion and processing."""

import io
import numpy as np
import soundfile as sf


def to_mono(audio: np.ndarray) -> np.ndarray:
    """
    Collapse multi-channel audio into a single channel.

    Args:
        audio: Audio array; 2-D input is interpreted as (samples, channels),
            anything else is treated as already mono

    Returns:
        The per-sample average across channels, shape (samples,), for 2-D
        input; otherwise the input array itself, unchanged
    """
    is_multichannel = audio.ndim == 2
    if not is_multichannel:
        return audio

    # Average along the channel axis so each sample becomes a single value.
    return audio.mean(axis=1)


def to_float32(audio: np.ndarray) -> np.ndarray:
    """
    Ensure audio is float32 in [-1, 1] range.

    The input is cast to float32. If its absolute peak exceeds 1.0 the whole
    signal is divided by that peak, so the loudest sample lands exactly on
    +/-1. Note that this scales by the signal's own peak, not by the full
    range of the source dtype. Signals already within [-1, 1] are only cast.

    Args:
        audio: Audio array in any numeric format (may be empty)

    Returns:
        Audio array as float32 normalized to [-1, 1]
    """
    audio = audio.astype(np.float32)

    # np.max raises ValueError on a zero-size array, and there is nothing
    # to rescale in that case anyway.
    if audio.size == 0:
        return audio

    # Only rescale when the signal is outside the normalized range.
    max_val = np.max(np.abs(audio))
    if max_val > 1.0:
        audio = audio / max_val

    return audio


def normalize(audio: np.ndarray, peak: float = 0.95) -> np.ndarray:
    """
    Normalize audio so peak amplitude equals given value.

    Args:
        audio: Audio array (float or integer samples; may be empty)
        peak: Target peak amplitude (default 0.95 to avoid clipping)

    Returns:
        Normalized audio array. Empty or all-zero input is returned
        unchanged because there is no peak to scale against. Integer input
        yields a float64 result.
    """
    # np.max raises ValueError on a zero-size array.
    if audio.size == 0:
        return audio

    # np.abs on signed integers overflows at the most negative value
    # (e.g. int16 -32768 stays -32768), which would under-report the peak.
    # Measuring in float64 avoids that; integer division produced float64
    # anyway, so the result dtype is unchanged.
    if np.issubdtype(audio.dtype, np.integer):
        samples = audio.astype(np.float64)
    else:
        samples = audio

    max_val = np.max(np.abs(samples))
    if max_val > 0:
        return samples / max_val * peak
    return audio


def pad_or_trim(audio: np.ndarray, target_length: int) -> np.ndarray:
    """
    Pad with zeros or trim audio to target length.

    Length is measured along the first axis, so both mono (samples,) and
    multi-channel (samples, channels) arrays are supported.

    Args:
        audio: Audio array
        target_length: Desired length in samples (must be non-negative)

    Returns:
        Audio array with exactly target_length samples. The input itself is
        returned when it already has the requested length; trimming returns
        a slice of the input.

    Raises:
        ValueError: If target_length is negative.
    """
    if target_length < 0:
        # A negative slice bound would silently trim from the end instead.
        raise ValueError(f"target_length must be non-negative, got {target_length}")

    current_length = len(audio)

    if current_length == target_length:
        return audio
    if current_length > target_length:
        return audio[:target_length]

    # Match the trailing dimensions (e.g. channels) so concatenation along
    # the sample axis works for multi-channel audio too.
    pad_shape = (target_length - current_length,) + audio.shape[1:]
    padding = np.zeros(pad_shape, dtype=audio.dtype)
    return np.concatenate([audio, padding])


def encode_wav_to_bytes(audio: np.ndarray, sr: int) -> bytes:
    """
    Serialize an audio array into an in-memory WAV file.

    Args:
        audio: Audio array
        sr: Sample rate

    Returns:
        The complete WAV file contents as bytes
    """
    with io.BytesIO() as wav_buffer:
        sf.write(wav_buffer, audio, sr, format='WAV')
        # getvalue() yields the whole buffer regardless of stream position.
        return wav_buffer.getvalue()


def encode_flac_to_bytes(audio: np.ndarray, sr: int) -> bytes:
    """
    Encode numpy array to FLAC bytes.

    Floating-point input is treated as normalized audio in [-1, 1] and is
    quantized to 16-bit integers before encoding, so the result is lossless
    only relative to that 16-bit representation. Samples outside [-1, 1]
    are clipped. Non-float input is passed to the encoder unchanged.

    Args:
        audio: Audio array
        sr: Sample rate

    Returns:
        FLAC file as bytes
    """
    if np.issubdtype(audio.dtype, np.floating):
        # Clip before scaling: casting an out-of-range float to int16 wraps
        # around (e.g. 1.2 * 32767 becomes a large negative sample), which
        # is audible as harsh distortion rather than ordinary clipping.
        audio_int = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    else:
        audio_int = audio

    buf = io.BytesIO()
    sf.write(buf, audio_int, sr, format='FLAC')
    return buf.getvalue()