File size: 1,704 Bytes
35bb6f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from __future__ import annotations

import io

import numpy as np
import soundfile as sf


def validate_wav(data: bytes) -> dict:
    """Probe in-memory audio bytes and report their basic properties.

    The bytes are wrapped in an in-memory stream and handed to
    ``soundfile.info``. Nothing here checks that the container is actually
    WAV; the ``format`` entry is simply whatever ``soundfile.info`` reports.

    Args:
        data: Raw contents of the audio file.

    Returns:
        A dict with the keys ``sample_rate``, ``channels``, ``duration``,
        ``frames`` and ``format``, copied from the ``soundfile.info`` result.

    Raises:
        ValueError: If ``soundfile.info`` raises for any reason; the
            original exception is chained as the cause.
    """
    stream = io.BytesIO(data)
    try:
        meta = sf.info(stream)
    except Exception as exc:
        raise ValueError(f"Invalid WAV file: {exc}") from exc

    # Public result key -> attribute name on the soundfile info object.
    key_to_attr = (
        ("sample_rate", "samplerate"),
        ("channels", "channels"),
        ("duration", "duration"),
        ("frames", "frames"),
        ("format", "format"),
    )
    return {key: getattr(meta, attr) for key, attr in key_to_attr}


def validate_reference_audio(data: bytes) -> dict:
    """Validate reference audio for voice cloning.

    Requirements (these are the limits enforced by the checks below and
    reported in the error messages):
    - Mono channel
    - 8-48 kHz sample rate (bounds inclusive)
    - 1-30 seconds duration (bounds inclusive)

    Args:
        data: Raw bytes of the audio file.

    Returns:
        The property dict produced by ``validate_wav`` (keys
        ``sample_rate``, ``channels``, ``duration``, ``frames``, ``format``),
        returned unchanged when every requirement is met.

    Raises:
        ValueError: If ``validate_wav`` rejects the bytes, or if the audio
            is not mono, has a sample rate outside 8000-48000 Hz, is shorter
            than 1 second, or is longer than 30 seconds.
    """
    props = validate_wav(data)

    if props["channels"] != 1:
        raise ValueError(
            f"Reference audio must be mono (1 channel), got {props['channels']} channels"
        )

    if not (8000 <= props["sample_rate"] <= 48000):
        raise ValueError(
            f"Reference audio sample rate must be 8-48 kHz, got {props['sample_rate']} Hz"
        )

    if props["duration"] < 1.0:
        raise ValueError(
            f"Reference audio too short ({props['duration']:.1f}s), minimum 1 second"
        )

    if props["duration"] > 30.0:
        raise ValueError(
            f"Reference audio too long ({props['duration']:.1f}s), maximum 30 seconds"
        )

    return props


def pcm_to_wav_bytes(pcm_data: np.ndarray, sample_rate: int = 24000) -> bytes:
    """Encode a NumPy sample array as an in-memory 16-bit PCM WAV file.

    Args:
        pcm_data: Audio samples to encode. Callers are expected to pass
            float32 data; the dtype is not checked here.
        sample_rate: Sample rate passed to ``soundfile.write``; defaults
            to 24000.

    Returns:
        The full contents of the written WAV stream as bytes.
    """
    wav_stream = io.BytesIO()
    sf.write(wav_stream, pcm_data, sample_rate, format="WAV", subtype="PCM_16")
    # getvalue() returns the whole buffer regardless of the stream position.
    return wav_stream.getvalue()