Spaces:
Runtime error
Runtime error
File size: 4,684 Bytes
# Purpose: Audio load/resample/sliding-chunk utilities for HF Spaces
# Dependencies: soundfile, torch, numpy
"""HF Space only — no dependency on demo/ or vendor/.
- load_audio: soundfile first; on failure, fall back to an ffmpeg WAV conversion
- sliding_chunks: same rules as production infer.py::_sliding_chunks
  · stride=CHUNK_SAMPLES (4s)
  · a tail chunk is kept only when actual_ratio >= 0.5
  · at least 1 chunk is guaranteed (short tracks are padded too)
"""
from __future__ import annotations
import subprocess
import tempfile
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from config import SR, MAX_DURATION_SEC, CHUNK_SAMPLES
# File extensions that load_audio always routes through ffmpeg
# (_ffmpeg_to_wav) before handing the converted WAV to soundfile.
# NOTE(review): presumably formats soundfile cannot decode directly — confirm.
_NEEDS_FFMPEG = {".m4a", ".aac", ".wma", ".opus", ".mp4", ".webm"}
def _ffmpeg_to_wav(path: str) -> str | None:
tmp = tempfile.mktemp(suffix=".wav")
try:
r = subprocess.run(
["ffmpeg", "-hide_banner", "-loglevel", "error",
"-i", str(path), "-f", "wav", "-acodec", "pcm_f32le",
"-ac", "2", "-ar", str(SR), "-t", str(MAX_DURATION_SEC),
"-y", tmp],
capture_output=True, timeout=30,
)
return tmp if r.returncode == 0 else None
except Exception:
return None
def load_audio(path: str) -> tuple[np.ndarray, bool]:
    """Load an audio file as float32 samples at ``SR``.

    Files whose extension is listed in ``_NEEDS_FFMPEG`` are converted to WAV
    with ffmpeg before being read. Every other file is read with soundfile
    directly; if that read fails, an ffmpeg conversion is attempted as a
    fallback. Any temporary WAV created along the way is deleted before
    returning.

    Args:
        path: Audio file to load.

    Returns:
        ``(audio, is_stereo)``: ``audio`` has shape ``[samples, channels]``
        and dtype float32, truncated to at most ``MAX_DURATION_SEC * SR``
        samples; ``is_stereo`` is True when there are at least two channels.

    Raises:
        RuntimeError: If a file in ``_NEEDS_FFMPEG`` cannot be converted.
            When the direct read fails and the ffmpeg fallback fails too,
            the original soundfile error is re-raised unchanged.
    """
    source = str(path)
    ext = Path(source).suffix.lower()
    read_kwargs = {"dtype": "float32", "always_2d": True}

    converted = None
    if ext in _NEEDS_FFMPEG:
        converted = _ffmpeg_to_wav(source)
        if converted is None:
            raise RuntimeError(f"Failed to convert {ext} via ffmpeg")
    try:
        try:
            audio, sr = sf.read(converted or source, **read_kwargs)
        except RuntimeError:
            # soundfile signals unreadable files with RuntimeError (or a
            # subclass of it). Only fall back when ffmpeg has not already
            # been used for this file.
            if converted is not None:
                raise
            converted = _ffmpeg_to_wav(source)
            if converted is None:
                # Fallback unavailable: surface the original read error.
                raise
            audio, sr = sf.read(converted, **read_kwargs)

        if sr != SR:
            try:
                import torchaudio
                channels_first = torch.from_numpy(audio.T)
                resampler = torchaudio.transforms.Resample(sr, SR)
                audio = resampler(channels_first).T.numpy()
            except Exception:
                # scipy polyphase fallback so that a missing or failing
                # torchaudio does not abort loading.
                from scipy.signal import resample_poly
                audio = np.stack(
                    [
                        resample_poly(audio[:, ch], SR, sr)
                        for ch in range(audio.shape[1])
                    ],
                    axis=1,
                ).astype(np.float32)

        # int() keeps the slice bound valid even if MAX_DURATION_SEC is a
        # float in config.
        max_samples = int(MAX_DURATION_SEC * SR)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
        is_stereo = audio.shape[1] >= 2
        return audio.astype(np.float32), is_stereo
    finally:
        if converted:
            Path(converted).unlink(missing_ok=True)
def load_audio_mono_tensor(path: str) -> tuple[torch.Tensor, np.ndarray, bool]:
    """Load ``path`` and also return a mono tensor view of it.

    Returns:
        ``(mono, audio, is_stereo)``: ``audio`` and ``is_stereo`` are exactly
        what ``load_audio`` returned; ``mono`` is a torch tensor holding the
        average of the first two channels when the clip is stereo, otherwise
        the first channel as-is.
    """
    audio, is_stereo = load_audio(path)
    first_channel = audio[:, 0]
    has_second_channel = is_stereo and audio.shape[1] >= 2
    # Only channels 0 and 1 take part in the downmix.
    mono = (first_channel + audio[:, 1]) / 2.0 if has_second_channel else first_channel
    return torch.from_numpy(mono), audio, is_stereo
def sliding_chunks(wav: torch.Tensor, chunk_size: int = CHUNK_SAMPLES,
                   min_actual_ratio: float = 0.5) -> list[tuple[torch.Tensor, dict]]:
    """Cut ``wav`` into consecutive, non-overlapping chunks of ``chunk_size``.

    The stride equals ``chunk_size``. A chunk shorter than ``chunk_size`` is
    zero-padded on the right, and is dropped when it holds less than
    ``min_actual_ratio`` of a full chunk.

    Args:
        wav: Signal to split; sliced along dim 0 and padded along its last
            dim (assumes a 1-D mono tensor — TODO confirm with callers).
        chunk_size: Number of samples per chunk.
        min_actual_ratio: Minimum fraction of real (unpadded) samples a chunk
            needs in order to be kept.

    Returns:
        A list of ``(chunk_tensor, metadata)`` pairs, where ``metadata`` has
        the keys ``start_sample``, ``actual_samples``, ``actual_ratio`` and
        ``rms`` (computed on the chunk after padding). The list is empty when
        ``wav`` is shorter than ``chunk_size // 2``; the caller is expected
        to handle that "too short" case.
    """
    total = wav.shape[0]
    result: list[tuple[torch.Tensor, dict]] = []

    def _emit(piece: torch.Tensor, offset: int, real_len: int) -> None:
        # Record one chunk together with its metadata.
        result.append((piece, {
            "start_sample": int(offset),
            "actual_samples": int(real_len),
            "actual_ratio": float(real_len / chunk_size),
            "rms": float(torch.sqrt(torch.mean(piece ** 2))),
        }))

    if total >= chunk_size // 2:
        for offset in range(0, total, chunk_size):
            piece = wav[offset:offset + chunk_size]
            real_len = piece.shape[0]
            if real_len / chunk_size < min_actual_ratio:
                continue
            if real_len < chunk_size:
                piece = F.pad(piece, (0, chunk_size - real_len))
            _emit(piece, offset, real_len)

        if not result:
            # Every candidate was filtered out: still return one chunk built
            # from the head of the signal, padded up to chunk_size.
            head = wav[:chunk_size]
            head = F.pad(head, (0, chunk_size - head.shape[0]))
            _emit(head, 0, total)

    return result
def get_audio_info(audio: np.ndarray, is_stereo: bool) -> dict:
    """Summarise a loaded clip for display.

    Args:
        audio: Sample array; its length along the first axis is taken as the
            number of samples.
        is_stereo: Whether the clip should be reported as stereo.

    Returns:
        A dict with ``duration`` (sample count divided by ``SR``), ``sr``,
        ``channels`` (``"Stereo"`` or ``"Mono"``) and ``samples``.
    """
    n_samples = len(audio)
    layout = "Stereo" if is_stereo else "Mono"
    return {
        "duration": n_samples / SR,
        "sr": SR,
        "channels": layout,
        "samples": n_samples,
    }
|