Spaces: Running on A100
| import io | |
| import numpy as np | |
| import soundfile as sf | |
| import torch | |
| from numpy.typing import NDArray | |
def wav_to_bytes(
    wav: torch.Tensor | NDArray, sample_rate: int = 16_000, format: str = "wav"
) -> NDArray[np.int8]:
    """Encode an audio signal with soundfile and return the raw file bytes.

    Args:
        wav: Audio samples as a torch tensor or numpy array. Either 1-D
            (mono) or 2-D; a 2-D input whose first axis is shorter than its
            second is assumed to be (channels, samples) and is transposed to
            the (samples, channels) layout soundfile expects.
        sample_rate: Sample rate in Hz written into the container header.
        format: Target container: "wav", "flac", or "ogg" (case-insensitive).
            Any other value silently falls back to "wav".

    Returns:
        The encoded file contents as a numpy int8 array, one element per byte.
    """
    # torch -> numpy: move off GPU first, and widen non-float32 dtypes
    # (e.g. bfloat16, which numpy cannot represent) before .numpy().
    if isinstance(wav, torch.Tensor):
        if wav.is_cuda:
            wav = wav.cpu()
        if wav.dtype != torch.float32:
            wav = wav.float()
        wav = wav.numpy()

    # Normalize plain numpy inputs to float32 as well.
    if wav.dtype != np.float32:
        wav = wav.astype(np.float32)

    # soundfile wants (samples,) for mono or (samples, channels) otherwise.
    # Heuristic: the channel count is assumed smaller than the sample count,
    # so a 2-D array with a shorter first axis is treated as
    # (channels, samples) and flipped. NOTE(review): this misfires for clips
    # shorter than their channel count — acceptable for real audio lengths.
    if wav.ndim == 2 and wav.shape[0] < wav.shape[1]:
        wav = wav.T

    # Map the requested format onto soundfile's uppercase identifiers,
    # defaulting to WAV for anything unrecognized.
    sf_format = format.upper() if format.lower() in ("wav", "flac", "ogg") else "WAV"
    # Force 16-bit PCM for WAV; let soundfile pick its default subtype for
    # FLAC/OGG (passing subtype=None defers to the format's default).
    subtype = "PCM_16" if sf_format == "WAV" else None

    buffer = io.BytesIO()
    sf.write(buffer, wav, sample_rate, format=sf_format, subtype=subtype)
    # getvalue() returns the full buffer regardless of the stream position,
    # so no seek(0) is required before reading it back out.
    return np.frombuffer(buffer.getvalue(), dtype=np.int8)