omniasr-transcriptions / server /inference /audio_reading_tools.py
jeanma's picture
Omnilingual ASR transcription demo
ae238b3 verified
import io
import numpy as np
import soundfile as sf
import torch
from numpy.typing import NDArray
def wav_to_bytes(
wav: torch.Tensor | NDArray, sample_rate: int = 16_000, format: str = "wav"
) -> NDArray[np.int8]:
"""Convert audio tensor to bytes using soundfile directly."""
# Convert to numpy if torch tensor
if isinstance(wav, torch.Tensor):
if wav.is_cuda:
wav = wav.cpu()
# Convert to float32 first (numpy doesn't support bfloat16)
if wav.dtype != torch.float32:
wav = wav.float()
wav = wav.numpy()
# Ensure float32 dtype for numpy arrays
if wav.dtype != np.float32:
wav = wav.astype(np.float32)
# Handle shape: soundfile expects (samples,) for mono or (samples, channels) for multi-channel
if wav.ndim == 1:
# Already correct shape for mono
pass
elif wav.ndim == 2:
# If shape is (channels, samples), transpose to (samples, channels)
if wav.shape[0] < wav.shape[1]:
wav = wav.T
# Create buffer and write using soundfile directly
buffer = io.BytesIO()
# Map format string to soundfile format
sf_format = format.upper() if format.lower() in ['wav', 'flac', 'ogg'] else 'WAV'
subtype = 'PCM_16' if sf_format == 'WAV' else None
# Write to buffer
sf.write(buffer, wav, sample_rate, format=sf_format, subtype=subtype)
buffer.seek(0)
return np.frombuffer(buffer.getvalue(), dtype=np.int8)