cs-ai-sakura-dev / src /utils /audio_helper.py
lifedebugger's picture
Deploy files from GitHub repository
10ab06b
import io
from fastrtc.utils import audio_to_int16
from pydub import AudioSegment
import wave
import torch
import torchaudio
import numpy as np
def audio_to_bytes(audio_tuple, sample_rate=24000) -> io.BytesIO:
    """Encode an audio tuple as an in-memory 16-bit WAV file.

    Args:
        audio_tuple: A two-item ``(sample_rate, samples)`` pair. The whole
            tuple is handed to fastrtc's ``audio_to_int16`` to obtain the
            int16 samples that are written out.
        sample_rate: Unused; kept only so existing callers keep working.
            The rate written to the WAV header always comes from
            ``audio_tuple``.

    Returns:
        An ``io.BytesIO`` rewound to offset 0 that holds the WAV data and
        carries a ``name`` attribute of ``"audio.wav"``.
    """
    # Unpack (rather than index) so a malformed tuple still fails early.
    sr, _ = audio_tuple
    audio_int16 = audio_to_int16(audio_tuple)

    buffer = io.BytesIO()
    # wave only closes files it opened itself, so `buffer` stays usable
    # after the with-block finalises the WAV header.
    with wave.open(buffer, "wb") as wf:
        # NOTE(review): the header is fixed to one channel; this assumes the
        # incoming samples are mono -- confirm against callers.
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample == int16
        wf.setframerate(sr)
        wf.writeframes(audio_int16.tobytes())

    buffer.seek(0)
    # Presumably lets consumers infer the file type from the name, as if it
    # were a real file on disk -- verify against the caller.
    buffer.name = "audio.wav"
    return buffer
def resample_audio(
    audio_buffer: io.BytesIO,
    target_sample_rate: int = 24000,
    audio_format: str = "mp3",
) -> np.ndarray:
    """Decode an encoded audio buffer and resample it to a mono waveform.

    Args:
        audio_buffer: In-memory encoded audio. It is rewound to offset 0
            before decoding.
        target_sample_rate: Output sample rate in Hz. Defaults to 24000.
        audio_format: Container/codec name passed to
            ``AudioSegment.from_file``. Defaults to ``"mp3"``.

    Returns:
        A 1-D ``float32`` NumPy array of the resampled mono signal, scaled
        by the decoded segment's full-scale amplitude.
    """
    audio_buffer.seek(0)
    audio_segment = AudioSegment.from_file(audio_buffer, format=audio_format)

    # Scale by the segment's own full-scale value instead of assuming 16-bit
    # samples; for 16-bit audio this is exactly 2 ** 15.
    samples = (
        np.array(audio_segment.get_array_of_samples()).astype(np.float32)
        / audio_segment.max_possible_amplitude
    )

    # Samples are interleaved per frame; average across however many
    # channels there are (not just stereo) to obtain a mono signal.
    channels = audio_segment.channels
    if channels > 1:
        samples = samples.reshape((-1, channels)).mean(axis=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The tensor gets a leading axis of size 1 for the resampler; it is
    # squeezed away again before returning.
    audio_tensor = torch.from_numpy(samples).unsqueeze(0).to(device)

    resampler = torchaudio.transforms.Resample(
        orig_freq=audio_segment.frame_rate,
        new_freq=target_sample_rate,
    ).to(device)
    resampled_tensor = resampler(audio_tensor)

    # Move back to the CPU before converting; NumPy cannot view CUDA memory.
    return resampled_tensor.squeeze(0).cpu().numpy()