"""Audio helpers: wrap audio in an in-memory WAV file and resample decoded audio."""

import io
import wave

import numpy as np
import torch
import torchaudio
from fastrtc.utils import audio_to_int16
from pydub import AudioSegment


def audio_to_bytes(audio_tuple, sample_rate=24000) -> io.BytesIO:
    """Encode a ``(sample_rate, samples)`` tuple as an in-memory WAV file.

    Args:
        audio_tuple: Two-element tuple ``(sr, audio_data)``. It is passed
            unchanged to ``fastrtc.utils.audio_to_int16`` to obtain the
            16-bit samples; ``sr`` is written as the WAV frame rate.
        sample_rate: Unused. The frame rate always comes from
            ``audio_tuple``; the parameter is kept only so existing callers
            that pass it keep working.

    Returns:
        An ``io.BytesIO`` rewound to position 0 that holds a mono, 16-bit
        WAV file and carries a ``name`` attribute of ``"audio.wav"``.
    """
    # Unpacking (rather than indexing) keeps the original's implicit check
    # that the tuple has exactly two elements.
    sr, _ = audio_tuple
    audio_int16 = audio_to_int16(audio_tuple)

    buffer = io.BytesIO()
    # ``wave`` does not close file objects it did not open itself, so
    # ``buffer`` is still usable after the ``with`` block.
    with wave.open(buffer, "wb") as wf:
        # NOTE(review): the header is always written as mono; this assumes
        # ``audio_to_int16`` yields single-channel data -- confirm with callers.
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample == int16
        wf.setframerate(sr)
        wf.writeframes(audio_int16.tobytes())

    buffer.seek(0)
    # Presumably consumed by an API that infers the format from a file name.
    buffer.name = "audio.wav"
    return buffer


def resample_audio(
    audio_buffer: io.BytesIO,
    audio_format: str = "mp3",
    target_sample_rate: int = 24000,
) -> np.ndarray:
    """Decode an encoded audio buffer and resample it to mono float32.

    Args:
        audio_buffer: Seekable buffer holding the encoded audio. It is
            rewound to the start before decoding.
        audio_format: Container format handed to
            ``AudioSegment.from_file``. Defaults to ``"mp3"``.
        target_sample_rate: Output sample rate in Hz. Defaults to 24000.

    Returns:
        One-dimensional ``numpy`` array of float32 samples at
        ``target_sample_rate``, scaled by the source's integer full-scale
        value (so 16-bit input maps into roughly ``[-1.0, 1.0)``).
    """
    audio_buffer.seek(0)
    audio_segment = AudioSegment.from_file(audio_buffer, format=audio_format)

    # Scale by the full-scale value of the decoded sample width instead of
    # assuming 16-bit; for 2-byte samples this is exactly 2 ** 15.
    full_scale = float(1 << (8 * audio_segment.sample_width - 1))
    samples = (
        np.array(audio_segment.get_array_of_samples()).astype(np.float32)
        / full_scale
    )

    # Samples are interleaved per frame; average across channels to get mono.
    # Handles any channel count, not only stereo.
    channels = audio_segment.channels
    if channels > 1:
        samples = samples.reshape((-1, channels)).mean(axis=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Resample operates on (..., time); add a leading channel dimension.
    audio_tensor = torch.from_numpy(samples).unsqueeze(0).to(device)

    resampler = torchaudio.transforms.Resample(
        orig_freq=audio_segment.frame_rate,
        new_freq=target_sample_rate,
    ).to(device)
    resampled_tensor = resampler(audio_tensor)

    # Drop the channel dimension and move back to host memory for numpy.
    return resampled_tensor.squeeze(0).cpu().numpy()