rag-voicechat-demo / whisper_support.py
willsh1997's picture
:clown_face: MVP working turn-taking RAG
1a5ba3c
raw
history blame contribute delete
885 Bytes
from faster_whisper import WhisperModel
import numpy as np
import scipy.signal
import soundfile as sf
# Name of the Whisper checkpoint handed to faster-whisper.
model_size = "base.en"

# Built once at import time so every transcribe() call reuses the same model
# instance; configured for CPU execution with float32 compute.
model = WhisperModel(
    model_size,
    device="cpu",
    compute_type="float32",
)
def process_audio(audio_file):
    """Convert a ``(sample_rate, samples)`` pair into 16 kHz mono audio.

    Args:
        audio_file: Two-item sequence ``(sample_rate, audio_data)``.
            ``audio_data`` is array-like, either 1-D (mono) or 2-D with the
            channels along axis 1. Samples are scaled by 1/32768, so they
            are presumably 16-bit PCM -- TODO confirm against the caller.

    Returns:
        A 1-D NumPy array of samples resampled to 16 kHz. An input with no
        samples yields an empty float32 array.
    """
    target_rate = 16000

    sample_rate, audio_data = audio_file
    audio_data = np.asarray(audio_data)

    # Debug output: incoming sample rate and peak sample value.
    print(sample_rate)
    if audio_data.size == 0:
        # Nothing to scale or resample; the builtin max() and the resampler
        # both choke on empty input, so short-circuit here.
        return np.zeros(0, dtype=np.float32)
    # ndarray.max() works for any dimensionality; the builtin max() raises
    # on 2-D input because it tries to compare whole rows.
    print(audio_data.max())

    if audio_data.ndim > 1:
        # Collapse the channel axis by averaging. This also flattens a
        # single-channel (N, 1) array so the result is always 1-D.
        audio_data = audio_data.mean(axis=1)

    # Normalise audio data to floating point.
    np_audio_float32 = audio_data.astype(np.float32) / 32768.0

    # Resample so the output length corresponds to target_rate.
    num_samples = int(len(np_audio_float32) * target_rate / sample_rate)
    return scipy.signal.resample(np_audio_float32, num_samples)
def transcribe(audio):
    """Return the transcript of ``audio`` as a single string.

    ``audio`` is passed through ``process_audio`` and the result is handed
    to the module-level ``model`` with a beam size of 5 and the language
    fixed to English. The text of every returned segment is concatenated
    without a separator.
    """
    waveform = process_audio(audio)
    segments, _info = model.transcribe(waveform, beam_size=5, language='en')

    pieces = []
    for segment in segments:
        pieces.append(segment.text)
    return "".join(pieces)