audio2audio / app.py
import os
import warnings
import whisper
import gradio as gr
import openai
from gtts import gTTS
# Load the Whisper model once at import time so it is not reloaded on every request
model = whisper.load_model("base")
device = model.device
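# Assumption for illustration: "base" can be swapped for another openai-whisper
# checkpoint ("tiny", "small", "medium", "large") to trade speed for accuracy.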
# Keep the OpenAI call in its own function (uses the legacy pre-1.0 openai SDK)
def call_openai_api(text):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    result = openai.Completion.create(
        model="gpt-3.5-turbo-instruct",
        prompt=text,
        max_tokens=500,
        temperature=0
    )
    return result["choices"][0]["text"]
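
# For reference only: a rough equivalent with the openai>=1.0 client (an
# assumption, not what this Space uses) might look like:
#
#   from openai import OpenAI
#   client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
#   completion = client.completions.create(
#       model="gpt-3.5-turbo-instruct",
#       prompt=text,
#       max_tokens=500,
#       temperature=0,
#   )
#   return completion.choices[0].text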

def transcribe(audio):
    # Load the recording and pad/trim it to the 30-second window Whisper expects
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(device)

    # Detect the spoken language (the result is discarded; decode() picks the
    # language on its own), suppressing any warnings this raises
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        _, _ = model.detect_language(mel)

    # Decode the audio in FP32 so it also works on CPU
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    result_text = result.text

    # Ask the OpenAI completion endpoint for a reply to the transcript
    out_result = call_openai_api(result_text)

    # Synthesize the reply as speech with gTTS
    speech = gTTS(out_result)
    speech.save("test.mp3")

    return [result_text, out_result, "test.mp3"]
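
# Quick local sanity check (hypothetical file path; assumes OPENAI_API_KEY is set):
#   text, reply, mp3_path = transcribe("sample.wav")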
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Audio(type="filepath", autoplay=True)
demo = gr.Interface(
    title="Voice to Text & Voice reply using OpenAI (KF)",
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[output_1, output_2, output_3]
)
if __name__ == "__main__":
    demo.launch()