|
|
|
|
|
import os |
|
|
from io import BytesIO |
|
|
import gradio as gr |
|
|
from gtts import gTTS |
|
|
from pydub import AudioSegment |
|
|
import whisper |
|
|
import openai |
|
|
|
|
|
|
|
|
# --- Configuration -----------------------------------------------------------
# SECURITY: a previous revision hard-coded a live credential here. Any key that
# was committed must be considered compromised and rotated. Supply the key via
# the OPENAI_API_KEY environment variable instead of embedding it in source.
# NOTE(review): the removed key had a "gsk_" (Groq-style) prefix — confirm
# which provider / base URL this script is actually meant to call.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")


# Load the Whisper speech-to-text model once at import time so every request
# reuses the same instance; "base" trades accuracy for lower memory/latency.
whisper_model = whisper.load_model("base")
|
|
|
|
|
|
|
|
def voice_to_voice(audio):
    """Transcribe spoken input, get a chat reply, and return it as speech.

    Parameters
    ----------
    audio : str
        Path to the recorded clip (the Gradio input component uses
        ``type="filepath"``, so a filesystem path is passed in).

    Returns
    -------
    tuple[str, BytesIO]
        The chatbot's text reply and an in-memory MP3 stream of that
        reply spoken aloud (seeked back to position 0).
    """
    # Speech -> text. fp16=False forces float32 so CPU-only hosts don't
    # emit the half-precision warning.
    transcription_result = whisper_model.transcribe(audio, fp16=False)
    user_input = transcription_result["text"]

    # Text -> chat reply (legacy openai<1.0 ChatCompletion interface,
    # consistent with the module-level `openai.api_key` usage).
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": user_input}],
    )
    response_text = response.choices[0].message["content"]

    # Text -> speech. gTTS already emits MP3 bytes, so write them straight
    # into an in-memory buffer. The previous pydub decode/re-encode pass
    # (MP3 -> AudioSegment -> MP3) was a lossy no-op and has been removed.
    audio_fp = BytesIO()
    gTTS(text=response_text, lang="en").write_to_fp(audio_fp)
    audio_fp.seek(0)

    return response_text, audio_fp
|
|
|
|
|
|
|
|
# Gradio UI wiring. The first value returned by voice_to_voice is the
# chatbot's *reply* text (not the raw transcription), so the textbox is
# labeled accordingly — the previous "Transcription" label misdescribed it.
# NOTE(review): gr.Audio as an output normally expects a filepath or a
# (sample_rate, numpy array) tuple; confirm this Gradio version accepts the
# BytesIO that voice_to_voice returns.
iface = gr.Interface(
    fn=voice_to_voice,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
    live=True,
    title="Real-Time Voice-to-Voice Chatbot",
    description="Speak into the microphone and get a spoken response from the chatbot.",
)
|
|
|
|
|
|
|
|
# Script entry point: start the Gradio web server only when this file is run
# directly, so importing the module elsewhere does not launch the app.
if __name__ == "__main__":
    iface.launch()
|
|
|