"""Voice-to-voice chatbot: Whisper transcription -> GPT reply -> gTTS speech.

The third-party packages imported below (gradio, gTTS, pydub, openai-whisper,
openai) must be installed before running this script. The OpenAI API key is
read from the ``OPENAI_API_KEY`` environment variable.
"""
import os
import tempfile
from io import BytesIO
from typing import Optional, Tuple

import gradio as gr
import openai
import whisper
from gtts import gTTS
from pydub import AudioSegment

# Set up OpenAI API key.
# SECURITY: the key must be supplied through the environment, never committed
# to source control. A key was previously hard-coded here; treat it as
# compromised and rotate it.
# NOTE(review): the removed key carried a ``gsk_`` prefix, which does not look
# like an OpenAI key -- confirm which provider/endpoint this script should use.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Initialize models (loaded once at import time so every request reuses it).
whisper_model = whisper.load_model("base")  # Load Whisper model


def _synthesize_speech(text: str) -> str:
    """Render *text* to an MP3 file and return the path of that file.

    The speech is produced with gTTS into memory, re-encoded through pydub
    (as the original workflow did), and written to a named temporary file.
    The file is intentionally not deleted here: the caller hands the path to
    Gradio, which reads it after this function has returned.
    """
    raw_mp3 = BytesIO()
    gTTS(text=text, lang="en").write_to_fp(raw_mp3)
    raw_mp3.seek(0)

    # Convert gTTS output to a playable format using pydub.
    audio_segment = AudioSegment.from_file(raw_mp3, format="mp3")

    # Export through an open handle inside ``with`` so the file is flushed
    # and closed before its path is returned.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_segment.export(tmp, format="mp3")
    return tmp.name


# Define the voice-to-voice workflow
def voice_to_voice(audio: Optional[str]) -> Tuple[str, Optional[str]]:
    """Transcribe recorded speech, ask the LLM, and speak the reply.

    Args:
        audio: Path to the recorded audio file (the interface uses
            ``gr.Audio(type="filepath")``). May be ``None``/empty when the
            callback fires without a recording.

    Returns:
        A ``(response_text, audio_path)`` tuple. ``audio_path`` is the path of
        an MP3 file containing the spoken reply, or ``None`` when there was
        nothing to transcribe or nothing to speak.
    """
    # With live=True the callback can fire without any recording.
    if not audio:
        return "", None

    # 1. Transcribe audio using Whisper
    transcription_result = whisper_model.transcribe(audio, fp16=False)
    user_input = transcription_result["text"]
    if not user_input.strip():
        # Silence / unintelligible audio: skip the LLM call entirely.
        return "", None

    # 2. Get response from OpenAI's GPT
    # NOTE(review): ``openai.ChatCompletion`` is the legacy (pre-1.0) client
    # interface -- confirm the pinned ``openai`` version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": user_input}],
    )
    response_text = response.choices[0].message["content"]
    if not response_text or not response_text.strip():
        # gTTS cannot synthesize empty text; return the text without audio.
        return response_text or "", None

    # 3. Convert LLM response to audio and hand Gradio a file path, which
    #    gr.Audio accepts as an output value (a BytesIO object is not).
    return response_text, _synthesize_speech(response_text)


# Gradio interface
# NOTE(review): the textbox labelled "Transcription" actually displays the
# chatbot's reply, not the user's transcribed speech -- consider relabelling.
iface = gr.Interface(
    fn=voice_to_voice,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Transcription"), gr.Audio(label="Response Audio")],
    live=True,
    title="Real-Time Voice-to-Voice Chatbot",
    description="Speak into the microphone and get a spoken response from the chatbot.",
)

# Launch Gradio app
if __name__ == "__main__":
    iface.launch()