# Author: Sandra Sanchez
# Commit: eeadbca — Remove the extra Gradio Audio output, maintain only autoplaying HTML audio
# imports
import os
import base64
import gradio as gr
from openai import OpenAI
# Initialization
openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key:
    # Print only a short prefix so the full secret never reaches the logs.
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")

MODEL = "gpt-4o-mini"
# Client picks up OPENAI_API_KEY from the environment automatically.
openai = OpenAI()

# System prompt shared by every chat turn.
system_message = "You are a language tutor, and as such provide only with \
helpful tips and accurate translations. You are entertaining and polite. \
If you don't know something, you say so."
def talker(message):
    """Convert *message* to speech with OpenAI TTS and return it for display.

    Returns a (text, html) pair: the original message plus an autoplaying
    HTML <audio> tag, so no separate Gradio audio widget ("Press Play"
    button) is needed.
    """
    response = openai.audio.speech.create(
        model="tts-1",
        voice="nova",  # NOTE(review): tts-1 exposes no "vibe" parameter; voice is the only knob
        input=message
    )
    # Embed the MP3 bytes as a base64 data URI so the browser can play the
    # clip directly from the returned HTML, with no extra file round-trip.
    audio_base64 = base64.b64encode(response.content).decode()
    audio_html = f'<audio autoplay controls src="data:audio/mp3;base64,{audio_base64}"></audio>'
    return message, audio_html
# Transcription function
def transcribe_audio(audio_file):
    """Translate the speech in *audio_file* (a filepath) to English text.

    Opens the file in binary mode because the Whisper endpoint expects an
    open file object, not a path string. Returns the translated text.
    """
    with open(audio_file, "rb") as audio:
        translation = openai.audio.translations.create(
            model="whisper-1",
            file=audio  # Pass the opened file, not the filepath
        )
    print(translation.text)
    return translation.text
# Wrapper function to combine microphone input, transcription, and chat
def process_microphone_input(audio, history=None):
    """Transcribe microphone audio, then send the text through `chat`.

    Parameters:
        audio: filepath of the recorded clip, or None if nothing was captured.
        history: prior chat messages; defaults to an empty conversation.

    Raises ValueError when no audio was captured.
    """
    if audio is None:
        raise ValueError("No audio input detected. Please ensure the microphone is functioning correctly.")
    # BUGFIX: the original used a mutable default (history=[]), which is
    # shared across calls and would silently accumulate turns between
    # independent sessions. Create a fresh list per call instead.
    if history is None:
        history = []
    # Step 1: Transcribe the audio captured from the microphone
    transcribed_text = transcribe_audio(audio)
    # Step 2: Pass the transcription to the chat function
    response = chat(transcribed_text, history)
    return response
def chat(message, history):
    """Send *message* (with prior *history*) to the chat model and voice the reply.

    Builds the message list as system prompt + history + new user turn,
    then returns `talker(reply)`, i.e. a (text, html-audio) pair.
    """
    messages = [{"role": "system", "content": system_message}] + history + [{"role": "user", "content": message}]
    response = openai.chat.completions.create(model=MODEL, messages=messages)
    reply = response.choices[0].message.content
    # Debug traces of the conversation state sent to the model.
    print(f"History: {history}")
    print(f"Message: {message}")
    print(f"Messages: {messages}")
    return talker(reply)
# Gradio interface for microphone input: one audio input, two outputs
# (transcript text + autoplaying HTML audio — no separate Audio widget).
interface = gr.Interface(
    fn=process_microphone_input,
    inputs=[gr.Audio(sources="microphone", type="filepath")],  # Microphone as input
    outputs=["text", "html"],  # Keep text + autoplaying HTML audio
    title="Speech-to-Chatbot-to-Speech Language Tutor",
    description="Speak into the microphone to chat with GPT-4. Wait a couple of seconds before you submit your message."
)

if __name__ == "__main__":
    interface.launch()