| |
| |
| |
|
|
| import gradio as gr |
| import openai |
| import whisper |
| import numpy as np |
| import os |
|
|
| |
# Load the Whisper "base" checkpoint once at import time so every request
# reuses the same in-memory model instead of reloading it per call.
model = whisper.load_model("base")
|
|
| |
# Read the API key from the environment instead of hard-coding a secret in
# source control; the original placeholder string is kept only as a
# last-resort fallback so existing behavior is unchanged when the variable
# is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your_openai_api_key")
|
|
| |
def speech_to_text(audio):
    """Transcribe the audio file at *audio* using the module-level Whisper model.

    Returns the transcription text produced by ``model.transcribe``.
    """
    transcription = model.transcribe(audio)
    return transcription["text"]
|
|
| |
def gpt_response(text):
    """Send *text* to OpenAI's gpt-3.5-turbo chat model and return its reply.

    Parameters:
        text: the user's utterance (plain string prompt).

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # BUG FIX: gpt-3.5-turbo is a chat model — the legacy Completion endpoint
    # (with `engine=`) does not serve it. Use the ChatCompletion endpoint with
    # a messages list, and read the reply from `message.content` rather than
    # the completion-style `text` field.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": text}],
        max_tokens=100,
    )
    return response.choices[0].message.content.strip()
|
|
| |
def text_to_speech_google(text):
    """Synthesize *text* with Google Cloud Text-to-Speech and save it to disk.

    Writes the MP3 response to ``output.mp3`` in the working directory and
    returns that path.
    """
    # Imported locally so the Google client is only required when TTS is used.
    from google.cloud import texttospeech

    tts_client = texttospeech.TextToSpeechClient()

    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice_params = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    )
    mp3_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
    )

    synthesis_result = tts_client.synthesize_speech(
        input=synthesis_input,
        voice=voice_params,
        audio_config=mp3_config,
    )

    # Persist the returned audio bytes; callers expect a playable file path.
    output_path = "output.mp3"
    with open(output_path, "wb") as out_file:
        out_file.write(synthesis_result.audio_content)
    return output_path
|
|
| |
def conversation_pipeline(audio):
    """Run one full voice-chat turn: transcribe, query GPT, synthesize speech.

    Parameters:
        audio: path to the recorded input audio file.

    Returns:
        A ``(response_text, response_audio_path)`` tuple for the Gradio UI.
    """
    transcript = speech_to_text(audio)
    reply_text = gpt_response(transcript)
    reply_audio_path = text_to_speech_google(reply_text)
    return reply_text, reply_audio_path
|
|
| |
# Wire the pipeline into a Gradio UI: microphone audio in, GPT's reply as
# both text and autoplayed audio out.
# NOTE(review): `gr.Audio(source=...)` was renamed to `sources=["microphone"]`
# in Gradio 4.x — confirm the pinned Gradio version accepts this keyword.
demo = gr.Interface(
    fn=conversation_pipeline,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[gr.Textbox(label="GPT Response"), gr.Audio(label="GPT Response Audio", type="filepath", autoplay=True)]
)

# show_error=True surfaces server-side exceptions in the web UI instead of
# failing silently.
demo.launch(show_error=True)