Spaces:
Running
Running
Extract the numpy.ndarray from the tuple returned by gr.Audio before passing it to the Whisper pipeline, and add a tuple check in transcribe
Browse files
app.py
CHANGED
|
@@ -2,6 +2,7 @@ import gradio as gr
|
|
| 2 |
from transformers import pipeline
|
| 3 |
from gtts import gTTS
|
| 4 |
import os
|
|
|
|
| 5 |
|
| 6 |
# Initialize Whisper for speech-to-text
|
| 7 |
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
|
|
@@ -10,35 +11,73 @@ whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
|
|
| 10 |
knowledge_base = {
|
| 11 |
"what cars are available": "We have Toyota Camry, Honda Civic, and Ford Mustang.",
|
| 12 |
"price of camry": "The Toyota Camry starts at $25,000."
|
|
|
|
| 13 |
}
|
| 14 |
|
| 15 |
def transcribe(audio):
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def text_to_speech(text):
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def answer_question(text):
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def process_audio(audio):
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# Gradio interface
|
| 36 |
with gr.Blocks() as demo:
|
| 37 |
gr.Markdown("# AI Support Agent: Car Dealership")
|
| 38 |
-
audio_input = gr.Audio(label="Speak to the Agent")
|
| 39 |
text_output = gr.Textbox(label="Agent Response")
|
| 40 |
audio_output = gr.Audio(label="Listen to Response")
|
| 41 |
btn = gr.Button("Submit")
|
| 42 |
-
btn.click(fn=process_audio, inputs=audio_input, outputs=[text_output, audio_output])
|
| 43 |
-
|
| 44 |
-
demo.launch()
|
|
|
|
| 2 |
from transformers import pipeline
|
| 3 |
from gtts import gTTS
|
| 4 |
import os
|
| 5 |
+
import numpy as np
|
| 6 |
|
| 7 |
# Speech-to-text: build the Hugging Face ASR pipeline once, at import time,
# using the "openai/whisper-tiny" checkpoint.
whisper = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
)
|
|
|
|
| 11 |
# Canned answers for the dealership agent, keyed by the phrase to look for.
# answer_question() searches for each key inside the lower-cased
# transcription, so keys are written in lower case.
knowledge_base = {
    "what cars are available": "We have Toyota Camry, Honda Civic, and Ford Mustang.",
    "price of camry": "The Toyota Camry starts at $25,000.",
    "price of tesla": "The Tesla starts at $60,000.",
}
|
| 16 |
|
| 17 |
def transcribe(audio):
    """Transcribe audio coming from ``gr.Audio`` into text with Whisper.

    Args:
        audio: Either the ``(sample_rate, samples)`` tuple that ``gr.Audio``
            produces in its numpy mode, or any other input the ASR pipeline
            accepts directly; non-tuple values are passed through unchanged.

    Returns:
        The transcribed text (the ``"text"`` field of the pipeline result).

    Raises:
        ValueError: If ``audio`` is ``None`` (nothing was recorded/uploaded).
        Exception: Any error raised while preparing or transcribing the audio
            is printed with its traceback and re-raised.
    """
    if audio is None:
        raise ValueError("No audio provided; record or upload a clip first.")

    print(f"Transcribing audio: {type(audio)}")
    try:
        if isinstance(audio, tuple):
            # gr.Audio's numpy mode yields (sample_rate, samples): the rate
            # comes FIRST, the waveform second.
            sample_rate, samples = audio
            samples = np.asarray(samples)
            if samples.ndim > 1:
                # Down-mix to mono; assumes the (n_samples, n_channels)
                # layout -- TODO confirm against the Gradio version in use.
                samples = samples.mean(axis=1)
            # Whisper expects a float waveform; convert before taking abs()
            # so integer PCM cannot overflow, then peak-normalise.
            samples = samples.astype(np.float32)
            peak = float(np.max(np.abs(samples))) if samples.size else 0.0
            if peak > 0.0:
                samples = samples / peak
            # Passing the sampling rate lets the pipeline resample to the
            # rate the model was trained on.
            audio_data = {"sampling_rate": int(sample_rate), "raw": samples}
        else:
            audio_data = audio
        result = whisper(audio_data)["text"]
        print(f"Transcription result: {result}")
        return result
    except Exception as e:
        print(f"Error in transcribe: {str(e)}")
        import traceback
        traceback.print_exc()
        raise
|
| 33 |
|
| 34 |
def text_to_speech(text):
    """Synthesize ``text`` as English speech and return the MP3 path.

    The speech is generated with gTTS and saved to the fixed relative path
    ``response.mp3``, which is also the return value. Any failure is printed
    together with its traceback and then re-raised unchanged.
    """
    import traceback

    output_path = "response.mp3"
    print(f"Generating speech for text: {text}")
    try:
        speech = gTTS(text, lang="en")
        speech.save(output_path)
        print("Speech saved to response.mp3")
        return output_path
    except Exception as exc:
        print(f"Error in text_to_speech: {exc}")
        traceback.print_exc()
        raise
|
| 46 |
|
| 47 |
def answer_question(text):
    """Return the canned answer whose key phrase occurs in ``text``.

    Matching is a case-insensitive substring test. Keys are tried in the
    dictionary's insertion order and the first one found in ``text`` wins.

    Args:
        text: The transcribed user question.

    Returns:
        The matching ``knowledge_base`` answer, or a fixed fallback message
        when no key phrase occurs in ``text``.

    Raises:
        Exception: Any error (e.g. ``text`` is not a string) is printed with
            its traceback and re-raised.
    """
    print(f"Answering question: {text}")
    try:
        # Lower-case the question once rather than once per key.
        question = text.lower()
        for key, answer in knowledge_base.items():
            # Lower-case the key as well so a mixed-case key can still match.
            if key.lower() in question:
                print(f"Found match for key: {key}")
                return answer
        print("No match found in knowledge base")
        return "Sorry, I can help with car availability and prices. Try again!"
    except Exception as e:
        print(f"Error in answer_question: {str(e)}")
        import traceback
        traceback.print_exc()
        raise
|
| 61 |
|
| 62 |
def process_audio(audio):
    """Run the voice pipeline: transcribe, answer, then speak the answer.

    Args:
        audio: The value delivered by the audio input component; it is
            handed to ``transcribe`` as-is.

    Returns:
        A ``(response_text, response_audio_path)`` pair.

    Raises:
        Exception: Any failure in a pipeline stage is printed with its
            traceback and re-raised.
    """
    import traceback

    print(f"Processing audio: {type(audio)}")
    try:
        question = transcribe(audio)
        reply = answer_question(question)
        reply_audio = text_to_speech(reply)
        print(f"Process complete. Response: {reply}, Audio: {reply_audio}")
        return reply, reply_audio
    except Exception as exc:
        print(f"Error in process_audio: {exc}")
        traceback.print_exc()
        raise
|
| 75 |
|
| 76 |
# Gradio interface: one audio input, a text and an audio output, and a
# button that feeds the recording through process_audio.
with gr.Blocks() as demo:
    gr.Markdown("# AI Support Agent: Car Dealership")

    audio_input = gr.Audio(label="Speak to the Agent")
    text_output = gr.Textbox(label="Agent Response")
    audio_output = gr.Audio(label="Listen to Response")
    btn = gr.Button("Submit")

    # process_audio returns (text, audio path), matching the two outputs.
    btn.click(
        fn=process_audio,
        inputs=audio_input,
        outputs=[text_output, audio_output],
    )
# NOTE(review): no demo.launch() call is visible after this block, although
# the previous revision had one -- confirm the app is still launched.
|
|
|
|
|
|