Spaces:

develops20
/

VoiceSupportAgent

Sleeping

App Files Files Community

develops20 committed on Jun 9, 2025

Commit

6d5604d

verified ·

1 Parent(s): 0d22192

Extract the numpy.ndarray from the tuple returned by gr.Audio before passing it to the Whisper pipeline, and add an isinstance check in transcribe

Browse files

Files changed (1) hide show

app.py +55 -16

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 from transformers import pipeline
 from gtts import gTTS
 import os
 # Initialize Whisper for speech-to-text
 whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
@@ -10,35 +11,73 @@ whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 knowledge_base = {
     "what cars are available": "We have Toyota Camry, Honda Civic, and Ford Mustang.",
     "price of camry": "The Toyota Camry starts at $25,000."
 }
 def transcribe(audio):
-    return whisper(audio)["text"]
 def text_to_speech(text):
-    tts = gTTS(text, lang="en")
-    tts.save("response.mp3")
-    return "response.mp3"
 def answer_question(text):
-    for key in knowledge_base:
-        if key in text.lower():
-            return knowledge_base[key]
-    return "Sorry, I can help with car availability and prices. Try again!"
 def process_audio(audio):
-    text = transcribe(audio)
-    response = answer_question(text)
-    audio_response = text_to_speech(response)
-    return response, audio_response
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# AI Support Agent: Car Dealership")
-    audio_input = gr.Audio(label="Speak to the Agent")  # No 'type' parameter needed
     text_output = gr.Textbox(label="Agent Response")
     audio_output = gr.Audio(label="Listen to Response")
     btn = gr.Button("Submit")
-    btn.click(fn=process_audio, inputs=audio_input, outputs=[text_output, audio_output])
-demo.launch()

 from transformers import pipeline
 from gtts import gTTS
 import os
+import numpy as np
 # Initialize Whisper for speech-to-text
 whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 knowledge_base = {
     "what cars are available": "We have Toyota Camry, Honda Civic, and Ford Mustang.",
     "price of camry": "The Toyota Camry starts at $25,000."
+    "price of Tesla": "The Tesla starts at $60,000."
 }
 def transcribe(audio):
+    print(f"Transcribing audio: {type(audio)}")
+    try:
+        # Check if audio is a tuple (numpy array, sample rate)
+        if isinstance(audio, tuple):
+            audio_data, _ = audio  # Extract numpy array, ignore sample rate
+        else:
+            audio_data = audio
+        result = whisper(audio_data)["text"]
+        print(f"Transcription result: {result}")
+        return result
+    except Exception as e:
+        print(f"Error in transcribe: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise
 def text_to_speech(text):
+    print(f"Generating speech for text: {text}")
+    try:
+        tts = gTTS(text, lang="en")
+        tts.save("response.mp3")
+        print("Speech saved to response.mp3")
+        return "response.mp3"
+    except Exception as e:
+        print(f"Error in text_to_speech: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise
 def answer_question(text):
+    print(f"Answering question: {text}")
+    try:
+        for key in knowledge_base:
+            if key in text.lower():
+                print(f"Found match for key: {key}")
+                return knowledge_base[key]
+        print("No match found in knowledge base")
+        return "Sorry, I can help with car availability and prices. Try again!"
+    except Exception as e:
+        print(f"Error in answer_question: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise
 def process_audio(audio):
+    print(f"Processing audio: {type(audio)}")
+    try:
+        text = transcribe(audio)
+        response = answer_question(text)
+        audio_response = text_to_speech(response)
+        print(f"Process complete. Response: {response}, Audio: {audio_response}")
+        return response, audio_response
+    except Exception as e:
+        print(f"Error in process_audio: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# AI Support Agent: Car Dealership")
+    audio_input = gr.Audio(label="Speak to the Agent")
     text_output = gr.Textbox(label="Agent Response")
     audio_output = gr.Audio(label="Listen to Response")
     btn = gr.Button("Submit")
+    btn.click(fn=process_audio, inputs=audio_input, outputs=[text_output, audio_output])