# AIVoice / app.py
# dschandra's picture
# Update app.py
# a298e65 verified
import os
import random
import subprocess
import webbrowser

import gradio as gr
import librosa
import torch
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
# Load Wav2Vec2 model and processor for speech-to-text.
# The processor handles raw-audio feature extraction and CTC token decoding;
# the model produces per-frame logits over the character vocabulary.
# NOTE(review): from_pretrained downloads weights on first run — requires network.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
# Hugging Face conversational model (DialoGPT) for generating responses
# when no explicit action command is detected.
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")
# Question-answering model for specific commands.
# NOTE(review): qa_pipeline is loaded but never used in this file — confirm
# whether it is needed before removing.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
def speech_to_text(audio_file):
    """Transcribe speech from an audio file to text using Wav2Vec2.

    Args:
        audio_file: Path to an audio file in any format librosa can read.

    Returns:
        The transcribed text as a string.
    """
    # Resample to 16 kHz — the rate the processor/model pair expects.
    audio_input, _ = librosa.load(audio_file, sr=16000)
    # Pass sampling_rate explicitly: Wav2Vec2Processor uses it to verify the
    # audio matches the model's training rate instead of assuming it.
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy CTC decode: argmax over the vocabulary per frame, then let the
    # processor collapse repeats and blanks back into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])
def generate_response(text):
    """Generate a conversational reply to *text* using DialoGPT.

    The text-generation pipeline returns the prompt concatenated with the
    model's continuation, so the echoed prompt is stripped off here —
    otherwise the bot would repeat the user's words back before answering.

    Args:
        text: The user's utterance.

    Returns:
        The model's reply text (prompt removed when present).
    """
    result = conversational_pipeline(text, max_length=50)
    generated = result[0]['generated_text']
    # Drop the echoed prompt so only the bot's continuation is returned.
    if generated.startswith(text):
        generated = generated[len(text):].strip()
    return generated
def execute_action(command):
    """Execute a recognized voice command, or signal that none matched.

    Args:
        command: The transcribed user utterance (any case).

    Returns:
        A status-message string when a command was recognized and executed,
        or None when nothing matched. Returning a falsy value here is the fix
        that lets the caller fall back to the conversational model — the old
        "Sorry, I don't understand" return was truthy, so every input looked
        like a handled action and the chat model was unreachable.
    """
    command = command.lower()
    if 'youtube' in command:  # also covers 'open youtube'
        webbrowser.open('https://www.youtube.com')
        return "Opening YouTube..."
    if 'play music' in command or 'play song' in command:
        songs = ["song1.mp3", "song2.mp3", "song3.mp3"]  # Replace with actual file names
        song = random.choice(songs)
        # Argument-list subprocess call: no shell, so the filename is never
        # re-interpreted (os.system passed it through the shell unquoted).
        subprocess.run(["mpg321", song])
        return f"Playing music: {song}"
    # No recognized command — let the caller generate a conversational reply.
    return None
def process_audio(audio_file):
    """Handle one voice interaction end to end.

    Transcribes the recorded audio, picks either an action response or a
    generated conversational reply, then synthesizes the reply with gTTS.

    Args:
        audio_file: Path to the recorded audio clip from Gradio.

    Returns:
        A (response_text, audio_path) pair matching the two Gradio outputs.
    """
    # Wav2Vec2 transcription of what the user said.
    user_text = speech_to_text(audio_file)
    print(f"User said: {user_text}")

    # Prefer a command action; when execute_action yields nothing truthy,
    # fall back to the DialoGPT conversational reply.
    bot_response = execute_action(user_text) or generate_response(user_text)
    print(f"Bot response: {bot_response}")

    # Synthesize the reply to an mp3 for Gradio to play back.
    speech = gTTS(bot_response)
    speech.save("response.mp3")

    return bot_response, "response.mp3"
# Create Gradio interface for audio input/output.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # Gradio automatically supports microphone input
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],  # Corrected for Gradio v3.x (type="filepath" for audio output)
    live=True,  # re-run process_audio automatically as input changes
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response instantly! This bot listens and responds like Google Assistant/Siri."
)
# Launch the interface; share=True also creates a public shareable link.
iface.launch(share=True)