|
|
import gradio as gr |
|
|
import torch |
|
|
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline |
|
|
from gtts import gTTS |
|
|
import os |
|
|
import librosa |
|
|
import webbrowser |
|
|
import random |
|
|
|
|
|
|
|
|
# --- Model setup (runs at import time; downloads weights on first use) ---

# Wav2Vec2 processor + CTC model for English ASR; expects 16 kHz mono audio.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# DialoGPT text-generation pipeline: produces the conversational fallback reply.
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")

# Extractive question-answering pipeline.
# NOTE(review): qa_pipeline is loaded but never referenced anywhere in this
# file — confirm it is used elsewhere before removing, as it costs a full
# model download and load at import time.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
|
|
|
|
|
def speech_to_text(audio_file):
    """Transcribe an audio file to text using Wav2Vec2 CTC decoding.

    Args:
        audio_file: Path to an audio file readable by librosa.

    Returns:
        str: The transcription produced by greedy CTC decoding.
    """
    # Resample to 16 kHz — the rate wav2vec2-large-960h was trained on.
    audio_input, _ = librosa.load(audio_file, sr=16000)

    # Pass sampling_rate explicitly: the feature extractor validates it
    # against the model's expected rate; omitting it raises a warning today
    # and is slated to become an error in transformers.
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values

    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: pick the most likely token per frame, then let the
    # processor collapse repeats/blanks into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription
|
|
|
|
|
def generate_response(text):
    """Generate a conversational reply to *text* using DialoGPT.

    Args:
        text: The user's utterance.

    Returns:
        str: Only the newly generated reply text.
    """
    # return_full_text=False makes the pipeline return just the generated
    # continuation. By default the text-generation pipeline prepends the
    # prompt, so the "reply" would have echoed the user's own words back.
    response = conversational_pipeline(text, max_length=50, return_full_text=False)
    return response[0]['generated_text']
|
|
|
|
|
def execute_action(command):
    """Execute a recognized voice command, if any.

    Args:
        command: The transcribed user utterance (any casing).

    Returns:
        str | None: A status message when a command was executed, or None
        when the utterance is not a recognized command — this lets the
        caller fall back to the conversational model.
    """
    command = command.lower()

    # 'youtube' alone already covers 'open youtube' — the old second test
    # was redundant.
    if 'youtube' in command:
        webbrowser.open('https://www.youtube.com')
        return "Opening YouTube..."

    if 'play music' in command or 'play song' in command:
        # Song names are hard-coded, so shelling out here is safe; switch to
        # subprocess.run([...], shell=False) if they ever become user input.
        songs = ["song1.mp3", "song2.mp3", "song3.mp3"]
        song = random.choice(songs)
        os.system(f"mpg321 {song}")
        return f"Playing music: {song}"

    # Previously returned a non-empty apology string here, which made the
    # caller's `if action_response:` always truthy and left the DialoGPT
    # fallback unreachable. None signals "no command matched".
    return None
|
|
|
|
|
def process_audio(audio_file):
    """Run one full assistant turn: transcribe, decide, reply, synthesize.

    Args:
        audio_file: Path to the recorded user audio.

    Returns:
        tuple[str, str]: The bot's textual reply and the path of the
        synthesized speech file ("response.mp3").
    """
    # Step 1: speech -> text.
    user_text = speech_to_text(audio_file)
    print(f"User said: {user_text}")

    # Step 2: prefer a direct command; otherwise chat via DialoGPT.
    command_result = execute_action(user_text)
    reply = command_result if command_result else generate_response(user_text)

    print(f"Bot response: {reply}")

    # Step 3: text -> speech, saved beside the script.
    speech = gTTS(reply)
    speech.save("response.mp3")

    return reply, "response.mp3"
|
|
|
|
|
|
|
|
# Gradio UI: audio in (mic or file) -> bot text + synthesized speech out.
# NOTE(review): live=True re-runs `process_audio` as the input changes, which
# can trigger repeated model inference per recording — confirm intended.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    live=True,
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response instantly! This bot listens and responds like Google Assistant/Siri."
)

# share=True exposes a public *.gradio.live URL — anyone with the link can
# drive this bot (and the `mpg321` shell command it can run).
iface.launch(share=True)
|
|
|