File size: 3,600 Bytes
1b89955 b10a4cb a298e65 886085a b10a4cb abb5d10 a298e65 b10a4cb 83a6bbe a298e65 b10a4cb 83a6bbe 497eb2a b10a4cb 5135b09 b10a4cb 5135b09 b10a4cb 83a6bbe b10a4cb 886085a a298e65 1b89955 83a6bbe b10a4cb a298e65 b10a4cb 83a6bbe 1b89955 b10a4cb 83a6bbe 886085a 1b89955 b10a4cb 1b89955 886085a c6550ef 1b89955 b10a4cb 497eb2a 1b89955 497eb2a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import gradio as gr
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
from gtts import gTTS
import os
import librosa
import webbrowser
import random
# Load Wav2Vec2 model and processor for speech-to-text.
# NOTE(review): all three models download/load at import time; first start-up
# is slow and requires network access to the Hugging Face hub.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
# Hugging Face conversational model (DialoGPT) for generating chat replies.
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")
# Question-answering model — loaded here but not referenced anywhere in this file.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
def speech_to_text(audio_file):
    """Transcribe an audio file to text using Wav2Vec2.

    Args:
        audio_file: Path to an audio file readable by librosa.

    Returns:
        The decoded transcription string (Wav2Vec2-960h emits upper-case text).
    """
    # Resample to 16 kHz, the rate Wav2Vec2-large-960h was trained on.
    audio_input, _ = librosa.load(audio_file, sr=16000)
    # Pass sampling_rate explicitly — omitting it is deprecated in transformers
    # and risks silent mismatches between the audio and the model's expectation.
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy CTC decode: take the argmax token at each frame, then collapse.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription
def generate_response(text):
    """Generate a conversational reply to *text* using DialoGPT.

    Args:
        text: The user's utterance.

    Returns:
        The model's reply only. ``return_full_text=False`` strips the prompt
        from ``generated_text`` — without it the pipeline prepends the user's
        own words to every reply, so the bot would echo the input back.
    """
    outputs = conversational_pipeline(text, max_length=50, return_full_text=False)
    return outputs[0]['generated_text'].strip()
def execute_action(command):
    """Execute a side-effect command (open YouTube, play music) if recognized.

    Args:
        command: The transcribed user utterance; matched case-insensitively.

    Returns:
        A status string when a command was recognized and executed, or
        ``None`` when the text is not a command — callers use the falsy
        return to fall back to conversational generation. (Previously this
        returned a truthy "Sorry..." string, which made the caller's
        ``if action_response:`` check always true and unreachable the
        DialoGPT path.)
    """
    command = command.lower()
    if 'youtube' in command:
        # 'open youtube' is subsumed by the 'youtube' substring check.
        webbrowser.open('https://www.youtube.com')
        return "Opening YouTube..."
    if 'play music' in command or 'play song' in command:
        # Placeholder playlist — replace with real file paths.
        songs = ["song1.mp3", "song2.mp3", "song3.mp3"]
        song = random.choice(songs)
        # NOTE(review): os.system with an interpolated name is shell-injection
        # prone in general; safe here only because the names are hard-coded.
        os.system(f"mpg321 {song}")
        return f"Playing music: {song}"
    # Not a recognized command: signal "no action" so the caller can chat.
    return None
def process_audio(audio_file):
    """Full pipeline for one utterance: transcribe, respond, and synthesize.

    Args:
        audio_file: Path to the recorded audio clip from Gradio.

    Returns:
        A ``(response_text, audio_path)`` tuple; the MP3 at ``audio_path``
        is the spoken version of ``response_text``.
    """
    # Step 1: speech -> text via Wav2Vec 2.0.
    text = speech_to_text(audio_file)
    print(f"User said: {text}")

    # Step 2: prefer an action command; otherwise chat via DialoGPT.
    action_response = execute_action(text)
    bot_response = action_response if action_response else generate_response(text)
    print(f"Bot response: {bot_response}")

    # Step 3: text -> speech via gTTS; file is overwritten on every call.
    gTTS(bot_response).save("response.mp3")

    return bot_response, "response.mp3"
# Create the Gradio interface: one audio input (microphone or file upload)
# mapped through process_audio to a text reply plus a synthesized audio reply.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # "filepath" hands process_audio a path on disk
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],  # reply text + path to response.mp3
    live=True,  # re-run automatically when the input changes
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response instantly! This bot listens and responds like Google Assistant/Siri."
)
# Launch the app; share=True also creates a temporary public URL.
iface.launch(share=True)
|