"""Voice bot: Wav2Vec2 speech-to-text, DialoGPT chat, gTTS speech output, Gradio UI."""

import os
import random
import subprocess
import webbrowser

import gradio as gr
import librosa
import torch
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline

# Load Wav2Vec2 model and processor for speech-to-text.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Hugging Face conversational model (DialoGPT) for generating responses.
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")

# Question-answering model.
# NOTE(review): qa_pipeline is never used in this file — confirm nothing else
# imports it before removing, since loading it costs time and memory.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")


def speech_to_text(audio_file):
    """Transcribe an audio file to text using Wav2Vec2.

    Args:
        audio_file: Path to an audio file readable by librosa.

    Returns:
        The CTC-decoded transcription string.
    """
    # Wav2Vec2 was trained on 16 kHz audio; librosa resamples on load.
    audio_input, _ = librosa.load(audio_file, sr=16000)
    # FIX: pass sampling_rate explicitly — omitting it emits a transformers
    # warning and silently assumes the processor default matches the audio.
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])


def generate_response(text):
    """Generate a conversational reply for *text* using DialoGPT.

    Args:
        text: The user's utterance.

    Returns:
        The generated reply text.
    """
    # FIX: return_full_text=False so the reply does not begin with an echo of
    # the user's prompt (text-generation pipelines prepend it by default).
    response = conversational_pipeline(text, max_length=50, return_full_text=False)
    return response[0]['generated_text']


def execute_action(command):
    """Execute a side-effect command (open YouTube, play music) if one matches.

    Args:
        command: Raw user utterance; matched case-insensitively.

    Returns:
        A status string when an action was executed, or None when the
        utterance is not a recognized command (caller falls back to chat).
    """
    command = command.lower()
    if 'youtube' in command or 'open youtube' in command:
        webbrowser.open('https://www.youtube.com')
        return "Opening YouTube..."
    if 'play music' in command or 'play song' in command:
        # Playing a random song (or you can modify to play a specific song).
        songs = ["song1.mp3", "song2.mp3", "song3.mp3"]  # Replace with actual file names.
        song = random.choice(songs)
        # FIX: use subprocess with an argument list instead of
        # os.system(f"mpg321 {song}") — avoids shell quoting/injection issues
        # and does not block the request for the whole duration of the song.
        subprocess.Popen(["mpg321", song])
        return f"Playing music: {song}"
    # FIX: the original returned "Sorry, I don't understand that command."
    # here. That string is truthy, so process_audio()'s `if action_response:`
    # was always taken and the DialoGPT branch was unreachable. Returning None
    # restores the intended command-vs-chat dispatch.
    return None


def process_audio(audio_file):
    """Full pipeline: transcribe input, dispatch or chat, synthesize a reply.

    Args:
        audio_file: Path to the recorded user audio.

    Returns:
        A (bot_response_text, reply_audio_path) tuple for the Gradio outputs.
    """
    # Convert speech to text using Wav2Vec 2.0.
    text = speech_to_text(audio_file)
    print(f"User said: {text}")

    # Try a direct action command first; fall back to a DialoGPT reply.
    bot_response = execute_action(text) or generate_response(text)
    print(f"Bot response: {bot_response}")

    # Convert the bot's response to speech using gTTS.
    tts = gTTS(bot_response)
    tts.save("response.mp3")

    return bot_response, "response.mp3"


# Gradio interface: microphone/file audio in, text + spoken reply out.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # Gradio automatically supports microphone input.
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    live=True,
    title="Voice Bot with Wav2Vec2.0",
    description=(
        "Speak to the bot and get a response instantly! "
        "This bot listens and responds like Google Assistant/Siri."
    ),
)

# FIX: guard the launch so importing this module does not start a server.
if __name__ == "__main__":
    iface.launch(share=True)