"""Voice bot: Wav2Vec2 speech-to-text, DialoGPT chat, gTTS speech output, Gradio UI."""

import os
import random
import subprocess
import webbrowser

import gradio as gr
import librosa
import torch
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline

# Load Wav2Vec2 model and processor for speech-to-text.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Hugging Face conversational model (DialoGPT) for generating responses.
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")

# Question-answering model.
# NOTE(review): qa_pipeline is never used in this file — confirm nothing else
# imports it before removing, since loading it costs time and memory.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")


def speech_to_text(audio_file):
    """Transcribe an audio file to text using Wav2Vec2.

    Args:
        audio_file: Path to an audio file readable by librosa.

    Returns:
        The CTC-decoded transcription string.
    """
    # Wav2Vec2 was trained on 16 kHz audio; librosa resamples on load.
    audio_input, _ = librosa.load(audio_file, sr=16000)
    # FIX: pass sampling_rate explicitly — omitting it emits a transformers
    # warning and silently assumes the processor default matches the audio.
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])


def generate_response(text):
    """Generate a conversational reply for *text* using DialoGPT.

    Args:
        text: The user's utterance.

    Returns:
        The generated reply text.
    """
    # FIX: return_full_text=False so the reply does not begin with an echo of
    # the user's prompt (text-generation pipelines prepend it by default).
    response = conversational_pipeline(text, max_length=50, return_full_text=False)
    return response[0]['generated_text']


def execute_action(command):
    """Execute a side-effect command (open YouTube, play music) if one matches.

    Args:
        command: Raw user utterance; matched case-insensitively.

    Returns:
        A status string when an action was executed, or None when the
        utterance is not a recognized command (caller falls back to chat).
    """
    command = command.lower()
    if 'youtube' in command or 'open youtube' in command:
        webbrowser.open('https://www.youtube.com')
        return "Opening YouTube..."
    if 'play music' in command or 'play song' in command:
        # Playing a random song (or you can modify to play a specific song).
        songs = ["song1.mp3", "song2.mp3", "song3.mp3"]  # Replace with actual file names.
        song = random.choice(songs)
        # FIX: use subprocess with an argument list instead of
        # os.system(f"mpg321 {song}") — avoids shell quoting/injection issues
        # and does not block the request for the whole duration of the song.
        subprocess.Popen(["mpg321", song])
        return f"Playing music: {song}"
    # FIX: the original returned "Sorry, I don't understand that command."
    # here. That string is truthy, so process_audio()'s `if action_response:`
    # was always taken and the DialoGPT branch was unreachable. Returning None
    # restores the intended command-vs-chat dispatch.
    return None


def process_audio(audio_file):
    """Full pipeline: transcribe input, dispatch or chat, synthesize a reply.

    Args:
        audio_file: Path to the recorded user audio.

    Returns:
        A (bot_response_text, reply_audio_path) tuple for the Gradio outputs.
    """
    # Convert speech to text using Wav2Vec 2.0.
    text = speech_to_text(audio_file)
    print(f"User said: {text}")

    # Try a direct action command first; fall back to a DialoGPT reply.
    bot_response = execute_action(text) or generate_response(text)
    print(f"Bot response: {bot_response}")

    # Convert the bot's response to speech using gTTS.
    tts = gTTS(bot_response)
    tts.save("response.mp3")

    return bot_response, "response.mp3"


# Gradio interface: microphone/file audio in, text + spoken reply out.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # Gradio automatically supports microphone input.
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    live=True,
    title="Voice Bot with Wav2Vec2.0",
    description=(
        "Speak to the bot and get a response instantly! "
        "This bot listens and responds like Google Assistant/Siri."
    ),
)

# FIX: guard the launch so importing this module does not start a server.
if __name__ == "__main__":
    iface.launch(share=True)