File size: 3,600 Bytes
1b89955
b10a4cb
a298e65
886085a
b10a4cb
abb5d10
a298e65
 
b10a4cb
 
 
 
 
83a6bbe
 
 
a298e65
 
 
b10a4cb
83a6bbe
497eb2a
b10a4cb
 
 
 
 
5135b09
b10a4cb
 
 
5135b09
b10a4cb
 
 
83a6bbe
b10a4cb
 
886085a
a298e65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b89955
83a6bbe
b10a4cb
 
 
 
a298e65
 
 
 
 
 
 
 
 
b10a4cb
 
83a6bbe
1b89955
 
b10a4cb
83a6bbe
886085a
1b89955
b10a4cb
1b89955
886085a
c6550ef
 
1b89955
b10a4cb
497eb2a
1b89955
 
 
497eb2a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
from gtts import gTTS
import os
import librosa
import webbrowser
import random

# --- Model loading (runs once at import; downloads weights on first run) ---

# Wav2Vec2 processor (audio feature extraction + CTC tokenizer) and model for
# speech-to-text; checkpoint name indicates LibriSpeech 960h training data.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Hugging Face conversational model (DialoGPT) for generating chat responses
# when the user's utterance is not a recognized action command.
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")

# Load the question answering model for specific commands
# NOTE(review): qa_pipeline is never referenced anywhere in this file —
# loading it costs startup time and memory; confirm before removing.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def speech_to_text(audio_file):
    """Transcribe the speech in an audio file to text using Wav2Vec2.

    Args:
        audio_file: Path to an audio file in any format librosa can read.

    Returns:
        The transcription string produced by greedy CTC decoding.
    """
    # Resample to 16 kHz, the rate the processor/model pipeline expects here.
    audio_input, _ = librosa.load(audio_file, sr=16000)
    # Pass sampling_rate explicitly so the feature extractor validates the
    # input rate instead of assuming a default (and emitting a warning).
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values

    # Inference only — disable gradient tracking.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: most likely token per frame, then CTC-collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription

def generate_response(text):
    """Generate a conversational reply to *text* using DialoGPT.

    The text-generation pipeline returns the prompt concatenated with the
    model's continuation, so the prompt prefix is stripped before returning —
    otherwise the bot's reply would start by echoing the user's own words.

    Args:
        text: The user's utterance.

    Returns:
        The model's reply text. If the generated text unexpectedly does not
        start with the prompt, the full generated text is returned unchanged.
    """
    response = conversational_pipeline(text, max_length=50)
    generated = response[0]['generated_text']
    # Keep only the newly generated continuation, not the echoed prompt.
    if generated.startswith(text):
        return generated[len(text):].strip()
    return generated

def execute_action(command):
    """Execute a recognized voice command (open YouTube, play music).

    Args:
        command: The transcribed user utterance; matched case-insensitively
            by substring.

    Returns:
        A status message string when a command was recognized and executed,
        or None when the utterance is not a known command. The falsy return
        lets the caller fall back to conversational handling — the original
        returned a truthy "Sorry..." string here, which made the caller's
        DialoGPT fallback branch unreachable.
    """
    command = command.lower()

    if 'youtube' in command:
        # 'youtube' already matches 'open youtube', so one check suffices.
        webbrowser.open('https://www.youtube.com')
        return "Opening YouTube..."

    if 'play music' in command or 'play song' in command:
        # Playing a random song (or you can modify to play a specific song)
        songs = ["song1.mp3", "song2.mp3", "song3.mp3"]  # Replace with actual file names
        song = random.choice(songs)
        # NOTE(review): interpolating into a shell command is injection-prone
        # in general; safe here only because the names are hard-coded.
        os.system(f"mpg321 {song}")  # Use your preferred way to play music
        return f"Playing music: {song}"

    # Not a recognized command: signal "no action" to the caller.
    return None

def process_audio(audio_file):
    """Handle one voice interaction end to end.

    Transcribes the user's audio, picks either an action response or a
    conversational reply, and synthesizes the reply to speech.

    Args:
        audio_file: Path to the recorded user audio.

    Returns:
        A (response_text, audio_path) tuple for the Gradio outputs.
    """
    # Step 1: transcribe the user's speech with Wav2Vec 2.0.
    user_text = speech_to_text(audio_file)
    print(f"User said: {user_text}")

    # Step 2: prefer a direct action (YouTube, music); fall back to DialoGPT
    # chat when execute_action returns a falsy result.
    bot_response = execute_action(user_text) or generate_response(user_text)
    print(f"Bot response: {bot_response}")

    # Step 3: synthesize the reply to an mp3 with gTTS for playback.
    gTTS(bot_response).save("response.mp3")

    # Text shown in the textbox plus the audio file path for the player.
    return bot_response, "response.mp3"

# Create Gradio interface for audio input/output
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # Gradio automatically supports microphone input
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],  # text reply + synthesized speech file
    live=True,  # re-run automatically when the input changes
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response instantly! This bot listens and responds like Google Assistant/Siri."
)

# Launch the interface
iface.launch(share=True)  # share=True exposes a temporary public URL