|
|
import gradio as gr |
|
|
import torch |
|
|
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline |
|
|
from gtts import gTTS |
|
|
import os |
|
|
import librosa |
|
|
import webbrowser |
|
|
import random |
|
|
|
|
|
|
|
|
# --- Model setup (runs at import time; downloads weights on first use) ---

# Wav2Vec2 processor + CTC model for English ASR; expects 16 kHz mono audio.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# DialoGPT text-generation pipeline: produces the conversational fallback reply.
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")

# Extractive question-answering pipeline.
# NOTE(review): qa_pipeline is loaded but never referenced anywhere in this
# file — confirm it is used elsewhere before removing, as it costs a full
# model download and load at import time.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
|
|
|
|
|
def speech_to_text(audio_file):
    """Transcribe an audio file to text using Wav2Vec2 CTC decoding.

    Args:
        audio_file: Path to an audio file readable by librosa.

    Returns:
        str: The transcription produced by greedy CTC decoding.
    """
    # Resample to 16 kHz — the rate wav2vec2-large-960h was trained on.
    audio_input, _ = librosa.load(audio_file, sr=16000)

    # Pass sampling_rate explicitly: the feature extractor validates it
    # against the model's expected rate; omitting it raises a warning today
    # and is slated to become an error in transformers.
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values

    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: pick the most likely token per frame, then let the
    # processor collapse repeats/blanks into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription
|
|
|
|
|
def generate_response(text):
    """Generate a conversational reply to *text* using DialoGPT.

    Args:
        text: The user's utterance.

    Returns:
        str: Only the newly generated reply text.
    """
    # return_full_text=False makes the pipeline return just the generated
    # continuation. By default the text-generation pipeline prepends the
    # prompt, so the "reply" would have echoed the user's own words back.
    response = conversational_pipeline(text, max_length=50, return_full_text=False)
    return response[0]['generated_text']
|
|
|
|
|
def execute_action(command):
    """Execute a recognized voice command, if any.

    Args:
        command: The transcribed user utterance (any casing).

    Returns:
        str | None: A status message when a command was executed, or None
        when the utterance is not a recognized command — this lets the
        caller fall back to the conversational model.
    """
    command = command.lower()

    # 'youtube' alone already covers 'open youtube' — the old second test
    # was redundant.
    if 'youtube' in command:
        webbrowser.open('https://www.youtube.com')
        return "Opening YouTube..."

    if 'play music' in command or 'play song' in command:
        # Song names are hard-coded, so shelling out here is safe; switch to
        # subprocess.run([...], shell=False) if they ever become user input.
        songs = ["song1.mp3", "song2.mp3", "song3.mp3"]
        song = random.choice(songs)
        os.system(f"mpg321 {song}")
        return f"Playing music: {song}"

    # Previously returned a non-empty apology string here, which made the
    # caller's `if action_response:` always truthy and left the DialoGPT
    # fallback unreachable. None signals "no command matched".
    return None
|
|
|
|
|
def process_audio(audio_file):
    """Run one full assistant turn: transcribe, decide, reply, synthesize.

    Args:
        audio_file: Path to the recorded user audio.

    Returns:
        tuple[str, str]: The bot's textual reply and the path of the
        synthesized speech file ("response.mp3").
    """
    # Step 1: speech -> text.
    user_text = speech_to_text(audio_file)
    print(f"User said: {user_text}")

    # Step 2: prefer a direct command; otherwise chat via DialoGPT.
    command_result = execute_action(user_text)
    reply = command_result if command_result else generate_response(user_text)

    print(f"Bot response: {reply}")

    # Step 3: text -> speech, saved beside the script.
    speech = gTTS(reply)
    speech.save("response.mp3")

    return reply, "response.mp3"
|
|
|
|
|
|
|
|
# Gradio UI: audio in (mic or file) -> bot text + synthesized speech out.
# NOTE(review): live=True re-runs `process_audio` as the input changes, which
# can trigger repeated model inference per recording — confirm intended.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    live=True,
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response instantly! This bot listens and responds like Google Assistant/Siri."
)

# share=True exposes a public *.gradio.live URL — anyone with the link can
# drive this bot (and the `mpg321` shell command it can run).
iface.launch(share=True)
|
|
|