Spaces:

Udyan
/

Voice-Assistant

Sleeping

App Files Files Community

Voice-Assistant / app.py

Udyan

Update app.py

2f853b3 verified 2 months ago

raw

history blame contribute delete

2.04 kB

	import gradio as gr
	from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech
	import torch
	import numpy as np

	# Speech → Text: Whisper "base" checkpoint behind the ASR pipeline.
	stt = pipeline(task="automatic-speech-recognition", model="openai/whisper-base")

	# LLM: DistilGPT-2 behind the text-generation pipeline.
	llm = pipeline(task="text-generation", model="distilgpt2")

	# Text → Speech: the processor and the model are loaded from the same
	# SpeechT5 checkpoint.
	_TTS_CHECKPOINT = "microsoft/speecht5_tts"
	processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
	tts_model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)

	# Placeholder speaker embedding: a 1 x 512 tensor of standard-normal noise.
	# No seed is set in the visible code, so the values differ on every start.
	# NOTE(review): SpeechT5 presumably expects a real x-vector speaker
	# embedding here; random noise likely yields an arbitrary voice — confirm
	# and replace with a fixed, known-good embedding if so.
	speaker_embeddings = torch.randn(1, 512)


	def voice_assistant(audio):
	if audio is None:
	return "No audio", "No audio", None

	# 1. Unpack the tuple
	sr, y = audio

	# 2. Convert to float32 (Whisper requirement)
	y = y.astype(np.float32)
	y /= np.max(np.abs(y)) if np.max(np.abs(y)) > 0 else 1

	# 3. Speech → Text
	speech_text = stt(y)["text"]

	# AI response
	response = llm(
	speech_text,
	max_new_tokens=60
	)[0]["generated_text"]

	# Prepare text for TTS
	inputs = processor(text=response, return_tensors="pt")

	speech = tts_model.generate_speech(
	inputs["input_ids"],
	speaker_embeddings
	)

	audio_output = speech.cpu().numpy()

	# 1. Normalize the volume (so it's not too quiet)
	audio_output = np.clip(audio_output, -1.0, 1.0) # Prevent clipping

	# 2. Scale to 16-bit PCM (Required for most players)
	audio_output = (audio_output * 32767).astype(np.int16)

	# 3. SpeechT5 outputs at 16000Hz
	return speech_text, response, (16000, audio_output)


	# Input widget: a microphone recording handed to the callback as numpy data.
	_mic_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak here")

	# Output widgets, in the order voice_assistant returns its values.
	_result_views = [
	    gr.Textbox(label="Recognized Speech"),
	    gr.Textbox(label="AI Response"),
	    gr.Audio(label="Voice Reply"),
	]

	iface = gr.Interface(
	    fn=voice_assistant,
	    inputs=_mic_input,
	    outputs=_result_views,
	    title="Voice AI Assistant",
	    description="Speak and the assistant will respond with voice",
	)

	# Serve on all network interfaces, port 7860.
	iface.launch(server_name="0.0.0.0", server_port=7860)