# Speech_to_image / app.py
import gradio as gr
import torch
import librosa
import numpy as np
from diffusers import StableDiffusionPipeline
import whisper
from huggingface_hub import login
import os

# Log in to Hugging Face using a token from the HF_TOKEN environment variable
# (e.g. a Space secret; the variable name is a convention, adjust if yours differs).
# Skip login entirely if no token is set instead of passing an empty string.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
# Load the Whisper model for speech recognition
whisper_model = whisper.load_model("base")
# Check if GPU is available and set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load Stable Diffusion pipeline with safetensors
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
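
# Optional tweak (not in the original app): attention slicing lowers GPU memory use
# at a small speed cost, which can help on smaller GPUs.
# text_to_image.enable_attention_slicing()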

def transcribe_audio(audio):
    """Transcribe audio to text using Whisper."""
    if audio is None:
        return "Error: No audio file provided."
    try:
        # Accept either a file path or the (sample_rate, data) tuple Gradio produces
        if isinstance(audio, str):
            # File path: load directly at Whisper's expected 16 kHz
            waveform, sr = librosa.load(audio, sr=16000)
        elif isinstance(audio, tuple):
            # Gradio's numpy audio arrives as (sample_rate, data)
            sr, waveform = audio
            # Convert to mono float32 in [-1, 1], which Whisper expects
            waveform = waveform.astype(np.float32)
            if waveform.ndim > 1:
                waveform = waveform.mean(axis=1)
            if np.abs(waveform).max() > 1.0:
                waveform /= 32768.0
            # Resample to 16 kHz if needed
            if sr != 16000:
                waveform = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
        else:
            return "Error: Unsupported audio format."
        # Transcribe the audio (disable fp16 decoding on CPU to avoid a warning)
        result = whisper_model.transcribe(waveform, fp16=(device == "cuda"))
        return result["text"]
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

def generate_image(text):
    """Generate an image from text using Stable Diffusion."""
    try:
        image = text_to_image(text).images[0]  # Retrieve the first image from the pipeline output
        return image
    except Exception as e:
        print(f"Error generating image: {str(e)}")
        return None

def voice_to_image(audio):
    """Transcribe audio and generate an image from the transcription."""
    transcribed_text = transcribe_audio(audio)
    if not transcribed_text or "Error" in transcribed_text:
        return transcribed_text, None
    image = generate_image(transcribed_text)
    if image is None:
        # Report the failure in the text output; the Image component expects an image or None
        return f"{transcribed_text}\n\nImage generation failed. Please try again.", None
    return transcribed_text, image

# Create the Gradio interface
interface = gr.Interface(
    fn=voice_to_image,
    inputs=gr.Audio(type="numpy", label="Speak or upload an audio file"),  # raw (sample_rate, data) audio
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Image(label="Generated Image"),
    ],
    title="Real-time Voice-to-Image Generator",
    description="Speak into the microphone to generate an image from your voice.",
)
# Launch the interface
interface.launch(share=True)
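
# Note: the Space also needs a requirements.txt. Based on the imports above it would
# contain at least the following (an assumption; pin versions as needed):
#   gradio
#   torch
#   numpy
#   librosa
#   diffusers
#   transformers
#   accelerate
#   openai-whisper
#   huggingface_hub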