# Speech_to_image / app.py
import gradio as gr
import torch
import librosa
import numpy as np
from diffusers import StableDiffusionPipeline
import whisper
from huggingface_hub import login
import os

# Log in to Hugging Face using a token from the HF_TOKEN environment variable
# (e.g. a Space secret; the variable name is a convention, adjust if yours differs).
# Skip login entirely if no token is set instead of passing an empty string.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
# Load the Whisper model for speech recognition
whisper_model = whisper.load_model("base")
# Check if GPU is available and set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load Stable Diffusion pipeline with safetensors
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
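
# Optional tweak (not in the original app): attention slicing lowers GPU memory use
# at a small speed cost, which can help on smaller GPUs.
# text_to_image.enable_attention_slicing()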

def transcribe_audio(audio):
    """Transcribe audio to text using Whisper."""
    if audio is None:
        return "Error: No audio file provided."
    try:
        # Accept either a file path or the (sample_rate, data) tuple Gradio produces
        if isinstance(audio, str):
            # File path: load directly at Whisper's expected 16 kHz
            waveform, sr = librosa.load(audio, sr=16000)
        elif isinstance(audio, tuple):
            # Gradio's numpy audio arrives as (sample_rate, data)
            sr, waveform = audio
            # Convert to mono float32 in [-1, 1], which Whisper expects
            waveform = waveform.astype(np.float32)
            if waveform.ndim > 1:
                waveform = waveform.mean(axis=1)
            if np.abs(waveform).max() > 1.0:
                waveform /= 32768.0
            # Resample to 16 kHz if needed
            if sr != 16000:
                waveform = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
        else:
            return "Error: Unsupported audio format."
        # Transcribe the audio (disable fp16 decoding on CPU to avoid a warning)
        result = whisper_model.transcribe(waveform, fp16=(device == "cuda"))
        return result["text"]
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

def generate_image(text):
    """Generate an image from text using Stable Diffusion."""
    try:
        image = text_to_image(text).images[0]  # Retrieve the first image from the pipeline output
        return image
    except Exception as e:
        print(f"Error generating image: {str(e)}")
        return None

def voice_to_image(audio):
    """Transcribe audio and generate an image from the transcription."""
    transcribed_text = transcribe_audio(audio)
    if not transcribed_text or "Error" in transcribed_text:
        return transcribed_text, None
    image = generate_image(transcribed_text)
    if image is None:
        # Report the failure in the text output; the Image component expects an image or None
        return f"{transcribed_text}\n\nImage generation failed. Please try again.", None
    return transcribed_text, image

# Create the Gradio interface
interface = gr.Interface(
    fn=voice_to_image,
    inputs=gr.Audio(type="numpy", label="Speak or upload an audio file"),  # raw (sample_rate, data) audio
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Image(label="Generated Image"),
    ],
    title="Real-time Voice-to-Image Generator",
    description="Speak into the microphone to generate an image from your voice.",
)
# Launch the interface
interface.launch(share=True)
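
# Note: the Space also needs a requirements.txt. Based on the imports above it would
# contain at least the following (an assumption; pin versions as needed):
#   gradio
#   torch
#   numpy
#   librosa
#   diffusers
#   transformers
#   accelerate
#   openai-whisper
#   huggingface_hub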