Spaces:
Runtime error (the Space fails at startup — note the empty Hugging Face token passed to login() in the code below)
import os

import gradio as gr
import librosa
import numpy as np
import torch
import whisper
from diffusers import StableDiffusionPipeline
from huggingface_hub import login
# --- Model setup -------------------------------------------------------------

# Authenticate with the Hugging Face Hub only when a token is actually
# available.  The original code called login("") with a hard-coded empty
# token, which raises at import time — the likely cause of the Space's
# "Runtime error".  Reading from the environment also keeps secrets out of
# source control.
hf_token = os.environ.get("HF_TOKEN", "").strip()
if hf_token:
    login(hf_token)

# Whisper "base" model for speech recognition.
whisper_model = whisper.load_model("base")

# Prefer GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Stable Diffusion v1.5 text-to-image pipeline.  float16 halves memory on
# GPU; float32 is required for CPU inference.
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
def transcribe_audio(audio):
    """Transcribe audio to text using Whisper.

    Parameters
    ----------
    audio : str | tuple | None
        Either a path to an audio file, or the ``(sample_rate, data)``
        tuple produced by a Gradio ``Audio`` component with
        ``type="numpy"``.

    Returns
    -------
    str
        The transcribed text, or a message starting with ``"Error"`` on
        failure.
    """
    if audio is None:
        return "Error: No audio file provided."
    try:
        if isinstance(audio, str):
            # File path: librosa loads and resamples to Whisper's 16 kHz.
            waveform, sr = librosa.load(audio, sr=16000)
        elif isinstance(audio, tuple):
            # BUG FIX: Gradio's numpy audio is (sample_rate, data); the
            # original code unpacked it as (data, sample_rate), feeding
            # garbage to Whisper.
            sr, waveform = audio
            # Gradio delivers int16 PCM; Whisper expects float32 in [-1, 1].
            waveform = np.asarray(waveform, dtype=np.float32)
            if waveform.ndim > 1:
                # Downmix stereo to mono.
                waveform = waveform.mean(axis=1)
            if np.abs(waveform).max() > 1.0:
                waveform = waveform / 32768.0
            if sr != 16000:
                # Whisper models are trained on 16 kHz audio.
                waveform = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
        else:
            return "Error: Unsupported audio format."
        # Transcribe the prepared waveform.
        result = whisper_model.transcribe(waveform)
        return result['text']
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
def generate_image(text):
    """Render ``text`` into an image via the Stable Diffusion pipeline.

    Returns the first generated image, or ``None`` when generation fails
    (the error is printed rather than raised so the UI keeps working).
    """
    try:
        generated = text_to_image(text).images[0]
        return generated
    except Exception as err:
        print(f"Error generating image: {str(err)}")
        return None
def voice_to_image(audio):
    """Turn spoken audio into an image: transcribe it, then render it.

    Returns a ``(text, image)`` pair; the second element is ``None`` or a
    failure message when either stage goes wrong.
    """
    text = transcribe_audio(audio)
    # Propagate transcription failures (empty text or "Error..." strings)
    # without attempting image generation.
    if not text or "Error" in text:
        return text, None
    picture = generate_image(text)
    if picture is not None:
        return text, picture
    return text, "Image generation failed. Please try again."
# --- Gradio UI ---------------------------------------------------------------
interface = gr.Interface(
    fn=voice_to_image,
    # type="numpy" delivers raw (sample_rate, data) audio from the mic or
    # an uploaded file.
    inputs=gr.Audio(type="numpy", label="Speak or upload an audio file"),
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Image(label="Generated Image"),
    ],
    title="Real-time Voice-to-Image Generator",
    description="Speak into the microphone to generate an image from your voice.",
)

# Launch the app with a public share link.
interface.launch(share=True)