"""Speech-to-text + text-to-image Gradio demo.

Transcribes an uploaded audio file with Whisper, then feeds the
transcription to Stable Diffusion v1.5 to generate an image.
"""

import os

import gradio as gr
import torch
import transformers  # noqa: F401  (kept: may be relied on elsewhere)
from diffusers import StableDiffusionPipeline
from huggingface_hub import login
from transformers import pipeline

# Authenticate with the Hugging Face Hub.  Read the token from the
# environment instead of hard-coding a secret in source control; skip
# login entirely when no token is configured (the original placeholder
# string would have made login() fail anyway).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(hf_token)

# Whisper ASR pipeline for speech-to-text.
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base")

# Pick the device first so the dtype can match it: float16 is only
# reliably supported on CUDA; CPU inference needs float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)


def transcribe_audio(audio_file):
    """Transcribe *audio_file* (path to a WAV/MP3 file) to text.

    Returns the transcription string, or an "Error in transcription: ..."
    string on failure (callers detect failure via the "Error" prefix).
    """
    try:
        return speech_to_text(audio_file)["text"]
    except Exception as e:
        return f"Error in transcription: {str(e)}"


def generate_image_from_text(text):
    """Generate one image from *text* with Stable Diffusion.

    Returns a PIL image on success, or an "Error in image generation: ..."
    string on failure.
    """
    try:
        # .images is a list; we only request/return the first image.
        return text_to_image(text).images[0]
    except Exception as e:
        return f"Error in image generation: {str(e)}"


def process_audio_and_generate_image(audio_file):
    """Pipeline entry point: audio file -> transcription -> image.

    Returns a (image, transcription) pair; on failure the image slot is
    None and the text slot carries the error message so the UI can show it.
    """
    transcription = transcribe_audio(audio_file)
    if "Error" in transcription:
        return None, transcription
    image = generate_image_from_text(transcription)
    # generate_image_from_text signals failure by returning a string.
    if isinstance(image, str) and "Error" in image:
        return None, image
    return image, transcription


# Gradio UI: audio upload in, (generated image, transcription) out.
iface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Textbox(label="Transcription"),
    ],
    title="Speech-to-Text and Image Generation",
    description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
)

# Only launch the (publicly shared) server when run as a script;
# importing this module should not start a server.
if __name__ == "__main__":
    iface.launch(share=True)