import os

import gradio as gr
import torch
import whisper
from diffusers import StableDiffusionPipeline
from groq import Groq

# Load the Whisper speech-to-text model
whisper_model = whisper.load_model("base")

# Read the Groq API key from the environment rather than hardcoding it
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Load the Stable Diffusion pipeline on GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
stable_diffusion_model = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5"
).to(device)

# Function to handle the voice-to-image pipeline
def voice_to_image(audio):
    # Step 1: Transcribe the audio to text using Whisper
    transcription = whisper_model.transcribe(audio)
    input_text = transcription["text"]

    # Step 2: Query the LLM via the Groq API to turn the transcript into a prompt
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": input_text},
        ],
        model="llama3-8b-8192",
        stream=False,
    )
    response_text = chat_completion.choices[0].message.content

    # Step 3: Generate an image from the prompt using Stable Diffusion
    image = stable_diffusion_model(response_text).images[0]
    return image

# Gradio interface
interface = gr.Interface(
    fn=voice_to_image,
    inputs=gr.Audio(type="filepath"),
    outputs="image",
    title="Voice-to-Image Generator",
    description="Transcribe voice input into an image using Whisper, a Groq-hosted LLM, and Stable Diffusion.",
)

# Launch the Gradio app
interface.launch()
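
To try the pipeline without the Gradio UI, the function can also be called directly on a recorded file. The minimal sketch below assumes a local recording named sample.wav and that GROQ_API_KEY is exported in the environment; both names are placeholders, not part of the original app.

# Minimal local-test sketch (assumes GROQ_API_KEY is set and "sample.wav" exists;
# both are placeholder assumptions for illustration).
if __name__ == "__main__":
    generated = voice_to_image("sample.wav")
    generated.save("output.png")  # the pipeline returns a PIL image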