"""Streamlit app: transcribe an uploaded audio clip with Whisper Tiny,
generate an image from the transcript with Stable Diffusion v1.5, and
fetch a Groq LLM response for the same text."""

import os
import tempfile

import streamlit as st
import torch
import torchaudio  # NOTE(review): imported but unused below — confirm before removing
from diffusers import StableDiffusionPipeline
from groq import Groq
from transformers import pipeline

# All inference runs on CPU (no CUDA assumed anywhere in this script).
DEVICE = "cpu"


@st.cache_resource  # construct once per server process, not on every Streamlit rerun
def get_groq_client() -> Groq:
    """Groq API client; reads GROQ_API_KEY from the environment."""
    return Groq(api_key=os.getenv("GROQ_API_KEY"))


@st.cache_resource
def get_whisper_pipeline():
    """Whisper Tiny automatic-speech-recognition pipeline (audio path -> text)."""
    return pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        device=DEVICE,
    )


@st.cache_resource
def get_sd_model() -> StableDiffusionPipeline:
    """Stable Diffusion v1.5 text-to-image pipeline, moved to DEVICE."""
    return StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5"
    ).to(DEVICE)


st.title("Voice-to-Image Generator")

# Accept an audio upload; the ASR pipeline needs a file path, not a stream.
audio_file = st.file_uploader(
    "Upload an audio file to generate an image",  # fixed label grammar
    type=["wav", "mp3", "ogg"],
)

if audio_file:
    # Persist the upload to disk, keeping the original extension so the
    # audio decoder sees the correct container (was hard-coded to ".wav"
    # even for mp3/ogg uploads).
    suffix = os.path.splitext(audio_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
        temp_audio.write(audio_file.read())
        temp_audio_path = temp_audio.name

    try:
        # Transcribe speech to text; no gradients needed for inference.
        with torch.no_grad():
            result = get_whisper_pipeline()(temp_audio_path, return_timestamps=True)
    finally:
        # delete=False files are never auto-removed — this was a leak.
        os.remove(temp_audio_path)

    text_output = result["text"]
    st.write("Transcribed Text:", text_output)

    # Generate an image from the transcript with Stable Diffusion.
    with st.spinner("Generating image..."):
        image = get_sd_model()(text_output).images[0]
        st.image(image, caption="Generated Image")

    # Optional: send the transcript to Groq for an additional LLM response.
    chat_completion = get_groq_client().chat.completions.create(
        messages=[{"role": "user", "content": text_output}],
        model="llama-3.3-70b-versatile",
    )
    st.write("Groq AI Response:", chat_completion.choices[0].message.content)

st.write("Powered by MERAJ GRAPHICS")