# imaginations / app.py
# Author: meraj12 — "Update app.py" (commit 29cc163, verified)
import os
import torch
import streamlit as st
import torchaudio
import tempfile
from transformers import pipeline
from diffusers import StableDiffusionPipeline
from groq import Groq
# Set up Groq API client (expects GROQ_API_KEY in the environment).
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Use a GPU when one is available; otherwise fall back to the original
# CPU-only behavior.
device = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_models(target_device):
    """Load the Whisper ASR pipeline and Stable Diffusion once per process.

    Streamlit re-executes this script top-to-bottom on every user
    interaction; without st.cache_resource both models would be
    re-initialized (and potentially re-downloaded) on each rerun.

    Args:
        target_device: device string ("cpu" or "cuda") for both models.

    Returns:
        (asr_pipeline, text_to_image_pipeline) tuple.
    """
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        device=target_device,
    )
    t2i = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5"
    ).to(target_device)
    return asr, t2i


whisper_pipeline, sd_model = _load_models(device)
# --- Streamlit UI ---------------------------------------------------------
st.title("Voice-to-Image Generator")

# Accept a spoken prompt as an audio upload.
audio_file = st.file_uploader(
    "Upload an audio file to generate an image",
    type=["wav", "mp3", "ogg"],
)

if audio_file:
    # Persist the upload to disk: the ASR pipeline expects a file path.
    # Keep the upload's real extension (the original forced ".wav" even
    # for mp3/ogg uploads) so the audio decoder picks the right format.
    suffix = os.path.splitext(audio_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
        temp_audio.write(audio_file.read())
        temp_audio_path = temp_audio.name

    try:
        # Speech -> text. no_grad avoids building autograd state we never use.
        with torch.no_grad():
            result = whisper_pipeline(temp_audio_path, return_timestamps=True)
        text_output = result["text"]
        st.write("Transcribed Text:", text_output)

        # Text -> image via Stable Diffusion.
        with st.spinner("Generating image..."):
            image = sd_model(text_output).images[0]
        st.image(image, caption="Generated Image")

        # Optional: Use Groq API for additional processing. A network/API
        # failure here should not hide the transcript and image already
        # rendered above, so it is reported as a warning instead of crashing.
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": text_output}],
                model="llama-3.3-70b-versatile",
            )
            st.write("Groq AI Response:", chat_completion.choices[0].message.content)
        except Exception as exc:
            st.warning(f"Groq request failed: {exc}")
    finally:
        # The tempfile was created with delete=False and was never removed
        # in the original — repeated uploads leaked files. Clean it up.
        os.remove(temp_audio_path)

st.write("Powered by MERAJ GRAPHICS")