# NOTE: viewer metadata (file size / commit hashes) stripped — not part of the source.
import asyncio
import os
import subprocess
import sys
from functools import lru_cache

import librosa
import numpy as np
import streamlit as st
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from huggingface_hub import login
from transformers import pipeline
# Install missing packages (if required, handled manually for Streamlit environment)
def install_missing_packages():
    """Best-effort install of required third-party packages that are not importable.

    Each missing package is installed with pip; a failed install is logged and
    the loop continues with the next package instead of aborting (the original
    `return` silently skipped every remaining package after one failure).
    """
    required_packages = {
        "librosa": None,
        "diffusers": ">=0.14.0",
        "transformers": None,
        "torch": "==2.0.0+cu118",
    }
    for package, version in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            package_name = f"{package}{version}" if version else package
            print(f"Installing {package_name}...")
            try:
                # Use the running interpreter's pip so the package lands in the
                # environment that actually executes this script.
                subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
            except subprocess.CalledProcessError as e:
                print(f"Error installing {package_name}: {e}")
# Ensure third-party dependencies are present before the heavyweight model loads below.
install_missing_packages()
# Authenticate with Hugging Face Hub
hf_token = os.getenv("HF_TOKEN")  # token must be supplied via the environment, never hard-coded
if hf_token:
    login(hf_token)
else:
    # Fail fast with a clear message instead of hitting opaque auth errors
    # later during model download.
    raise ValueError("HF_TOKEN environment variable not set.")
# Load models
# Speech-to-text: tiny Whisper variant keeps download size and inference cost low.
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Pick the device first so the pipeline dtype can match it: float16 is only
# well supported on CUDA — loading fp16 weights and running on CPU fails or
# is unusably slow, so fall back to float32 there.
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
text_to_image.to(device)
text_to_image.enable_attention_slicing()  # lower peak memory at a small speed cost
# NOTE(review): disabling the safety checker ships unfiltered generations —
# confirm this is acceptable for the deployment.
text_to_image.safety_checker = None
# DPM-Solver++ scheduler: good quality in fewer denoising steps than the default.
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
# Preprocess audio file into NumPy array
def preprocess_audio(audio_path):
    """Load *audio_path* and return it as a float32 NumPy array resampled to 16 kHz.

    On any failure, returns an "Error ..." message string instead of raising,
    matching this file's error-as-string convention.
    """
    try:
        waveform, _sample_rate = librosa.load(audio_path, sr=16000)  # 16 kHz: Whisper's expected rate
        return np.array(waveform, dtype=np.float32)
    except Exception as e:
        return f"Error in preprocessing audio: {str(e)}"
# Speech-to-text function
# Result cache for transcriptions. `@lru_cache` cannot be used on an `async def`:
# it caches the coroutine object, which can only be awaited once, so any repeated
# call with the same path would raise "cannot reuse already awaited coroutine".
_transcription_cache = {}

async def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* to text.

    Returns the transcription string, or an "Error ..." message string on
    failure (matching the file's error-as-string convention). Successful
    results are cached per path; errors are not, so transient failures retry.
    """
    if audio_path in _transcription_cache:
        return _transcription_cache[audio_path]
    try:
        audio_array = preprocess_audio(audio_path)
        if isinstance(audio_array, str):  # Error message from preprocessing
            return audio_array
        result = speech_to_text(audio_array)
        transcription = result["text"]
        # Keep the cache bounded like the original lru_cache(maxsize=10):
        # evict the oldest entry (dicts preserve insertion order).
        if len(_transcription_cache) >= 10:
            _transcription_cache.pop(next(iter(_transcription_cache)))
        _transcription_cache[audio_path] = transcription
        return transcription
    except Exception as e:
        return f"Error in transcription: {str(e)}"
# Text-to-image function
# Result cache for generated images. As with transcription, `@lru_cache` on an
# `async def` caches the single-use coroutine object, breaking repeated calls.
_image_cache = {}

async def generate_image_from_text(text):
    """Generate a 256x256 image from *text* with the Stable Diffusion pipeline.

    Returns a PIL image, or an "Error ..." message string on failure.
    Successful results are cached per prompt (bounded to 10 entries, mirroring
    the original lru_cache(maxsize=10)).
    """
    if text in _image_cache:
        return _image_cache[text]
    try:
        image = text_to_image(text, height=256, width=256).images[0]
        if len(_image_cache) >= 10:
            _image_cache.pop(next(iter(_image_cache)))
        _image_cache[text] = image
        return image
    except Exception as e:
        return f"Error in image generation: {str(e)}"
# Combined processing function
async def process_audio_and_generate_image(audio_path):
    """Transcribe *audio_path*, then generate an image from the transcription.

    Returns ``(image, transcription)`` on success, or ``(None, error_message)``
    when either stage fails.

    Image generation depends on the transcription, so the two stages run
    sequentially. The original gathered both coroutines concurrently and only
    worked by accident (transcribe_audio contained no awaits, so it happened to
    finish before the image coroutine read its result); it also crashed with a
    TypeError on ``"Error" in None`` when transcription never completed.
    """
    transcription = await transcribe_audio(audio_path)
    # Guard against both a None/empty result and an explicit error string.
    if not transcription:
        return None, "Error in transcription: empty result"
    if "Error" in transcription:
        return None, transcription
    image = await generate_image_from_text(transcription)
    if isinstance(image, str) and "Error" in image:
        return None, image
    return image, transcription
# Streamlit interface
st.title("Voice-to-Image Generator")
st.write("Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.")
audio_file = st.file_uploader("Upload audio file (WAV/MP3)", type=["wav", "mp3"])
if audio_file:
    # Persist the upload to disk because librosa.load works on file paths.
    audio_path = f"temp_{audio_file.name}"
    with open(audio_path, "wb") as f:
        f.write(audio_file.read())
    try:
        with st.spinner("Processing..."):
            # asyncio.run creates and *closes* its own event loop; the original
            # new_event_loop()/set_event_loop() pair was never closed and leaked
            # a loop on every Streamlit rerun.
            image, transcription = asyncio.run(process_audio_and_generate_image(audio_path))
    finally:
        # Remove the temp copy of the upload regardless of outcome.
        try:
            os.remove(audio_path)
        except OSError:
            pass
    if transcription and "Error" not in transcription:
        st.subheader("Transcription")
        st.write(transcription)
        if image:
            st.subheader("Generated Image")
            st.image(image, caption="Generated from transcription")
        else:
            st.error("Error in generating image.")
    else:
        st.error(transcription)
# NOTE: trailing viewer artifact removed.