|
|
import importlib
import os
import subprocess
import sys
import threading
from functools import lru_cache

import gradio as gr
import librosa
import numpy as np
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from huggingface_hub import login
from transformers import pipeline
|
|
|
|
|
|
|
|
def install_missing_packages(required_packages=None):
    """Install every required package that cannot currently be imported.

    Each package is probed with an import. When the import fails, pip is run
    for that package, using its version specifier if one is given.

    Args:
        required_packages: Optional mapping of importable package name to a
            pip version specifier such as ``">=0.14.0"``, or ``None`` for
            "any version". Defaults to the packages this application uses.

    Raises:
        subprocess.CalledProcessError: If a pip invocation exits non-zero.

    Note:
        Only *missing* packages are installed. A package that imports
        successfully but is older than its specifier is left untouched.
    """
    if required_packages is None:
        required_packages = {
            "librosa": None,
            "diffusers": ">=0.14.0",
            "gradio": ">=3.35.2",
            "huggingface_hub": None,
            "accelerate": ">=0.20.1",
            "transformers": ">=4.31.0",
            "torch": ">=1.11.0",
        }

    installed_any = False
    for package, version in required_packages.items():
        try:
            importlib.import_module(package)
        except ImportError:
            requirement = f"{package}{version}" if version else package
            # Run pip through the running interpreter so the package lands in
            # this environment; a bare "pip" on PATH may belong to another
            # Python installation.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", requirement]
            )
            installed_any = True

    if installed_any:
        # Let the import system notice modules installed while running.
        importlib.invalidate_caches()
|
|
|
|
|
# Install any dependency that is not importable yet.
# NOTE(review): the third-party imports at the top of this file execute before
# this call, so a package that is missing fails there first.
install_missing_packages()

# Authenticate against the Hugging Face Hub; start-up aborts without a token.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not set.")
login(hf_token)

# Speech recognition pipeline backed by the Whisper "tiny" checkpoint.
speech_to_text = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
)
|
|
|
|
|
|
|
|
# Choose the device first so the weights can be loaded in a dtype it supports:
# half precision is only requested on CUDA, full precision is used on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
text_to_image.to(device)

# Compute attention in slices to lower peak memory use.
text_to_image.enable_attention_slicing()

# NOTE(review): this disables the pipeline's safety checker, so generated
# images are returned unfiltered. Confirm this is intended for a shared app.
text_to_image.safety_checker = None

# Replace the default scheduler with DPM-Solver multistep, reusing its config.
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(
    text_to_image.scheduler.config
)
|
|
|
|
|
|
|
|
def preprocess_audio(audio_path):
    """Load an audio file as a float32 waveform resampled to 16 kHz.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        A ``numpy.float32`` array holding the samples, or, when loading
        fails, a string starting with ``"Error in preprocessing audio: "``.
    """
    try:
        # The sample rate returned by librosa is the one requested; unused.
        waveform, _ = librosa.load(audio_path, sr=16000)
        samples = np.array(waveform, dtype=np.float32)
    except Exception as exc:
        return f"Error in preprocessing audio: {str(exc)}"
    else:
        return samples
|
|
|
|
|
|
|
|
class _AudioPreprocessingError(Exception):
    """Carries the error message returned by ``preprocess_audio``."""


@lru_cache(maxsize=10)
def _transcribe_cached(audio_path):
    """Transcribe *audio_path*, raising instead of returning on failure.

    ``lru_cache`` stores return values only, so raising here keeps failed
    attempts out of the cache and lets a later call retry.
    """
    audio_array = preprocess_audio(audio_path)
    if isinstance(audio_array, str):
        # preprocess_audio reports failures as a message string.
        raise _AudioPreprocessingError(audio_array)
    return speech_to_text(audio_array)["text"]


def transcribe_audio(audio_path):
    """Return the transcription of the audio file at *audio_path*.

    Successful transcriptions are cached for the 10 most recently used
    paths. The cache key is the path itself, so a different file later
    stored under the same path yields the earlier transcription.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcribed text. On failure, an error message string: the
        message from ``preprocess_audio`` unchanged, or one starting with
        ``"Error in transcription: "``. Failures are never cached.
    """
    try:
        return _transcribe_cached(audio_path)
    except _AudioPreprocessingError as e:
        return str(e)
    except Exception as e:
        return f"Error in transcription: {str(e)}"


# Keep the cache-management helpers the decorated function used to expose.
transcribe_audio.cache_info = _transcribe_cached.cache_info
transcribe_audio.cache_clear = _transcribe_cached.cache_clear
|
|
|
|
|
|
|
|
@lru_cache(maxsize=10)
def _generate_image_cached(text):
    """Run the diffusion pipeline for *text*, raising on failure.

    ``lru_cache`` stores return values only, so raising here keeps failed
    attempts out of the cache and lets a later call retry.
    """
    return text_to_image(text, height=512, width=512).images[0]


def generate_image_from_text(text):
    """Generate a 512x512 image for the prompt *text*.

    Successful results are cached for the 10 most recently used prompts, so
    repeating a prompt returns the same image object without re-running the
    pipeline.

    Args:
        text: Prompt describing the image.

    Returns:
        The first image produced by the pipeline. On failure, a string
        starting with ``"Error in image generation: "``. Failures are
        never cached.
    """
    try:
        return _generate_image_cached(text)
    except Exception as e:
        return f"Error in image generation: {str(e)}"


# Keep the cache-management helpers the decorated function used to expose.
generate_image_from_text.cache_info = _generate_image_cached.cache_info
generate_image_from_text.cache_clear = _generate_image_cached.cache_clear
|
|
|
|
|
|
|
|
def speech_to_image(audio_path):
    """Transcribe an audio file and generate an image from the transcription.

    This is the handler for an interface with a single image output, so it
    always returns exactly one value. Failures are raised as ``gr.Error``
    so Gradio can report them, instead of being returned as extra values
    the image output cannot render.

    Args:
        audio_path: Path of the uploaded audio file.

    Returns:
        The generated image.

    Raises:
        gr.Error: If transcription or image generation fails.
    """
    transcription = transcribe_audio(audio_path)
    # Match the exact prefixes of the error messages produced upstream; a
    # substring test for "Error" would reject speech that contains that word.
    error_prefixes = ("Error in preprocessing audio:", "Error in transcription:")
    if transcription.startswith(error_prefixes):
        raise gr.Error(f"Transcription failed: {transcription}")

    image = generate_image_from_text(transcription)
    # generate_image_from_text reports failures as a message string.
    if isinstance(image, str):
        raise gr.Error(f"Image generation failed: {image}")

    return image
|
|
|
|
|
|
|
|
def text_to_image_interface(input_text):
    """Generate an image for the text entered in the UI.

    Failures are raised as ``gr.Error`` so Gradio can report them; an error
    string cannot be rendered by an image output.

    Args:
        input_text: Prompt describing the image.

    Returns:
        The generated image.

    Raises:
        gr.Error: If image generation fails.
    """
    try:
        image = generate_image_from_text(input_text)
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e

    # generate_image_from_text reports failures as a message string.
    if isinstance(image, str):
        raise gr.Error(image)
    return image
|
|
|
|
|
|
|
|
# Tab 1: an uploaded audio file is passed to speech_to_image as a file path
# and the result is shown in a single image output.
speech_to_image_interface = gr.Interface(
    title="Speech-to-Image Generator",
    description="Upload an audio file to generate an image based on the transcribed speech.",
    fn=speech_to_image,
    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
    outputs=gr.Image(label="Generated Image"),
)
|
|
|
|
|
# Tab 2: free-text prompt in, single image out.
# This assignment rebinds the name ``text_to_image_interface`` from the
# handler function to the Interface object. The function is still the one
# passed as ``fn`` because the right-hand side is evaluated first.
text_to_image_interface = gr.Interface(
    title="Text-to-Image Generator",
    description="Enter text to generate an image based on the description.",
    fn=text_to_image_interface,
    inputs=gr.Textbox(label="Enter Text", placeholder="Describe an image..."),
    outputs=gr.Image(label="Generated Image"),
)
|
|
|
|
|
|
|
|
# Combine both interfaces into one application, one tab per interface.
app = gr.TabbedInterface(
    tab_names=["Speech-to-Image", "Text-to-Image"],
    interface_list=[speech_to_image_interface, text_to_image_interface],
)

# Start the server when this module is executed.
# NOTE(review): share=True asks Gradio for a public share link; confirm that
# exposing the app publicly is intended.
app.launch(share=True, debug=True)
|
|
|