File size: 4,173 Bytes
68add06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import subprocess
import os
import threading
import numpy as np
import librosa
import gradio as gr
from functools import lru_cache
from transformers import pipeline
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
import torch
from huggingface_hub import login

# Install required dependencies
def install_missing_packages(required_packages=None):
    """Install every required package that cannot currently be imported.

    Each package is probed by importing it. When the import raises
    ``ImportError`` the package is installed with pip, using the optional
    version specifier as part of the requirement string.

    Args:
        required_packages: Optional mapping of importable package name to a
            version specifier such as ``">=0.14.0"``, or ``None`` for no
            constraint. When omitted, this application's dependency table
            is used.

    Returns:
        The requirement strings that were handed to pip, in installation
        order. The list is empty when everything was already importable.

    Raises:
        subprocess.CalledProcessError: If a pip invocation exits non-zero.
    """
    # Imported locally so this function does not depend on extra file-level imports.
    import importlib
    import sys

    if required_packages is None:
        required_packages = {
            "librosa": None,
            "diffusers": ">=0.14.0",
            "gradio": ">=3.35.2",
            "huggingface_hub": None,
            "accelerate": ">=0.20.1",
            "transformers": ">=4.31.0",
            "torch": ">=1.11.0",
        }

    installed = []
    for package, version in required_packages.items():
        try:
            importlib.import_module(package)
        except ImportError:
            requirement = f"{package}{version}" if version else package
            # Run pip through the interpreter executing this script: a bare
            # "pip" found on PATH may install into a different Python
            # environment than the one that will import the package.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", requirement]
            )
            installed.append(requirement)

    if installed:
        # Let the import system notice the freshly installed distributions.
        importlib.invalidate_caches()
    return installed

# Try to install anything that is missing.
# NOTE: the third-party imports at the top of this file have already executed
# by the time this call runs.
install_missing_packages()

# Hugging Face token authentication: the token is read from the environment
# and start-up is aborted when it is absent or empty.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not set.")
login(hf_token)

# Load the speech-to-text model
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Choose the device first so the weights can be loaded in a dtype it supports.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Stable Diffusion model.
# Half precision is requested only on CUDA; the CPU fallback is loaded in
# float32 because float16 inference is not reliably supported on CPU.
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
text_to_image.to(device)
text_to_image.enable_attention_slicing()  # Optimizes memory usage
text_to_image.safety_checker = None  # Disables safety checker
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)

# Preprocess audio file into NumPy array
def preprocess_audio(audio_path):
    """Load an audio file as a float32 NumPy array.

    The file is loaded through ``librosa.load`` with a requested sample rate
    of 16000 Hz.

    Returns:
        The sample array on success. If anything raises, a string beginning
        with "Error in preprocessing audio: " is returned instead, so callers
        tell the two outcomes apart by type.
    """
    target_rate = 16000  # Resample to 16kHz
    try:
        samples, _rate = librosa.load(audio_path, sr=target_rate)
        waveform = np.array(samples, dtype=np.float32)
    except Exception as exc:
        return f"Error in preprocessing audio: {exc}"
    return waveform

# Transcribe audio to text
class _AudioPreprocessingError(Exception):
    """Carries the error message returned by preprocess_audio out of the cached helper."""


@lru_cache(maxsize=10)
def _transcribe_audio_cached(audio_path):
    """Transcribe one audio file, raising on any failure.

    ``lru_cache`` only stores return values, so signalling failures with
    exceptions keeps them out of the cache and lets a later call retry.
    """
    audio_array = preprocess_audio(audio_path)
    if isinstance(audio_array, str):  # Error message from preprocessing
        raise _AudioPreprocessingError(audio_array)
    result = speech_to_text(audio_array)
    return result["text"]


def transcribe_audio(audio_path):
    """Transcribe the audio file at ``audio_path`` to text.

    Successful transcriptions are memoised per path (up to 10 entries).
    Failures are never cached, so a transient error is not replayed on the
    next request for the same file.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcribed text, or an error string: the message produced by
        ``preprocess_audio`` when loading failed, otherwise a message
        beginning with "Error in transcription: ".
    """
    try:
        return _transcribe_audio_cached(audio_path)
    except _AudioPreprocessingError as e:
        # Pass the preprocessing message through unchanged.
        return str(e)
    except Exception as e:
        return f"Error in transcription: {str(e)}"


# Keep the cache-management helpers the lru_cache-decorated function exposed.
transcribe_audio.cache_info = _transcribe_audio_cached.cache_info
transcribe_audio.cache_clear = _transcribe_audio_cached.cache_clear

# Generate image from text
@lru_cache(maxsize=10)
def _generate_image_cached(text):
    """Run the text-to-image pipeline for ``text`` at 512x512, raising on failure.

    ``lru_cache`` only stores return values, so letting exceptions propagate
    keeps failed generations out of the cache.
    """
    return text_to_image(text, height=512, width=512).images[0]


def generate_image_from_text(text):
    """Generate an image for the prompt ``text``.

    Successful results are memoised per prompt (up to 10 entries), so a
    repeated prompt returns the previously generated image. Failures are
    never cached, so a transient error is retried on the next request.

    Args:
        text: The prompt passed to the text-to-image pipeline.

    Returns:
        The first image produced by the pipeline, or a string beginning with
        "Error in image generation: " when the pipeline raised.
    """
    try:
        return _generate_image_cached(text)
    except Exception as e:
        return f"Error in image generation: {str(e)}"


# Keep the cache-management helpers the lru_cache-decorated function exposed.
generate_image_from_text.cache_info = _generate_image_cached.cache_info
generate_image_from_text.cache_clear = _generate_image_cached.cache_clear

# Process audio input (speech-to-image)
def speech_to_image(audio_path):
    """Transcribe an audio file and generate an image from the transcription.

    Args:
        audio_path: Path of the uploaded audio file.

    Returns:
        The generated image.

    Raises:
        gr.Error: If transcription or image generation failed. The handler
            feeds a single image output, so failures are raised instead of
            being returned as an extra value next to the image.
    """
    transcription = transcribe_audio(audio_path)
    # Match only the exact prefixes of the error strings produced by
    # transcribe_audio / preprocess_audio; a plain substring test for "Error"
    # would reject genuine speech that happens to contain that word.
    error_prefixes = ("Error in transcription:", "Error in preprocessing audio:")
    if transcription.startswith(error_prefixes):
        raise gr.Error(f"Transcription failed: {transcription}")

    image = generate_image_from_text(transcription)
    # generate_image_from_text returns a string only when generation failed.
    if isinstance(image, str):
        raise gr.Error(f"Image generation failed: {image}")

    return image

# Process text input (text-to-image)
def text_to_image_interface(input_text):
    """Generate an image from the text entered by the user.

    Args:
        input_text: The prompt typed into the text box.

    Returns:
        The generated image.

    Raises:
        gr.Error: If image generation failed. generate_image_from_text
            reports failures as a returned string rather than an exception,
            so the result is checked by type and the message is raised
            instead of being passed to the image output.
    """
    image = generate_image_from_text(input_text)
    if isinstance(image, str):
        raise gr.Error(image)
    return image

# Gradio interface
# Tab 1: uploaded audio file in, generated image out.
speech_to_image_interface = gr.Interface(
    title="Speech-to-Image Generator",
    description="Upload an audio file to generate an image based on the transcribed speech.",
    fn=speech_to_image,
    inputs=gr.Audio(label="Upload audio file (WAV/MP3)", type="filepath"),
    outputs=gr.Image(label="Generated Image"),
)

# Tab 2: free-text prompt in, generated image out.
# NOTE: this assignment rebinds the name of the handler function defined
# earlier; the fn= argument is evaluated before the rebinding takes place, so
# the interface still wraps that function.
text_to_image_interface = gr.Interface(
    title="Text-to-Image Generator",
    description="Enter text to generate an image based on the description.",
    fn=text_to_image_interface,
    inputs=gr.Textbox(placeholder="Describe an image...", label="Enter Text"),
    outputs=gr.Image(label="Generated Image"),
)

# Combine both interfaces into one tabbed Gradio app.
app = gr.TabbedInterface(
    tab_names=["Speech-to-Image", "Text-to-Image"],
    interface_list=[speech_to_image_interface, text_to_image_interface],
)

# Launch the app with a public share link and debug mode enabled.
app.launch(share=True, debug=True)