File size: 4,563 Bytes
68f198e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665107f
68f198e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

import os
import numpy as np
import torch
import librosa
import asyncio
import streamlit as st
from transformers import pipeline
from huggingface_hub import login
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from functools import lru_cache

# Install missing packages (if required, handled manually for Streamlit environment)
def install_missing_packages():
    """Best-effort install of runtime dependencies that are not importable.

    Tries to import each required package; on ImportError, shells out to pip
    to install it (with a version pin where one is given). Stops at the first
    failed install rather than continuing with a broken environment.
    """
    # Local imports: the original body used `subprocess` without importing it
    # anywhere in the file, which raised NameError as soon as a package was
    # actually missing.
    import subprocess
    import sys

    required_packages = {
        "librosa": None,
        "diffusers": ">=0.14.0",
        "transformers": None,
        # CUDA 11.8 wheel; NOTE(review): plain pip needs the PyTorch extra
        # index configured to resolve "+cu118" builds -- confirm.
        "torch": "==2.0.0+cu118",
    }
    for package, version in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            package_name = f"{package}{version}" if version else package
            print(f"Installing {package_name}...")
            try:
                # Use this interpreter's pip so packages land in the same
                # environment the script is running in.
                subprocess.check_call(
                    [sys.executable, "-m", "pip", "install", package_name]
                )
            except subprocess.CalledProcessError as e:
                print(f"Error installing {package_name}: {e}")
                return

# Attempt to install anything missing. NOTE(review): the imports at the top of
# the file would already have raised ImportError if these packages were absent,
# so this call can only succeed when everything is importable -- confirm intent.
install_missing_packages()

# Authenticate with Hugging Face Hub
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
else:
    # Fail fast: the model downloads below need an authenticated session.
    raise ValueError("HF_TOKEN environment variable not set.")

# Load models
# Whisper-tiny ASR pipeline; it is fed 16 kHz float32 audio produced by
# preprocess_audio() below.
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Stable Diffusion v1.5 loaded in fp16. NOTE(review): float16 weights on a
# CPU-only host can be slow or unsupported -- this setup presumes CUDA.
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image.to(device)
text_to_image.enable_attention_slicing()  # trades speed for lower peak memory
# Safety checker disabled: generated images are NOT filtered for NSFW content.
text_to_image.safety_checker = None
# Swap in the DPM-Solver multistep scheduler (good quality at few steps).
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)

# Preprocess audio file into NumPy array
def preprocess_audio(audio_path):
    """Load an audio file and return it as a float32 NumPy array at 16 kHz.

    On any failure, an error-description string is returned instead of
    raising -- callers detect errors by checking for a str result.
    """
    try:
        samples, _rate = librosa.load(audio_path, sr=16000)  # resample to 16 kHz
        return np.array(samples, dtype=np.float32)
    except Exception as exc:
        return f"Error in preprocessing audio: {str(exc)}"

# Speech-to-text function
# Result cache for transcribe_audio. lru_cache cannot be used on an async
# function: it caches the coroutine object itself, so a second call with the
# same path returns an already-awaited coroutine and awaiting it raises
# RuntimeError("cannot reuse already awaited coroutine").
_transcription_cache = {}

async def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with the Whisper pipeline.

    Returns the transcription text, or an "Error in ..." string on failure
    (callers detect errors by substring check). Successful results are
    memoized per path, bounded to 10 entries like the old lru_cache(maxsize=10).
    """
    if audio_path in _transcription_cache:
        return _transcription_cache[audio_path]
    try:
        audio_array = preprocess_audio(audio_path)
        if isinstance(audio_array, str):  # error message from preprocessing
            return audio_array
        result = speech_to_text(audio_array)
        transcription = result["text"]
    except Exception as e:
        return f"Error in transcription: {str(e)}"
    # Evict the oldest entry to keep the cache bounded.
    if len(_transcription_cache) >= 10:
        _transcription_cache.pop(next(iter(_transcription_cache)))
    _transcription_cache[audio_path] = transcription
    return transcription

# Text-to-image function
# Result cache for generate_image_from_text. lru_cache is unsafe on an async
# function: it would cache the coroutine object, and awaiting that cached
# coroutine a second time raises RuntimeError. Cache the finished result.
_image_cache = {}

async def generate_image_from_text(text):
    """Generate a 256x256 image for *text* with the Stable Diffusion pipeline.

    Returns the first generated image, or an "Error in ..." string on failure.
    Successful results are memoized per prompt, bounded to 10 entries like the
    old lru_cache(maxsize=10).
    """
    if text in _image_cache:
        return _image_cache[text]
    try:
        image = text_to_image(text, height=256, width=256).images[0]
    except Exception as e:
        return f"Error in image generation: {str(e)}"
    # Evict the oldest entry to keep the cache bounded.
    if len(_image_cache) >= 10:
        _image_cache.pop(next(iter(_image_cache)))
    _image_cache[text] = image
    return image

# Combined processing function
async def process_audio_and_generate_image(audio_path):
    """Transcribe *audio_path*, then generate an image from the transcription.

    Returns (image, transcription) on success, or (None, error_message) when
    either stage fails. Image generation depends on the transcription's value,
    so the two stages are awaited sequentially -- the previous asyncio.gather()
    version only worked by accident, because transcribe_audio had no await
    points and therefore completed before the image task ever ran.
    """
    transcription = await transcribe_audio(audio_path)
    # Guard against empty/failed transcription before spending time on
    # image generation (also avoids `"Error" in None` blowing up).
    if not transcription or "Error" in transcription:
        return None, transcription
    image = await generate_image_from_text(transcription)
    if isinstance(image, str) and "Error" in image:
        return None, image
    return image, transcription

# Streamlit interface
st.title("Voice-to-Image Generator")
st.write("Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.")

audio_file = st.file_uploader("Upload audio file (WAV/MP3)", type=["wav", "mp3"])

if audio_file:
    # Persist the upload to disk: the audio pipeline works on file paths.
    audio_path = f"temp_{audio_file.name}"
    with open(audio_path, "wb") as f:
        f.write(audio_file.read())

    try:
        with st.spinner("Processing..."):
            # asyncio.run creates and closes its own event loop; the previous
            # new_event_loop()/set_event_loop() approach leaked a loop on
            # every upload because it was never closed.
            image, transcription = asyncio.run(process_audio_and_generate_image(audio_path))
    finally:
        # Always remove the temporary copy of the upload, even on failure.
        try:
            os.remove(audio_path)
        except OSError:
            pass

    if transcription and "Error" not in transcription:
        st.subheader("Transcription")
        st.write(transcription)

        if image:
            st.subheader("Generated Image")
            st.image(image, caption="Generated from transcription")
        else:
            st.error("Error in generating image.")
    else:
        st.error(transcription)