File size: 13,366 Bytes
5a352b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23a54a8
 
 
 
 
bb91481
23a54a8
 
 
bb91481
23a54a8
 
 
 
 
 
bb91481
23a54a8
 
 
bb91481
23a54a8
 
bb91481
23a54a8
bb91481
23a54a8
 
 
 
 
 
bb91481
23a54a8
 
bb91481
23a54a8
bb91481
23a54a8
 
 
 
bb91481
23a54a8
 
bb91481
23a54a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b400ba0
23a54a8
 
 
 
fa7605e
24bf5a8
23a54a8
f849521
b430e75
24bf5a8
 
23a54a8
 
 
 
 
 
 
 
 
 
3e82da2
9e36c1d
b430e75
23a54a8
 
 
 
 
98fa131
23a54a8
 
24bf5a8
f849521
b430e75
24bf5a8
bb91481
23a54a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a729e3
23a54a8
82e5655
23a54a8
5a729e3
 
 
 
 
7859725
23a54a8
 
 
 
 
 
 
 
 
 
2597bef
23a54a8
 
ed452e9
23a54a8
 
 
b27f82d
23a54a8
 
 
 
 
b27f82d
23a54a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b27f82d
23a54a8
 
 
 
 
 
 
 
db0448e
23a54a8
 
b430e75
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# import gradio as gr
# import requests
# import time
# from PIL import Image
# from io import BytesIO

# # AssemblyAI API Key
# ASSEMBLYAI_API_KEY = "your_assemblyai_api_key_here"
# # DeepAI API Key
# DEEPAI_API_KEY = "your_deepai_api_key_here"

# # Function to convert speech to text using AssemblyAI API
# def speech_to_text(audio_file):
#     # Upload audio to AssemblyAI for transcription
#     upload_url = "https://api.assemblyai.com/v2/upload"
#     headers = {
#         "authorization": ASSEMBLYAI_API_KEY
#     }

#     # Upload the audio file to AssemblyAI
#     with open(audio_file, 'rb') as file:
#         response = requests.post(upload_url, headers=headers, files={"file": file})

#     if response.status_code != 200:
#         return "Error uploading audio."

#     audio_url = response.json()["upload_url"]

#     # Request transcription from AssemblyAI
#     transcript_url = "https://api.assemblyai.com/v2/transcript"
#     transcript_request = {
#         "audio_url": audio_url
#     }
#     transcript_response = requests.post(transcript_url, json=transcript_request, headers=headers)

#     if transcript_response.status_code != 200:
#         return "Error requesting transcription."

#     transcript_id = transcript_response.json()["id"]

#     # Poll for transcription completion
#     while True:
#         polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
#         polling_response = requests.get(polling_url, headers=headers)

#         if polling_response.status_code != 200:
#             return "Error polling for transcription status."

#         status = polling_response.json()["status"]
#         if status == "completed":
#             return polling_response.json()["text"]
#         elif status == "failed":
#             return "Transcription failed."

#         time.sleep(5)  # Wait 5 seconds before polling again

# # Function to generate an image based on text using DeepAI's Image Generation API
# def generate_image_from_text(text):
#     image_generation_url = "https://api.deepai.org/api/text2img"
#     headers = {
#         "api-key": DEEPAI_API_KEY
#     }
#     payload = {
#         "text": text
#     }

#     # Request image generation from DeepAI
#     response = requests.post(image_generation_url, data=payload, headers=headers)

#     if response.status_code == 200:
#         # Get the image URL from the response
#         image_url = response.json()["output_url"]
#         return image_url
#     else:
#         return "Failed to generate image."

# # Function to download image from URL and return as a PIL image
# def get_image_from_url(image_url):
#     try:
#         response = requests.get(image_url)
#         img = Image.open(BytesIO(response.content))
#         return img
#     except Exception as e:
#         return "Error downloading image: " + str(e)

# # Gradio Interface function
# def process_audio(audio_file):
#     # Convert speech to text
#     text = speech_to_text(audio_file)
#     if text and text != "Error uploading audio." and text != "Error requesting transcription.":
#         print(f"Transcribed text: {text}")  # Debug output for transcribed text
        
#         # Generate image from the transcribed text
#         image_url = generate_image_from_text(text)
#         if "Failed" not in image_url:
#             print(f"Image URL: {image_url}")  # Debug output for image URL
#             # Download the image from URL and return it as a PIL image
#             return get_image_from_url(image_url)
#         else:
#             return image_url
#     else:
#         return "Error processing audio."

# # Set up Gradio interface
# iface = gr.Interface(fn=process_audio,
#                      inputs=gr.Audio(type="filepath"),  # Audio input
#                      outputs=gr.Image(type="pil"),  # Image output as PIL image
#                      live=True,
#                      title="Speech-to-Text to Image Generator")

# iface.launch()


# import gradio as gr
# import requests
# import time
# from PIL import Image
# from io import BytesIO

# # API keys
# ASSEMBLYAI_API_KEY = "your_assemblyai_api_key_here"
# STABILITY_AI_API_KEY = "your_stability_ai_api_key_here"

# # Function to convert speech to text using AssemblyAI API
# def speech_to_text(audio_file):
#     upload_url = "https://api.assemblyai.com/v2/upload"
#     headers = {
#         "authorization": ASSEMBLYAI_API_KEY
#     }

#     # Upload the audio file to AssemblyAI
#     with open(audio_file, 'rb') as file:
#         response = requests.post(upload_url, headers=headers, files={"file": file})

#     if response.status_code != 200:
#         return "Error uploading audio."

#     audio_url = response.json()["upload_url"]

#     # Request transcription from AssemblyAI
#     transcript_url = "https://api.assemblyai.com/v2/transcript"
#     transcript_request = {
#         "audio_url": audio_url
#     }
#     transcript_response = requests.post(transcript_url, json=transcript_request, headers=headers)

#     if transcript_response.status_code != 200:
#         return "Error requesting transcription."

#     transcript_id = transcript_response.json()["id"]

#     # Poll for transcription completion
#     while True:
#         polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
#         polling_response = requests.get(polling_url, headers=headers)

#         if polling_response.status_code != 200:
#             return "Error polling for transcription status."

#         status = polling_response.json()["status"]
#         if status == "completed":
#             return polling_response.json()["text"]
#         elif status == "failed":
#             return "Transcription failed."

#         time.sleep(5)  # Wait 5 seconds before polling again

# # Function to generate an image based on text using Stability AI (Stable Diffusion)
# def generate_image_from_text(text):
#     image_generation_url = "https://stability.ai/api/v3/generate"  # Stability AI API endpoint (assuming)
#     headers = {
#         "Authorization": f"Bearer {STABILITY_AI_API_KEY}"
#     }
#     payload = {
#         "text": text,
#         "width": 512,  # Adjust image dimensions as needed
#         "height": 512
#     }

#     # Request image generation from Stability AI
#     response = requests.post(image_generation_url, json=payload, headers=headers)

#     if response.status_code == 200:
#         # Get the image URL from the response (assuming the response contains a URL)
#         image_url = response.json().get("image_url", "")
#         if image_url:
#             return image_url
#         else:
#             return "Failed to generate image: No image URL found in response."
#     else:
#         return f"Failed to generate image: {response.status_code}"

# # Function to download image from URL and return as a PIL image
# def get_image_from_url(image_url):
#     try:
#         response = requests.get(image_url)
#         img = Image.open(BytesIO(response.content))
#         return img
#     except Exception as e:
#         return f"Error downloading image: {str(e)}"

# # Gradio Interface function
# def process_audio(audio_file):
#     # Convert speech to text
#     text = speech_to_text(audio_file)
#     if text and text != "Error uploading audio." and text != "Error requesting transcription.":
#         print(f"Transcribed text: {text}")  # Debug output for transcribed text
        
#         # Generate image from the transcribed text
#         image_url = generate_image_from_text(text)
#         if "Failed" not in image_url:
#             print(f"Image URL: {image_url}")  # Debug output for image URL
#             # Download the image from URL and return it as a PIL image
#             return get_image_from_url(image_url)
#         else:
#             return image_url
#     else:
#         return "Error processing audio."

# # Set up Gradio interface
# iface = gr.Interface(fn=process_audio,
#                      inputs=gr.Audio(type="filepath"),  # Audio input
#                      outputs=gr.Image(type="pil"),  # Image output as PIL image
#                      live=True,
#                      title="Speech-to-Text to Image Generator")

# iface.launch()
#1st D
import subprocess

# Install required libraries.
# pip is invoked as "<this interpreter> -m pip" so the packages are installed into
# the environment that is actually running this script, rather than into whichever
# "pip" executable happens to be first on PATH. All requirements are passed in a
# single call so pip resolves their version constraints together.
import sys

subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "torch>=1.11.0",
    "transformers",
    "diffusers>=0.14.0",
    "librosa",
    "accelerate>= 0.20.1",
    "safetensors>=0.1.0",
    "huggingface_hub>=0.16.4",
])

import os
import threading
import numpy as np
import diffusers
from functools import lru_cache
import gradio as gr
from transformers import pipeline
from huggingface_hub import login
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
import librosa
import accelerate
import pandas
import safetensors
import torch  # Import torch here to avoid the NameError

# Ensure required dependencies are installed
def install_missing_packages(required_packages=None):
    """Install every required package that cannot currently be imported.

    Args:
        required_packages: Optional mapping of importable package name to a
            pip version specifier string (for example ``">=0.14.0"``) or
            ``None`` for "any version". When omitted, the application's
            default requirement set is used.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    import importlib
    import sys

    if required_packages is None:
        required_packages = {
            "librosa": None,
            "diffusers": ">=0.14.0",
            "gradio": ">=3.35.2",
            "huggingface_hub": ">=0.16.4",
            "accelerate": ">= 0.20.1",
            "safetensors": ">=0.1.0",
            "torch": ">=1.11.0",
        }

    installed_any = False
    for package, version in required_packages.items():
        try:
            importlib.import_module(package)
        except ImportError:
            requirement = f"{package}{version}" if version else package
            # Use "<this interpreter> -m pip" so the install targets the
            # environment running this script, not an arbitrary pip on PATH.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", requirement]
            )
            installed_any = True

    if installed_any:
        # Make freshly installed packages visible to the import system.
        importlib.invalidate_caches()

install_missing_packages()

# Authenticate with the Hugging Face Hub. The access token must be supplied
# through the HF_TOKEN environment variable; startup aborts without it.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not set.")
login(hf_token)

# Speech-to-text: Whisper (tiny checkpoint) wrapped in a transformers pipeline.
speech_to_text = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
)

# Text-to-image: Stable Diffusion v1.5.
text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Alternative load kept for reference (custom cache directory, fp16 revision):
# text_to_image = StableDiffusionPipeline.from_pretrained(
#     "runwayml/stable-diffusion-v1-5",
#     cache_dir="./my_model_cache",  # Custom cache directory
#     revision="fp16"
# )

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
text_to_image.to(device)

# Trade a little speed for lower peak memory during attention.
text_to_image.enable_attention_slicing()
# The safety checker is switched off here to speed up generation.
text_to_image.safety_checker = None
# Swap in the DPM-Solver multistep scheduler, reusing the current scheduler's config.
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(
    text_to_image.scheduler.config
)

# Preprocess audio file into NumPy array
def preprocess_audio(audio_path):
    """Load an audio file as a 16 kHz float32 waveform.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The samples as a ``float32`` NumPy array, or, if loading fails for
        any reason, a string starting with ``"Error in preprocessing audio: "``.
    """
    try:
        # librosa resamples to 16 kHz on load.
        audio, _ = librosa.load(audio_path, sr=16000)
        # Keep float32: the speech-recognition pipeline documents float32/float64
        # input, and a float16 cast only throws away precision.
        return np.asarray(audio, dtype=np.float32)
    except Exception as e:
        return f"Error in preprocessing audio: {str(e)}"

# Speech-to-text function
@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Return the transcription of the audio file at ``audio_path``.

    Results are memoised per path (up to 10 entries). Failures are reported
    as a returned message string rather than raised: either the message
    produced by ``preprocess_audio`` or one starting with
    ``"Error in transcription: "``.
    """
    try:
        waveform = preprocess_audio(audio_path)
        # preprocess_audio signals failure by handing back a message string.
        if not isinstance(waveform, str):
            return speech_to_text(waveform)["text"]
        return waveform
    except Exception as exc:
        return f"Error in transcription: {exc!s}"

# Text-to-image function
@lru_cache(maxsize=10)
def generate_image_from_text(text):
    """Generate a 256x256 image for the prompt ``text``.

    Results are memoised per prompt (up to 10 entries). On failure a string
    starting with ``"Error in image generation: "`` is returned instead of
    an image.
    """
    try:
        # Small output size is requested to keep generation fast.
        output = text_to_image(text, height=256, width=256)
        return output.images[0]
    except Exception as exc:
        return f"Error in image generation: {exc!s}"

# Combined processing function
def process_audio_and_generate_image(audio_path):
    """Transcribe an audio file, then generate an image from the transcription.

    The two stages run one after the other because image generation needs the
    finished transcription as its prompt; starting both at once would let the
    image stage read the transcription before it has been produced.

    Args:
        audio_path: Path of the uploaded audio file, or ``None``/empty when
            nothing was provided.

    Returns:
        A ``(image, text)`` tuple. On success ``image`` is the generated image
        and ``text`` is the transcription. On failure ``image`` is ``None``
        and ``text`` is the error message (or the empty transcription).
    """
    if not audio_path:
        return None, "Error: no audio file provided."

    # Prefixes of the messages the transcription stage returns on failure.
    # Matching on the prefix (not on any occurrence of "Error") keeps a
    # legitimate transcript that merely contains that word from being rejected.
    transcription_error_prefixes = (
        "Error in preprocessing audio:",
        "Error in transcription:",
    )

    transcription = transcribe_audio(audio_path)
    if not transcription:
        # Nothing was transcribed, so there is no prompt to generate from.
        return None, transcription or ""
    if transcription.startswith(transcription_error_prefixes):
        return None, transcription

    image = generate_image_from_text(transcription)
    # The image stage returns a message string only when generation failed.
    if isinstance(image, str):
        return None, image

    return image, transcription

# Gradio interface: one audio upload in, generated image plus transcription out.
audio_input = gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)")
result_outputs = [
    gr.Image(label="Generated Image"),
    gr.Textbox(label="Transcription"),
]
iface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=audio_input,
    outputs=result_outputs,
    title="Voice-to-Image Generator",
    description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
)

# Start the app with debug output enabled and a public share link requested.
iface.launch(debug=True, share=True)
#2 D