Sayiqa commited on
Commit
68add06
·
verified ·
1 Parent(s): ea531b1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -0
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import subprocess
import sys
import threading
from functools import lru_cache

import gradio as gr
import librosa
import numpy as np
import torch
from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
from huggingface_hub import login
from transformers import pipeline
12
+
13
# Install required dependencies that are missing from the environment.
def install_missing_packages():
    """Pip-install any required package that cannot currently be imported.

    Maps importable module names to optional pip version specifiers; a
    package is installed only when importing it fails.
    """
    required_packages = {
        "librosa": None,
        "diffusers": ">=0.14.0",
        "gradio": ">=3.35.2",
        "huggingface_hub": None,
        "accelerate": ">=0.20.1",
        "transformers": ">=4.31.0",
        "torch": ">=1.11.0",
    }
    for package, version in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            # e.g. "diffusers>=0.14.0"; bare name when no pin is given.
            package_name = f"{package}{version}" if version else package
            # Invoke pip via the running interpreter so the install lands in
            # the same environment this script executes in — a bare "pip" on
            # PATH may belong to a different Python installation.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

install_missing_packages()
32
+
33
# Hugging Face token authentication: fail fast at startup if the token is
# absent, since the model downloads below may require authentication.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
else:
    raise ValueError("HF_TOKEN environment variable not set.")

# Load the speech-to-text model (whisper-tiny keeps memory/latency low).
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Load Stable Diffusion in half precision to reduce memory use.
# NOTE(review): float16 weights on a CPU-only host can be slow or
# unsupported by some ops — confirm the deployment target has a GPU.
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16
)
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image.to(device)
text_to_image.enable_attention_slicing()  # Optimizes memory usage
text_to_image.safety_checker = None  # Disables safety checker
# Swap in a faster multistep scheduler built from the existing config.
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
53
+
54
# Decode an audio file into a mono float32 waveform resampled to 16 kHz.
def preprocess_audio(audio_path):
    """Load *audio_path* as a float32 NumPy array at 16 kHz.

    On any decoding failure the error is reported as a string — callers
    detect failure with ``isinstance(result, str)``.
    """
    try:
        waveform, _sample_rate = librosa.load(audio_path, sr=16000)  # Resample to 16kHz
        return np.array(waveform, dtype=np.float32)
    except Exception as exc:
        return f"Error in preprocessing audio: {str(exc)}"
61
+
62
# Convert speech audio into text, caching recent results keyed by file path.
@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* with the Whisper pipeline.

    Returns the transcribed text, or an "Error ..." string on failure.
    NOTE(review): the cache key is the path, so a file rewritten at the
    same path would serve a stale transcription — confirm acceptable.
    """
    try:
        audio_array = preprocess_audio(audio_path)
        # preprocess_audio signals failure by returning a string.
        if isinstance(audio_array, str):
            return audio_array
        return speech_to_text(audio_array)["text"]
    except Exception as exc:
        return f"Error in transcription: {str(exc)}"
73
+
74
# Render a 512x512 image for a text prompt, caching recent prompts.
@lru_cache(maxsize=10)
def generate_image_from_text(text):
    """Generate a 512x512 image for *text* with Stable Diffusion.

    Returns the generated image, or an "Error ..." string on failure.
    """
    try:
        result = text_to_image(text, height=512, width=512)
        return result.images[0]
    except Exception as exc:
        return f"Error in image generation: {str(exc)}"
82
+
83
# Full speech-to-image pipeline: transcribe the audio, then render the text.
def speech_to_image(audio_path):
    """Generate an image from the speech contained in *audio_path*.

    Returns the generated image, or ``None`` when transcription or image
    generation fails.  The Gradio interface wired to this function has a
    single ``gr.Image`` output, so every path must return exactly one
    value — the original error paths returned a ``(None, message)``
    tuple, which does not match that single output.  Failure details are
    printed instead.
    """
    transcription = transcribe_audio(audio_path)
    if "Error" in transcription:
        print(f"Transcription failed: {transcription}")
        return None

    image = generate_image_from_text(transcription)
    # generate_image_from_text reports failure as an "Error ..." string.
    if isinstance(image, str) and "Error" in image:
        print(f"Image generation failed: {image}")
        return None

    return image
94
+
95
# Text-to-image handler for the Gradio textbox tab.
def text_to_image_interface(input_text):
    """Generate an image for *input_text*; return ``None`` on failure.

    ``generate_image_from_text`` signals failure with an "Error ..."
    string; a string must not be handed to the ``gr.Image`` output (the
    original returned the raw error string, which the image component
    cannot render), so failures are printed and ``None`` is returned.
    """
    try:
        image = generate_image_from_text(input_text)
        if isinstance(image, str):
            print(image)
            return None
        return image
    except Exception as exc:
        print(f"Error: {str(exc)}")
        return None
102
+
103
# Gradio UI: one tab for speech input, one for text input.
speech_to_image_interface = gr.Interface(
    fn=speech_to_image,
    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
    outputs=gr.Image(label="Generated Image"),
    title="Speech-to-Image Generator",
    description="Upload an audio file to generate an image based on the transcribed speech."
)

# Named distinctly so it does not shadow the text_to_image_interface handler
# function (the original rebound that name after wiring it in, which only
# worked by statement order).
text_to_image_tab = gr.Interface(
    fn=text_to_image_interface,
    inputs=gr.Textbox(label="Enter Text", placeholder="Describe an image..."),
    outputs=gr.Image(label="Generated Image"),
    title="Text-to-Image Generator",
    description="Enter text to generate an image based on the description."
)

# Combine both interfaces into a single tabbed Gradio app.
app = gr.TabbedInterface(
    interface_list=[speech_to_image_interface, text_to_image_tab],
    tab_names=["Speech-to-Image", "Text-to-Image"]
)

# Launch the Gradio interface.
app.launch(debug=True, share=True)