Files changed (1) hide show
  1. app.py +186 -186
app.py CHANGED
@@ -1,182 +1,3 @@
1
- # import subprocess
2
-
3
- # # Install required libraries
4
- # subprocess.check_call(["pip", "install", "torch>=1.11.0"])
5
- # subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
6
- # subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
7
- # subprocess.check_call(["pip", "install", "librosa"])
8
- # subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
9
- # subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
10
- # subprocess.check_call(["pip", "install", "huggingface_hub"])
11
-
12
- # import os
13
- # import threading
14
- # import numpy as np
15
- # import librosa
16
- # import torch
17
- # import gradio as gr
18
- # from functools import lru_cache
19
- # from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
20
- # from huggingface_hub import login
21
- # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22
-
23
# Ensure required dependencies are installed at runtime.
def install_missing_packages():
    """Install any missing required packages into the running interpreter.

    Tries to import each required package; on ImportError, installs it
    (with a version constraint where one is pinned) via pip.
    """
    import sys  # local import: only needed to locate the current interpreter

    required_packages = {
        "librosa": None,
        "diffusers": ">=0.14.0",
        "gradio": ">=3.35.2",
        "huggingface_hub": None,
        "accelerate": ">=0.20.1",
        "transformers": ">=4.31.0",
    }
    for package, version in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            package_name = f"{package}{version}" if version else package
            # Use `sys.executable -m pip` so the install lands in the same
            # environment this script imports from (a bare "pip" on PATH may
            # belong to a different interpreter).
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

install_missing_packages()
41
-
42
# Authenticate with the Hugging Face Hub; the token must come from the env.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not set.")
login(hf_token)
48
-
49
# Whisper (tiny) ASR pipeline; timestamped output enables long-form
# transcription via per-segment "chunks".
speech_to_text = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    return_timestamps=True,
)
55
-
56
# Stable Diffusion pipeline for text-to-image generation.
text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image.to(device)
# Attention slicing trades a little speed for lower peak memory.
text_to_image.enable_attention_slicing()
# NOTE(review): safety checker disabled — generated images are unfiltered.
text_to_image.safety_checker = None
# DPM-Solver++ scheduler: good quality in fewer denoising steps.
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63
-
64
- # # Load ChatGPT-like conversational model
65
- # chat_model_name = "microsoft/DialoGPT-medium"
66
- # chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
67
- # chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
68
-
69
# Preprocess audio file into NumPy array
def preprocess_audio(audio_path):
    """Load *audio_path* and return it as a float32 NumPy array at 16 kHz.

    On failure, returns an error-message string instead of raising.
    """
    try:
        # Resample to 16 kHz, the rate the ASR model expects.
        audio, _sr = librosa.load(audio_path, sr=16000)
        return np.asarray(audio, dtype=np.float32)
    except Exception as e:
        return f"Error in preprocessing audio: {str(e)}"
76
-
77
# Speech-to-text function with long-form transcription support.
@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* to text (cached per path).

    Returns the transcription string, or an error-message string on failure.
    """
    try:
        audio_array = preprocess_audio(audio_path)
        if isinstance(audio_array, str):  # Error message from preprocessing
            return audio_array
        result = speech_to_text(audio_array)
        # With return_timestamps=True the pipeline returns segment "chunks";
        # for short inputs it may return only a flat "text" field, so fall
        # back to that instead of losing the transcription to a KeyError.
        if "chunks" in result:
            return " ".join(segment["text"] for segment in result["chunks"])
        return result.get("text", "")
    except Exception as e:
        return f"Error in transcription: {str(e)}"
90
-
91
# Text-to-image function
@lru_cache(maxsize=10)
def generate_image_from_text(text):
    """Generate an image for *text* (cached per prompt).

    Returns the image, or an error-message string on failure.
    """
    try:
        # 256x256 keeps generation fast on modest hardware.
        return text_to_image(text, height=256, width=256).images[0]
    except Exception as e:
        return f"Error in image generation: {str(e)}"
99
-
100
- # # ChatGPT-like conversational response
101
- # def chat_with_gpt(prompt):
102
- # try:
103
- # inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
104
- # outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
105
- # response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
106
- # return response
107
- # except Exception as e:
108
- # return f"Error in chat response: {str(e)}"
109
-
110
# Combined processing function
def process_audio_and_generate_results(audio_path):
    """Transcribe *audio_path*, then generate an image from the transcript.

    Returns (image, transcription); on error the image slot is None and the
    text slot carries the error message.

    Image generation depends on the transcription, so the two stages run
    sequentially. (The previous thread-parallel version started the image
    thread alongside the transcription thread; it read the still-None
    transcription immediately and therefore never produced an image.)
    """
    transcription = transcribe_audio(audio_path)
    # Empty or error transcription: nothing to illustrate.
    if not transcription or "Error" in transcription:
        return None, transcription

    image = generate_image_from_text(transcription)
    if isinstance(image, str) and "Error" in image:
        return None, image

    return image, transcription
143
-
144
# Speech-to-text tab.
speech_to_text_iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
    outputs=gr.Textbox(label="Transcription"),
    title="Speech-to-Text Transcription",
    description="Upload an audio file to transcribe speech into text.",
)
152
-
153
- # # Gradio interface for voice-to-image and chat
154
- # voice_to_image_and_chat_iface = gr.Interface(
155
- # fn=process_audio_and_generate_results,
156
- # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
157
- # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
158
- # title="Voice-to-Image and Chat",
159
- # description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
160
- # )
161
-
162
- # # Gradio interface for ChatGPT-like functionality
163
- # chat_iface = gr.Interface(
164
- # fn=chat_with_gpt,
165
- # inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
166
- # outputs=gr.Textbox(label="ChatGPT Response"),
167
- # title="ChatGPT",
168
- # description="Chat with GPT-like conversational AI.",
169
- # )
170
-
171
- # # Combined Gradio app
172
- # iface = gr.TabbedInterface(
173
- # interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
174
- # tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
175
- # )
176
-
177
- # # Launch Gradio interface
178
- # iface.launch(debug=True, share=True)
179
-
180
  import subprocess
181
 
182
  # Install required libraries
@@ -195,7 +16,7 @@ import librosa
195
  import torch
196
  import gradio as gr
197
  from functools import lru_cache
198
- from transformers import pipeline
199
  from huggingface_hub import login
200
  from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
201
 
@@ -240,6 +61,11 @@ text_to_image.enable_attention_slicing()
240
  text_to_image.safety_checker = None
241
  text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
242
 
 
 
 
 
 
243
  # Preprocess audio file into NumPy array
244
  def preprocess_audio(audio_path):
245
  try:
@@ -271,6 +97,16 @@ def generate_image_from_text(text):
271
  except Exception as e:
272
  return f"Error in image generation: {str(e)}"
273
 
 
 
 
 
 
 
 
 
 
 
274
  # Combined processing function
275
  def process_audio_and_generate_results(audio_path):
276
  transcription_result = {"result": None}
@@ -314,24 +150,188 @@ speech_to_text_iface = gr.Interface(
314
  description="Upload an audio file to transcribe speech into text.",
315
  )
316
 
317
- # Gradio interface for voice-to-image
318
- voice_to_image_iface = gr.Interface(
319
  fn=process_audio_and_generate_results,
320
  inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
321
  outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
322
- title="Voice-to-Image",
323
- description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
 
 
 
 
 
 
 
 
 
324
  )
325
 
326
  # Combined Gradio app
327
  iface = gr.TabbedInterface(
328
- interface_list=[speech_to_text_iface, voice_to_image_iface],
329
- tab_names=["Speech-to-Text", "Voice-to-Image"]
330
  )
331
 
332
  # Launch Gradio interface
333
  iface.launch(debug=True, share=True)
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
 
337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import subprocess
2
 
3
  # Install required libraries
 
16
  import torch
17
  import gradio as gr
18
  from functools import lru_cache
19
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
20
  from huggingface_hub import login
21
  from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22
 
 
61
  text_to_image.safety_checker = None
62
  text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63
 
64
# Conversational model (DialoGPT) backing the chat tab.
chat_model_name = "microsoft/DialoGPT-medium"
chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
68
+
69
  # Preprocess audio file into NumPy array
70
  def preprocess_audio(audio_path):
71
  try:
 
97
  except Exception as e:
98
  return f"Error in image generation: {str(e)}"
99
 
100
# ChatGPT-like conversational response
def chat_with_gpt(prompt):
    """Generate a conversational reply to *prompt* with DialoGPT.

    Returns the model's reply (without echoing the prompt), or an
    error-message string on failure.
    """
    try:
        inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
        # For decoder-only models, generate() returns prompt + continuation;
        # decode only the newly generated tokens so the reply doesn't echo
        # the user's prompt back at them.
        response = chat_tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error in chat response: {str(e)}"
109
+
110
  # Combined processing function
111
  def process_audio_and_generate_results(audio_path):
112
  transcription_result = {"result": None}
 
150
  description="Upload an audio file to transcribe speech into text.",
151
  )
152
 
153
# Voice-to-image tab: transcription plus an illustration of the transcript.
voice_to_image_and_chat_iface = gr.Interface(
    fn=process_audio_and_generate_results,
    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
    outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
    title="Voice-to-Image and Chat",
    description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
)
161
+
162
# Chat tab: free-form text conversation with DialoGPT.
chat_iface = gr.Interface(
    fn=chat_with_gpt,
    inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
    outputs=gr.Textbox(label="ChatGPT Response"),
    title="ChatGPT",
    description="Chat with GPT-like conversational AI.",
)
170
 
171
# Assemble the tabbed app from the individual interfaces.
iface = gr.TabbedInterface(
    interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
    tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
)

# Launch Gradio interface (share=True exposes a public tunnel URL).
iface.launch(debug=True, share=True)
179
 
180
+ # import subprocess
181
+
182
+ # # Install required libraries
183
+ # subprocess.check_call(["pip", "install", "torch>=1.11.0"])
184
+ # subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
185
+ # subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
186
+ # subprocess.check_call(["pip", "install", "librosa"])
187
+ # subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
188
+ # subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
189
+ # subprocess.check_call(["pip", "install", "huggingface_hub"])
190
+
191
+ # import os
192
+ # import threading
193
+ # import numpy as np
194
+ # import librosa
195
+ # import torch
196
+ # import gradio as gr
197
+ # from functools import lru_cache
198
+ # from transformers import pipeline
199
+ # from huggingface_hub import login
200
+ # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
201
+
202
+ # # Ensure required dependencies are installed
203
+ # def install_missing_packages():
204
+ # required_packages = {
205
+ # "librosa": None,
206
+ # "diffusers": ">=0.14.0",
207
+ # "gradio": ">=3.35.2",
208
+ # "huggingface_hub": None,
209
+ # "accelerate": ">=0.20.1",
210
+ # "transformers": ">=4.31.0"
211
+ # }
212
+ # for package, version in required_packages.items():
213
+ # try:
214
+ # __import__(package)
215
+ # except ImportError:
216
+ # package_name = f"{package}{version}" if version else package
217
+ # subprocess.check_call(["pip", "install", package_name])
218
+
219
+ # install_missing_packages()
220
+
221
+ # # Get Hugging Face token for authentication
222
+ # hf_token = os.getenv("HF_TOKEN")
223
+ # if hf_token:
224
+ # login(hf_token)
225
+ # else:
226
+ # raise ValueError("HF_TOKEN environment variable not set.")
227
+
228
+ # # Load speech-to-text model (Whisper)
229
+ # speech_to_text = pipeline(
230
+ # "automatic-speech-recognition",
231
+ # model="openai/whisper-tiny",
232
+ # return_timestamps=True
233
+ # )
234
+
235
+ # # Load Stable Diffusion model for text-to-image
236
+ # text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
237
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
238
+ # text_to_image.to(device)
239
+ # text_to_image.enable_attention_slicing()
240
+ # text_to_image.safety_checker = None
241
+ # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
242
+
243
+ # # Preprocess audio file into NumPy array
244
+ # def preprocess_audio(audio_path):
245
+ # try:
246
+ # audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
247
+ # return np.array(audio, dtype=np.float32)
248
+ # except Exception as e:
249
+ # return f"Error in preprocessing audio: {str(e)}"
250
+
251
+ # # Speech-to-text function with long-form transcription support
252
+ # @lru_cache(maxsize=10)
253
+ # def transcribe_audio(audio_path):
254
+ # try:
255
+ # audio_array = preprocess_audio(audio_path)
256
+ # if isinstance(audio_array, str): # Error message from preprocessing
257
+ # return audio_array
258
+ # result = speech_to_text(audio_array)
259
+ # # Combine text from multiple segments for long-form transcription
260
+ # transcription = " ".join(segment["text"] for segment in result["chunks"])
261
+ # return transcription
262
+ # except Exception as e:
263
+ # return f"Error in transcription: {str(e)}"
264
+
265
+ # # Text-to-image function
266
+ # @lru_cache(maxsize=10)
267
+ # def generate_image_from_text(text):
268
+ # try:
269
+ # image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
270
+ # return image
271
+ # except Exception as e:
272
+ # return f"Error in image generation: {str(e)}"
273
+
274
+ # # Combined processing function
275
+ # def process_audio_and_generate_results(audio_path):
276
+ # transcription_result = {"result": None}
277
+ # image_result = {"result": None}
278
+
279
+ # # Function to run transcription and image generation in parallel
280
+ # def transcription_thread():
281
+ # transcription_result["result"] = transcribe_audio(audio_path)
282
+
283
+ # def image_generation_thread():
284
+ # transcription = transcription_result["result"]
285
+ # if transcription and "Error" not in transcription:
286
+ # image_result["result"] = generate_image_from_text(transcription)
287
+
288
+ # # Start both tasks in parallel
289
+ # t1 = threading.Thread(target=transcription_thread)
290
+ # t2 = threading.Thread(target=image_generation_thread)
291
+
292
+ # t1.start()
293
+ # t2.start()
294
+
295
+ # t1.join() # Wait for transcription to finish
296
+ # t2.join() # Wait for image generation to finish
297
+
298
+ # transcription = transcription_result["result"]
299
+ # image = image_result["result"]
300
+
301
+ # if "Error" in transcription:
302
+ # return None, transcription
303
+ # if isinstance(image, str) and "Error" in image:
304
+ # return None, image
305
+
306
+ # return image, transcription
307
+
308
+ # # Gradio interface for speech-to-text
309
+ # speech_to_text_iface = gr.Interface(
310
+ # fn=transcribe_audio,
311
+ # inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
312
+ # outputs=gr.Textbox(label="Transcription"),
313
+ # title="Speech-to-Text Transcription",
314
+ # description="Upload an audio file to transcribe speech into text.",
315
+ # )
316
+
317
+ # # Gradio interface for voice-to-image
318
+ # voice_to_image_iface = gr.Interface(
319
+ # fn=process_audio_and_generate_results,
320
+ # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
321
+ # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
322
+ # title="Voice-to-Image",
323
+ # description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
324
+ # )
325
+
326
+ # # Combined Gradio app
327
+ # iface = gr.TabbedInterface(
328
+ # interface_list=[speech_to_text_iface, voice_to_image_iface],
329
+ # tab_names=["Speech-to-Text", "Voice-to-Image"]
330
+ # )
331
+
332
+ # # Launch Gradio interface
333
+ # iface.launch(debug=True, share=True)
334
+
335
 
336
 
337