Sayiqa committed on
Commit
bb4ad60
·
verified ·
1 Parent(s): 6cf8578

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +179 -199
app.py CHANGED
@@ -1,182 +1,3 @@
1
- import subprocess
2
-
3
- # Install required libraries
4
- subprocess.check_call(["pip", "install", "torch>=1.11.0"])
5
- subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
6
- subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
7
- subprocess.check_call(["pip", "install", "librosa"])
8
- subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
9
- subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
10
- subprocess.check_call(["pip", "install", "huggingface_hub"])
11
-
12
- import os
13
- import threading
14
- import numpy as np
15
- import librosa
16
- import torch
17
- import gradio as gr
18
- from functools import lru_cache
19
- from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
20
- from huggingface_hub import login
21
- from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22
-
23
- # Ensure required dependencies are installed
24
- def install_missing_packages():
25
- required_packages = {
26
- "librosa": None,
27
- "diffusers": ">=0.14.0",
28
- "gradio": ">=3.35.2",
29
- "huggingface_hub": None,
30
- "accelerate": ">=0.20.1",
31
- "transformers": ">=4.31.0"
32
- }
33
- for package, version in required_packages.items():
34
- try:
35
- __import__(package)
36
- except ImportError:
37
- package_name = f"{package}{version}" if version else package
38
- subprocess.check_call(["pip", "install", package_name])
39
-
40
- install_missing_packages()
41
-
42
- # Get Hugging Face token for authentication
43
- hf_token = os.getenv("HF_TOKEN")
44
- if hf_token:
45
- login(hf_token)
46
- else:
47
- raise ValueError("HF_TOKEN environment variable not set.")
48
-
49
- # Load speech-to-text model (Whisper)
50
- speech_to_text = pipeline(
51
- "automatic-speech-recognition",
52
- model="openai/whisper-tiny",
53
- return_timestamps=True
54
- )
55
-
56
- # Load Stable Diffusion model for text-to-image
57
- text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
58
- device = "cuda" if torch.cuda.is_available() else "cpu"
59
- text_to_image.to(device)
60
- text_to_image.enable_attention_slicing()
61
- text_to_image.safety_checker = None
62
- text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63
-
64
- # Load ChatGPT-like conversational model
65
- chat_model_name = "microsoft/DialoGPT-medium"
66
- chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
67
- chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
68
-
69
- # Preprocess audio file into NumPy array
70
- def preprocess_audio(audio_path):
71
- try:
72
- audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
73
- return np.array(audio, dtype=np.float32)
74
- except Exception as e:
75
- return f"Error in preprocessing audio: {str(e)}"
76
-
77
- # Speech-to-text function with long-form transcription support
78
- @lru_cache(maxsize=10)
79
- def transcribe_audio(audio_path):
80
- try:
81
- audio_array = preprocess_audio(audio_path)
82
- if isinstance(audio_array, str): # Error message from preprocessing
83
- return audio_array
84
- result = speech_to_text(audio_array)
85
- # Combine text from multiple segments for long-form transcription
86
- transcription = " ".join(segment["text"] for segment in result["chunks"])
87
- return transcription
88
- except Exception as e:
89
- return f"Error in transcription: {str(e)}"
90
-
91
- # Text-to-image function
92
- @lru_cache(maxsize=10)
93
- def generate_image_from_text(text):
94
- try:
95
- image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
96
- return image
97
- except Exception as e:
98
- return f"Error in image generation: {str(e)}"
99
-
100
- # ChatGPT-like conversational response
101
- def chat_with_gpt(prompt):
102
- try:
103
- inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
104
- outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
105
- response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
106
- return response
107
- except Exception as e:
108
- return f"Error in chat response: {str(e)}"
109
-
110
- # Combined processing function
111
- def process_audio_and_generate_results(audio_path):
112
- transcription_result = {"result": None}
113
- image_result = {"result": None}
114
-
115
- # Function to run transcription and image generation in parallel
116
- def transcription_thread():
117
- transcription_result["result"] = transcribe_audio(audio_path)
118
-
119
- def image_generation_thread():
120
- transcription = transcription_result["result"]
121
- if transcription and "Error" not in transcription:
122
- image_result["result"] = generate_image_from_text(transcription)
123
-
124
- # Start both tasks in parallel
125
- t1 = threading.Thread(target=transcription_thread)
126
- t2 = threading.Thread(target=image_generation_thread)
127
-
128
- t1.start()
129
- t2.start()
130
-
131
- t1.join() # Wait for transcription to finish
132
- t2.join() # Wait for image generation to finish
133
-
134
- transcription = transcription_result["result"]
135
- image = image_result["result"]
136
-
137
- if "Error" in transcription:
138
- return None, transcription
139
- if isinstance(image, str) and "Error" in image:
140
- return None, image
141
-
142
- return image, transcription
143
-
144
- # Gradio interface for speech-to-text
145
- speech_to_text_iface = gr.Interface(
146
- fn=transcribe_audio,
147
- inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
148
- outputs=gr.Textbox(label="Transcription"),
149
- title="Speech-to-Text Transcription",
150
- description="Upload an audio file to transcribe speech into text.",
151
- )
152
-
153
- # Gradio interface for voice-to-image and chat
154
- voice_to_image_and_chat_iface = gr.Interface(
155
- fn=process_audio_and_generate_results,
156
- inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
157
- outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
158
- title="Voice-to-Image and Chat",
159
- description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
160
- )
161
-
162
- # Gradio interface for ChatGPT-like functionality
163
- chat_iface = gr.Interface(
164
- fn=chat_with_gpt,
165
- inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
166
- outputs=gr.Textbox(label="ChatGPT Response"),
167
- title="ChatGPT",
168
- description="Chat with GPT-like conversational AI.",
169
- )
170
-
171
- # Combined Gradio app
172
- iface = gr.TabbedInterface(
173
- interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
174
- tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
175
- )
176
-
177
- # Launch Gradio interface
178
- iface.launch(debug=True, share=True)
179
-
180
  # import subprocess
181
 
182
  # # Install required libraries
@@ -195,7 +16,7 @@ iface.launch(debug=True, share=True)
195
  # import torch
196
  # import gradio as gr
197
  # from functools import lru_cache
198
- # from transformers import pipeline
199
  # from huggingface_hub import login
200
  # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
201
 
@@ -240,8 +61,10 @@ iface.launch(debug=True, share=True)
240
  # text_to_image.safety_checker = None
241
  # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
242
 
243
- # # Load question-answering model (DistilBERT for factual answers)
244
- # qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
 
 
245
 
246
  # # Preprocess audio file into NumPy array
247
  # def preprocess_audio(audio_path):
@@ -274,14 +97,15 @@ iface.launch(debug=True, share=True)
274
  # except Exception as e:
275
  # return f"Error in image generation: {str(e)}"
276
 
277
- # # Question answering function
278
- # def answer_question(question):
279
  # try:
280
- # context = """Imran Khan is a Pakistani politician, former cricketer, and philanthropist. He is the 22nd Prime Minister of Pakistan, serving from 2018 to 2022. Khan is the founder of the political party Pakistan Tehreek-e-Insaf (PTI). He was one of the most successful cricketers of his time and led Pakistan to victory in the 1992 Cricket World Cup."""
281
- # answer = qa_pipeline(question=question, context=context)
282
- # return answer['answer']
 
283
  # except Exception as e:
284
- # return f"Error in answering question: {str(e)}"
285
 
286
  # # Combined processing function
287
  # def process_audio_and_generate_results(audio_path):
@@ -331,28 +155,184 @@ iface.launch(debug=True, share=True)
331
  # fn=process_audio_and_generate_results,
332
  # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
333
  # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
334
- # title="Voice-to-Image",
335
- # description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
336
  # )
337
 
338
- # # Gradio interface for Question Answering
339
- # qa_iface = gr.Interface(
340
- # fn=answer_question,
341
- # inputs=gr.Textbox(label="Ask a question"),
342
- # outputs=gr.Textbox(label="Answer"),
343
- # title="Question Answering",
344
- # description="Ask a factual question, and get an answer.",
345
  # )
346
 
347
  # # Combined Gradio app
348
  # iface = gr.TabbedInterface(
349
- # interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, qa_iface],
350
- # tab_names=["Speech-to-Text", "Voice-to-Image", "Question Answering"]
351
  # )
352
 
353
  # # Launch Gradio interface
354
  # iface.launch(debug=True, share=True)
355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
 
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # import subprocess
2
 
3
  # # Install required libraries
 
16
  # import torch
17
  # import gradio as gr
18
  # from functools import lru_cache
19
+ # from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
20
  # from huggingface_hub import login
21
  # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22
 
 
61
  # text_to_image.safety_checker = None
62
  # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63
 
64
+ # # Load ChatGPT-like conversational model
65
+ # chat_model_name = "microsoft/DialoGPT-medium"
66
+ # chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
67
+ # chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
68
 
69
  # # Preprocess audio file into NumPy array
70
  # def preprocess_audio(audio_path):
 
97
  # except Exception as e:
98
  # return f"Error in image generation: {str(e)}"
99
 
100
+ # # ChatGPT-like conversational response
101
+ # def chat_with_gpt(prompt):
102
  # try:
103
+ # inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
104
+ # outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
105
+ # response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
106
+ # return response
107
  # except Exception as e:
108
+ # return f"Error in chat response: {str(e)}"
109
 
110
  # # Combined processing function
111
  # def process_audio_and_generate_results(audio_path):
 
155
  # fn=process_audio_and_generate_results,
156
  # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
157
  # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
158
+ # title="Voice-to-Image and Chat",
159
+ # description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
160
  # )
161
 
162
+ # # Gradio interface for ChatGPT-like functionality
163
+ # chat_iface = gr.Interface(
164
+ # fn=chat_with_gpt,
165
+ # inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
166
+ # outputs=gr.Textbox(label="ChatGPT Response"),
167
+ # title="ChatGPT",
168
+ # description="Chat with GPT-like conversational AI.",
169
  # )
170
 
171
  # # Combined Gradio app
172
  # iface = gr.TabbedInterface(
173
+ # interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
174
+ # tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
175
  # )
176
 
177
  # # Launch Gradio interface
178
  # iface.launch(debug=True, share=True)
179
 
180
+ import subprocess
181
+
182
+ # Install required libraries
183
+ subprocess.check_call(["pip", "install", "torch>=1.11.0"])
184
+ subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
185
+ subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
186
+ subprocess.check_call(["pip", "install", "librosa"])
187
+ subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
188
+ subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
189
+ subprocess.check_call(["pip", "install", "huggingface_hub"])
190
+
191
+ import os
192
+ import threading
193
+ import numpy as np
194
+ import librosa
195
+ import torch
196
+ import gradio as gr
197
+ from functools import lru_cache
198
+ from transformers import pipeline
199
+ from huggingface_hub import login
200
+ from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
201
+
202
+ # Ensure required dependencies are installed
203
+ def install_missing_packages():
204
+ required_packages = {
205
+ "librosa": None,
206
+ "diffusers": ">=0.14.0",
207
+ "gradio": ">=3.35.2",
208
+ "huggingface_hub": None,
209
+ "accelerate": ">=0.20.1",
210
+ "transformers": ">=4.31.0"
211
+ }
212
+ for package, version in required_packages.items():
213
+ try:
214
+ __import__(package)
215
+ except ImportError:
216
+ package_name = f"{package}{version}" if version else package
217
+ subprocess.check_call(["pip", "install", package_name])
218
+
219
+ install_missing_packages()
220
+
221
+ # Get Hugging Face token for authentication
222
+ hf_token = os.getenv("HF_TOKEN")
223
+ if hf_token:
224
+ login(hf_token)
225
+ else:
226
+ raise ValueError("HF_TOKEN environment variable not set.")
227
+
228
+ # Load speech-to-text model (Whisper)
229
+ speech_to_text = pipeline(
230
+ "automatic-speech-recognition",
231
+ model="openai/whisper-tiny",
232
+ return_timestamps=True
233
+ )
234
+
235
+ # Load Stable Diffusion model for text-to-image
236
+ text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
237
+ device = "cuda" if torch.cuda.is_available() else "cpu"
238
+ text_to_image.to(device)
239
+ text_to_image.enable_attention_slicing()
240
+ text_to_image.safety_checker = None
241
+ text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
242
+
243
+ # Preprocess audio file into NumPy array
244
+ def preprocess_audio(audio_path):
245
+ try:
246
+ audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
247
+ return np.array(audio, dtype=np.float32)
248
+ except Exception as e:
249
+ return f"Error in preprocessing audio: {str(e)}"
250
+
251
+ # Speech-to-text function with long-form transcription support
252
+ @lru_cache(maxsize=10)
253
+ def transcribe_audio(audio_path):
254
+ try:
255
+ audio_array = preprocess_audio(audio_path)
256
+ if isinstance(audio_array, str): # Error message from preprocessing
257
+ return audio_array
258
+ result = speech_to_text(audio_array)
259
+ # Combine text from multiple segments for long-form transcription
260
+ transcription = " ".join(segment["text"] for segment in result["chunks"])
261
+ return transcription
262
+ except Exception as e:
263
+ return f"Error in transcription: {str(e)}"
264
+
265
+ # Text-to-image function
266
+ @lru_cache(maxsize=10)
267
+ def generate_image_from_text(text):
268
+ try:
269
+ image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
270
+ return image
271
+ except Exception as e:
272
+ return f"Error in image generation: {str(e)}"
273
+
274
+ # Combined processing function
275
+ def process_audio_and_generate_results(audio_path):
276
+ transcription_result = {"result": None}
277
+ image_result = {"result": None}
278
+
279
+ # Function to run transcription and image generation in parallel
280
+ def transcription_thread():
281
+ transcription_result["result"] = transcribe_audio(audio_path)
282
+
283
+ def image_generation_thread():
284
+ transcription = transcription_result["result"]
285
+ if transcription and "Error" not in transcription:
286
+ image_result["result"] = generate_image_from_text(transcription)
287
+
288
+ # Start both tasks in parallel
289
+ t1 = threading.Thread(target=transcription_thread)
290
+ t2 = threading.Thread(target=image_generation_thread)
291
+
292
+ t1.start()
293
+ t2.start()
294
+
295
+ t1.join() # Wait for transcription to finish
296
+ t2.join() # Wait for image generation to finish
297
+
298
+ transcription = transcription_result["result"]
299
+ image = image_result["result"]
300
+
301
+ if "Error" in transcription:
302
+ return None, transcription
303
+ if isinstance(image, str) and "Error" in image:
304
+ return None, image
305
+
306
+ return image, transcription
307
+
308
+ # Gradio interface for speech-to-text
309
+ speech_to_text_iface = gr.Interface(
310
+ fn=transcribe_audio,
311
+ inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
312
+ outputs=gr.Textbox(label="Transcription"),
313
+ title="Speech-to-Text Transcription",
314
+ description="Upload an audio file to transcribe speech into text.",
315
+ )
316
+
317
+ # Gradio interface for voice-to-image
318
+ voice_to_image_iface = gr.Interface(
319
+ fn=process_audio_and_generate_results,
320
+ inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
321
+ outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
322
+ title="Voice-to-Image",
323
+ description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
324
+ )
325
+
326
+ # Combined Gradio app
327
+ iface = gr.TabbedInterface(
328
+ interface_list=[speech_to_text_iface, voice_to_image_iface],
329
+ tab_names=["Speech-to-Text", "Voice-to-Image"]
330
+ )
331
+
332
+ # Launch Gradio interface
333
+ iface.launch(debug=True, share=True)
334
+
335
+
336
 
337
 
338