Sayiqa committed on
Commit
6cf8578
·
verified ·
1 Parent(s): dc005f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -199
app.py CHANGED
@@ -1,182 +1,3 @@
1
- # import subprocess
2
-
3
- # # Install required libraries
4
- # subprocess.check_call(["pip", "install", "torch>=1.11.0"])
5
- # subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
6
- # subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
7
- # subprocess.check_call(["pip", "install", "librosa"])
8
- # subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
9
- # subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
10
- # subprocess.check_call(["pip", "install", "huggingface_hub"])
11
-
12
- # import os
13
- # import threading
14
- # import numpy as np
15
- # import librosa
16
- # import torch
17
- # import gradio as gr
18
- # from functools import lru_cache
19
- # from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
20
- # from huggingface_hub import login
21
- # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22
-
23
- # # Ensure required dependencies are installed
24
- # def install_missing_packages():
25
- # required_packages = {
26
- # "librosa": None,
27
- # "diffusers": ">=0.14.0",
28
- # "gradio": ">=3.35.2",
29
- # "huggingface_hub": None,
30
- # "accelerate": ">=0.20.1",
31
- # "transformers": ">=4.31.0"
32
- # }
33
- # for package, version in required_packages.items():
34
- # try:
35
- # __import__(package)
36
- # except ImportError:
37
- # package_name = f"{package}{version}" if version else package
38
- # subprocess.check_call(["pip", "install", package_name])
39
-
40
- # install_missing_packages()
41
-
42
- # # Get Hugging Face token for authentication
43
- # hf_token = os.getenv("HF_TOKEN")
44
- # if hf_token:
45
- # login(hf_token)
46
- # else:
47
- # raise ValueError("HF_TOKEN environment variable not set.")
48
-
49
- # # Load speech-to-text model (Whisper)
50
- # speech_to_text = pipeline(
51
- # "automatic-speech-recognition",
52
- # model="openai/whisper-tiny",
53
- # return_timestamps=True
54
- # )
55
-
56
- # # Load Stable Diffusion model for text-to-image
57
- # text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
58
- # device = "cuda" if torch.cuda.is_available() else "cpu"
59
- # text_to_image.to(device)
60
- # text_to_image.enable_attention_slicing()
61
- # text_to_image.safety_checker = None
62
- # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63
-
64
- # # Load ChatGPT-like conversational model
65
- # chat_model_name = "microsoft/DialoGPT-medium"
66
- # chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
67
- # chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
68
-
69
- # # Preprocess audio file into NumPy array
70
- # def preprocess_audio(audio_path):
71
- # try:
72
- # audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
73
- # return np.array(audio, dtype=np.float32)
74
- # except Exception as e:
75
- # return f"Error in preprocessing audio: {str(e)}"
76
-
77
- # # Speech-to-text function with long-form transcription support
78
- # @lru_cache(maxsize=10)
79
- # def transcribe_audio(audio_path):
80
- # try:
81
- # audio_array = preprocess_audio(audio_path)
82
- # if isinstance(audio_array, str): # Error message from preprocessing
83
- # return audio_array
84
- # result = speech_to_text(audio_array)
85
- # # Combine text from multiple segments for long-form transcription
86
- # transcription = " ".join(segment["text"] for segment in result["chunks"])
87
- # return transcription
88
- # except Exception as e:
89
- # return f"Error in transcription: {str(e)}"
90
-
91
- # # Text-to-image function
92
- # @lru_cache(maxsize=10)
93
- # def generate_image_from_text(text):
94
- # try:
95
- # image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
96
- # return image
97
- # except Exception as e:
98
- # return f"Error in image generation: {str(e)}"
99
-
100
- # # ChatGPT-like conversational response
101
- # def chat_with_gpt(prompt):
102
- # try:
103
- # inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
104
- # outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
105
- # response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
106
- # return response
107
- # except Exception as e:
108
- # return f"Error in chat response: {str(e)}"
109
-
110
- # # Combined processing function
111
- # def process_audio_and_generate_results(audio_path):
112
- # transcription_result = {"result": None}
113
- # image_result = {"result": None}
114
-
115
- # # Function to run transcription and image generation in parallel
116
- # def transcription_thread():
117
- # transcription_result["result"] = transcribe_audio(audio_path)
118
-
119
- # def image_generation_thread():
120
- # transcription = transcription_result["result"]
121
- # if transcription and "Error" not in transcription:
122
- # image_result["result"] = generate_image_from_text(transcription)
123
-
124
- # # Start both tasks in parallel
125
- # t1 = threading.Thread(target=transcription_thread)
126
- # t2 = threading.Thread(target=image_generation_thread)
127
-
128
- # t1.start()
129
- # t2.start()
130
-
131
- # t1.join() # Wait for transcription to finish
132
- # t2.join() # Wait for image generation to finish
133
-
134
- # transcription = transcription_result["result"]
135
- # image = image_result["result"]
136
-
137
- # if "Error" in transcription:
138
- # return None, transcription
139
- # if isinstance(image, str) and "Error" in image:
140
- # return None, image
141
-
142
- # return image, transcription
143
-
144
- # # Gradio interface for speech-to-text
145
- # speech_to_text_iface = gr.Interface(
146
- # fn=transcribe_audio,
147
- # inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
148
- # outputs=gr.Textbox(label="Transcription"),
149
- # title="Speech-to-Text Transcription",
150
- # description="Upload an audio file to transcribe speech into text.",
151
- # )
152
-
153
- # # Gradio interface for voice-to-image and chat
154
- # voice_to_image_and_chat_iface = gr.Interface(
155
- # fn=process_audio_and_generate_results,
156
- # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
157
- # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
158
- # title="Voice-to-Image and Chat",
159
- # description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
160
- # )
161
-
162
- # # Gradio interface for ChatGPT-like functionality
163
- # chat_iface = gr.Interface(
164
- # fn=chat_with_gpt,
165
- # inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
166
- # outputs=gr.Textbox(label="ChatGPT Response"),
167
- # title="ChatGPT",
168
- # description="Chat with GPT-like conversational AI.",
169
- # )
170
-
171
- # # Combined Gradio app
172
- # iface = gr.TabbedInterface(
173
- # interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
174
- # tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
175
- # )
176
-
177
- # # Launch Gradio interface
178
- # iface.launch(debug=True, share=True)
179
-
180
  import subprocess
181
 
182
  # Install required libraries
@@ -195,7 +16,7 @@ import librosa
195
  import torch
196
  import gradio as gr
197
  from functools import lru_cache
198
- from transformers import pipeline
199
  from huggingface_hub import login
200
  from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
201
 
@@ -240,8 +61,10 @@ text_to_image.enable_attention_slicing()
240
  text_to_image.safety_checker = None
241
  text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
242
 
243
- # Load question-answering model (DistilBERT for factual answers)
244
- qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
 
 
245
 
246
  # Preprocess audio file into NumPy array
247
  def preprocess_audio(audio_path):
@@ -274,14 +97,15 @@ def generate_image_from_text(text):
274
  except Exception as e:
275
  return f"Error in image generation: {str(e)}"
276
 
277
- # Question answering function
278
- def answer_question(question):
279
  try:
280
- context = """Imran Khan is a Pakistani politician, former cricketer, and philanthropist. He is the 22nd Prime Minister of Pakistan, serving from 2018 to 2022. Khan is the founder of the political party Pakistan Tehreek-e-Insaf (PTI). He was one of the most successful cricketers of his time and led Pakistan to victory in the 1992 Cricket World Cup."""
281
- answer = qa_pipeline(question=question, context=context)
282
- return answer['answer']
 
283
  except Exception as e:
284
- return f"Error in answering question: {str(e)}"
285
 
286
  # Combined processing function
287
  def process_audio_and_generate_results(audio_path):
@@ -331,28 +155,204 @@ voice_to_image_and_chat_iface = gr.Interface(
331
  fn=process_audio_and_generate_results,
332
  inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
333
  outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
334
- title="Voice-to-Image",
335
- description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
336
  )
337
 
338
- # Gradio interface for Question Answering
339
- qa_iface = gr.Interface(
340
- fn=answer_question,
341
- inputs=gr.Textbox(label="Ask a question"),
342
- outputs=gr.Textbox(label="Answer"),
343
- title="Question Answering",
344
- description="Ask a factual question, and get an answer.",
345
  )
346
 
347
  # Combined Gradio app
348
  iface = gr.TabbedInterface(
349
- interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, qa_iface],
350
- tab_names=["Speech-to-Text", "Voice-to-Image", "Question Answering"]
351
  )
352
 
353
  # Launch Gradio interface
354
  iface.launch(debug=True, share=True)
355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
 
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import subprocess
2
 
3
  # Install required libraries
 
16
  import torch
17
  import gradio as gr
18
  from functools import lru_cache
19
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
20
  from huggingface_hub import login
21
  from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22
 
 
61
  text_to_image.safety_checker = None
62
  text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63
 
64
+ # Load ChatGPT-like conversational model
65
+ chat_model_name = "microsoft/DialoGPT-medium"
66
+ chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
67
+ chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
68
 
69
  # Preprocess audio file into NumPy array
70
  def preprocess_audio(audio_path):
 
97
  except Exception as e:
98
  return f"Error in image generation: {str(e)}"
99
 
100
# ChatGPT-like conversational response
def chat_with_gpt(prompt):
    """Generate one conversational reply to *prompt* with DialoGPT.

    Parameters:
        prompt: The user's input text.

    Returns:
        The model's reply as a string, or an ``"Error in chat response: ..."``
        string on failure (matching the error-string convention used by the
        other handlers in this file).
    """
    try:
        # DialoGPT's documented usage: terminate the user turn with the EOS
        # token so the model knows the prompt is a complete utterance.
        inputs = chat_tokenizer.encode(
            prompt + chat_tokenizer.eos_token, return_tensors="pt"
        )
        outputs = chat_model.generate(
            inputs,
            max_length=200,
            pad_token_id=chat_tokenizer.eos_token_id,
        )
        # Bug fix: the original decoded outputs[0] in full, which echoes the
        # user's prompt back as part of the "response". Decode only the newly
        # generated tokens (everything after the input length).
        response = chat_tokenizer.decode(
            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
        )
        return response
    except Exception as e:
        return f"Error in chat response: {str(e)}"
109
 
110
  # Combined processing function
111
  def process_audio_and_generate_results(audio_path):
 
155
  fn=process_audio_and_generate_results,
156
  inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
157
  outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
158
+ title="Voice-to-Image and Chat",
159
+ description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
160
  )
161
 
162
+ # Gradio interface for ChatGPT-like functionality
163
+ chat_iface = gr.Interface(
164
+ fn=chat_with_gpt,
165
+ inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
166
+ outputs=gr.Textbox(label="ChatGPT Response"),
167
+ title="ChatGPT",
168
+ description="Chat with GPT-like conversational AI.",
169
  )
170
 
171
  # Combined Gradio app
172
  iface = gr.TabbedInterface(
173
+ interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
174
+ tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
175
  )
176
 
177
  # Launch Gradio interface
178
  iface.launch(debug=True, share=True)
179
 
180
+ # import subprocess
181
+
182
+ # # Install required libraries
183
+ # subprocess.check_call(["pip", "install", "torch>=1.11.0"])
184
+ # subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
185
+ # subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
186
+ # subprocess.check_call(["pip", "install", "librosa"])
187
+ # subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
188
+ # subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
189
+ # subprocess.check_call(["pip", "install", "huggingface_hub"])
190
+
191
+ # import os
192
+ # import threading
193
+ # import numpy as np
194
+ # import librosa
195
+ # import torch
196
+ # import gradio as gr
197
+ # from functools import lru_cache
198
+ # from transformers import pipeline
199
+ # from huggingface_hub import login
200
+ # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
201
+
202
+ # # Ensure required dependencies are installed
203
+ # def install_missing_packages():
204
+ # required_packages = {
205
+ # "librosa": None,
206
+ # "diffusers": ">=0.14.0",
207
+ # "gradio": ">=3.35.2",
208
+ # "huggingface_hub": None,
209
+ # "accelerate": ">=0.20.1",
210
+ # "transformers": ">=4.31.0"
211
+ # }
212
+ # for package, version in required_packages.items():
213
+ # try:
214
+ # __import__(package)
215
+ # except ImportError:
216
+ # package_name = f"{package}{version}" if version else package
217
+ # subprocess.check_call(["pip", "install", package_name])
218
+
219
+ # install_missing_packages()
220
+
221
+ # # Get Hugging Face token for authentication
222
+ # hf_token = os.getenv("HF_TOKEN")
223
+ # if hf_token:
224
+ # login(hf_token)
225
+ # else:
226
+ # raise ValueError("HF_TOKEN environment variable not set.")
227
+
228
+ # # Load speech-to-text model (Whisper)
229
+ # speech_to_text = pipeline(
230
+ # "automatic-speech-recognition",
231
+ # model="openai/whisper-tiny",
232
+ # return_timestamps=True
233
+ # )
234
+
235
+ # # Load Stable Diffusion model for text-to-image
236
+ # text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
237
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
238
+ # text_to_image.to(device)
239
+ # text_to_image.enable_attention_slicing()
240
+ # text_to_image.safety_checker = None
241
+ # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
242
+
243
+ # # Load question-answering model (DistilBERT for factual answers)
244
+ # qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
245
+
246
+ # # Preprocess audio file into NumPy array
247
+ # def preprocess_audio(audio_path):
248
+ # try:
249
+ # audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
250
+ # return np.array(audio, dtype=np.float32)
251
+ # except Exception as e:
252
+ # return f"Error in preprocessing audio: {str(e)}"
253
+
254
+ # # Speech-to-text function with long-form transcription support
255
+ # @lru_cache(maxsize=10)
256
+ # def transcribe_audio(audio_path):
257
+ # try:
258
+ # audio_array = preprocess_audio(audio_path)
259
+ # if isinstance(audio_array, str): # Error message from preprocessing
260
+ # return audio_array
261
+ # result = speech_to_text(audio_array)
262
+ # # Combine text from multiple segments for long-form transcription
263
+ # transcription = " ".join(segment["text"] for segment in result["chunks"])
264
+ # return transcription
265
+ # except Exception as e:
266
+ # return f"Error in transcription: {str(e)}"
267
+
268
+ # # Text-to-image function
269
+ # @lru_cache(maxsize=10)
270
+ # def generate_image_from_text(text):
271
+ # try:
272
+ # image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
273
+ # return image
274
+ # except Exception as e:
275
+ # return f"Error in image generation: {str(e)}"
276
+
277
+ # # Question answering function
278
+ # def answer_question(question):
279
+ # try:
280
+ # context = """Imran Khan is a Pakistani politician, former cricketer, and philanthropist. He is the 22nd Prime Minister of Pakistan, serving from 2018 to 2022. Khan is the founder of the political party Pakistan Tehreek-e-Insaf (PTI). He was one of the most successful cricketers of his time and led Pakistan to victory in the 1992 Cricket World Cup."""
281
+ # answer = qa_pipeline(question=question, context=context)
282
+ # return answer['answer']
283
+ # except Exception as e:
284
+ # return f"Error in answering question: {str(e)}"
285
+
286
+ # # Combined processing function
287
+ # def process_audio_and_generate_results(audio_path):
288
+ # transcription_result = {"result": None}
289
+ # image_result = {"result": None}
290
+
291
+ # # Function to run transcription and image generation in parallel
292
+ # def transcription_thread():
293
+ # transcription_result["result"] = transcribe_audio(audio_path)
294
+
295
+ # def image_generation_thread():
296
+ # transcription = transcription_result["result"]
297
+ # if transcription and "Error" not in transcription:
298
+ # image_result["result"] = generate_image_from_text(transcription)
299
+
300
+ # # Start both tasks in parallel
301
+ # t1 = threading.Thread(target=transcription_thread)
302
+ # t2 = threading.Thread(target=image_generation_thread)
303
+
304
+ # t1.start()
305
+ # t2.start()
306
+
307
+ # t1.join() # Wait for transcription to finish
308
+ # t2.join() # Wait for image generation to finish
309
+
310
+ # transcription = transcription_result["result"]
311
+ # image = image_result["result"]
312
+
313
+ # if "Error" in transcription:
314
+ # return None, transcription
315
+ # if isinstance(image, str) and "Error" in image:
316
+ # return None, image
317
+
318
+ # return image, transcription
319
+
320
+ # # Gradio interface for speech-to-text
321
+ # speech_to_text_iface = gr.Interface(
322
+ # fn=transcribe_audio,
323
+ # inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
324
+ # outputs=gr.Textbox(label="Transcription"),
325
+ # title="Speech-to-Text Transcription",
326
+ # description="Upload an audio file to transcribe speech into text.",
327
+ # )
328
+
329
+ # # Gradio interface for voice-to-image and chat
330
+ # voice_to_image_and_chat_iface = gr.Interface(
331
+ # fn=process_audio_and_generate_results,
332
+ # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
333
+ # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
334
+ # title="Voice-to-Image",
335
+ # description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
336
+ # )
337
+
338
+ # # Gradio interface for Question Answering
339
+ # qa_iface = gr.Interface(
340
+ # fn=answer_question,
341
+ # inputs=gr.Textbox(label="Ask a question"),
342
+ # outputs=gr.Textbox(label="Answer"),
343
+ # title="Question Answering",
344
+ # description="Ask a factual question, and get an answer.",
345
+ # )
346
+
347
+ # # Combined Gradio app
348
+ # iface = gr.TabbedInterface(
349
+ # interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, qa_iface],
350
+ # tab_names=["Speech-to-Text", "Voice-to-Image", "Question Answering"]
351
+ # )
352
+
353
+ # # Launch Gradio interface
354
+ # iface.launch(debug=True, share=True)
355
+
356
 
357
 
358