Files changed (1) hide show
  1. app.py +26 -39
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # import subprocess
2
 
3
  # # Install required libraries
@@ -16,7 +17,7 @@
16
  # import torch
17
  # import gradio as gr
18
  # from functools import lru_cache
19
- # from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
20
  # from huggingface_hub import login
21
  # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22
 
@@ -61,11 +62,6 @@
61
  # text_to_image.safety_checker = None
62
  # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63
 
64
- # # Load ChatGPT-like conversational model
65
- # chat_model_name = "microsoft/DialoGPT-medium"
66
- # chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
67
- # chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name)
68
-
69
  # # Preprocess audio file into NumPy array
70
  # def preprocess_audio(audio_path):
71
  # try:
@@ -97,16 +93,6 @@
97
  # except Exception as e:
98
  # return f"Error in image generation: {str(e)}"
99
 
100
- # # ChatGPT-like conversational response
101
- # def chat_with_gpt(prompt):
102
- # try:
103
- # inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
104
- # outputs = chat_model.generate(inputs, max_length=200, pad_token_id=chat_tokenizer.eos_token_id)
105
- # response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
106
- # return response
107
- # except Exception as e:
108
- # return f"Error in chat response: {str(e)}"
109
-
110
  # # Combined processing function
111
  # def process_audio_and_generate_results(audio_path):
112
  # transcription_result = {"result": None}
@@ -150,33 +136,25 @@
150
  # description="Upload an audio file to transcribe speech into text.",
151
  # )
152
 
153
- # # Gradio interface for voice-to-image and chat
154
- # voice_to_image_and_chat_iface = gr.Interface(
155
  # fn=process_audio_and_generate_results,
156
  # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
157
  # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
158
- # title="Voice-to-Image and Chat",
159
- # description="Upload an audio file to transcribe speech to text, generate an image based on the transcription, or chat with GPT.",
160
- # )
161
-
162
- # # Gradio interface for ChatGPT-like functionality
163
- # chat_iface = gr.Interface(
164
- # fn=chat_with_gpt,
165
- # inputs=gr.Textbox(label="Enter your prompt for ChatGPT"),
166
- # outputs=gr.Textbox(label="ChatGPT Response"),
167
- # title="ChatGPT",
168
- # description="Chat with GPT-like conversational AI.",
169
  # )
170
 
171
  # # Combined Gradio app
172
  # iface = gr.TabbedInterface(
173
- # interface_list=[speech_to_text_iface, voice_to_image_and_chat_iface, chat_iface],
174
- # tab_names=["Speech-to-Text", "Voice-to-Image & Chat", "ChatGPT"]
175
  # )
176
 
177
  # # Launch Gradio interface
178
  # iface.launch(debug=True, share=True)
179
 
 
180
  import subprocess
181
 
182
  # Install required libraries
@@ -240,7 +218,7 @@ text_to_image.enable_attention_slicing()
240
  text_to_image.safety_checker = None
241
  text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
242
 
243
- # Preprocess audio file into NumPy array
244
  def preprocess_audio(audio_path):
245
  try:
246
  audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
@@ -248,25 +226,35 @@ def preprocess_audio(audio_path):
248
  except Exception as e:
249
  return f"Error in preprocessing audio: {str(e)}"
250
 
251
- # Speech-to-text function with long-form transcription support
252
  @lru_cache(maxsize=10)
253
  def transcribe_audio(audio_path):
254
  try:
255
  audio_array = preprocess_audio(audio_path)
256
  if isinstance(audio_array, str): # Error message from preprocessing
257
  return audio_array
258
- result = speech_to_text(audio_array)
259
- # Combine text from multiple segments for long-form transcription
260
- transcription = " ".join(segment["text"] for segment in result["chunks"])
 
 
 
 
 
 
 
 
 
 
261
  return transcription
262
  except Exception as e:
263
  return f"Error in transcription: {str(e)}"
264
 
265
- # Text-to-image function
266
  @lru_cache(maxsize=10)
267
  def generate_image_from_text(text):
268
  try:
269
- image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
270
  return image
271
  except Exception as e:
272
  return f"Error in image generation: {str(e)}"
@@ -337,4 +325,3 @@ iface.launch(debug=True, share=True)
337
 
338
 
339
 
340
-
 
1
+
2
  # import subprocess
3
 
4
  # # Install required libraries
 
17
  # import torch
18
  # import gradio as gr
19
  # from functools import lru_cache
20
+ # from transformers import pipeline
21
  # from huggingface_hub import login
22
  # from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
23
 
 
62
  # text_to_image.safety_checker = None
63
  # text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
64
 
 
 
 
 
 
65
  # # Preprocess audio file into NumPy array
66
  # def preprocess_audio(audio_path):
67
  # try:
 
93
  # except Exception as e:
94
  # return f"Error in image generation: {str(e)}"
95
 
 
 
 
 
 
 
 
 
 
 
96
  # # Combined processing function
97
  # def process_audio_and_generate_results(audio_path):
98
  # transcription_result = {"result": None}
 
136
  # description="Upload an audio file to transcribe speech into text.",
137
  # )
138
 
139
+ # # Gradio interface for voice-to-image
140
+ # voice_to_image_iface = gr.Interface(
141
  # fn=process_audio_and_generate_results,
142
  # inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
143
  # outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
144
+ # title="Voice-to-Image",
145
+ # description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
 
 
 
 
 
 
 
 
 
146
  # )
147
 
148
  # # Combined Gradio app
149
  # iface = gr.TabbedInterface(
150
+ # interface_list=[speech_to_text_iface, voice_to_image_iface],
151
+ # tab_names=["Speech-to-Text", "Voice-to-Image"]
152
  # )
153
 
154
  # # Launch Gradio interface
155
  # iface.launch(debug=True, share=True)
156
 
157
+
158
  import subprocess
159
 
160
  # Install required libraries
 
218
  text_to_image.safety_checker = None
219
  text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
220
 
221
+ # Preprocess audio file into NumPy array with chunking for long files
222
  def preprocess_audio(audio_path):
223
  try:
224
  audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
 
226
  except Exception as e:
227
  return f"Error in preprocessing audio: {str(e)}"
228
 
229
# Speech-to-text function with chunking support for long-form transcription
@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Transcribe an audio file to text, splitting long audio into 30 s chunks.

    Args:
        audio_path: Filesystem path to the audio file. Results are memoized
            by path via ``lru_cache`` — NOTE(review): if the file at the same
            path changes, the stale cached transcription is returned.

    Returns:
        The combined transcription string, or an error-message string if
        preprocessing or transcription fails.
    """
    try:
        audio_array = preprocess_audio(audio_path)
        if isinstance(audio_array, str):  # Error message from preprocessing
            return audio_array

        chunk_size = 30 * 16000  # 30 seconds of 16 kHz samples per chunk
        pieces = []
        # Step directly over chunk start offsets; slicing past the end of the
        # array is safe, so no explicit ceil/num_chunks bookkeeping is needed.
        for start in range(0, len(audio_array), chunk_size):
            chunk = audio_array[start:start + chunk_size]
            result = speech_to_text(chunk)
            # assumes the ASR pipeline returns timestamped segments under
            # "chunks" (i.e. built with return_timestamps) — TODO confirm
            pieces.append(" ".join(segment["text"] for segment in result["chunks"]))

        # Join with a space: the previous `transcription += ...` dropped the
        # separator at every chunk boundary, gluing words together across
        # 30-second chunks (and built the string quadratically).
        return " ".join(pieces)
    except Exception as e:
        return f"Error in transcription: {str(e)}"
252
 
253
# Text-to-image function for HD image generation
@lru_cache(maxsize=10)
def generate_image_from_text(text):
    """Render a 1024x1024 image for *text* via the module-level diffusion
    pipeline ``text_to_image``.

    Returns the first image from the pipeline output, or an error-message
    string when generation raises. Up to 10 prompts are memoized.
    """
    try:
        generation = text_to_image(text, height=1024, width=1024)  # HD image resolution
        return generation.images[0]
    except Exception as e:
        return f"Error in image generation: {str(e)}"
 
325
 
326
 
327