mfrng committed on
Commit
2c7a0a6
·
verified ·
1 Parent(s): fb7b885

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -198
app.py CHANGED
@@ -16,11 +16,7 @@ import webrtcvad
16
  from pesq import pesq
17
  from pystoi import stoi
18
 
19
- # Models placeholder imports
20
- # from demucs import DemucsModel # For voice isolation
21
- # from voicefixer import VoiceFixer # For audio restoration
22
-
23
- # -- Helper functions --
24
 
25
  def load_audio(file_obj):
26
  y, sr = librosa.load(file_obj, sr=16000)
@@ -55,116 +51,63 @@ def plot_spectrogram(y, sr, title):
55
 
56
  def compute_snr(original, enhanced):
57
  noise = original - enhanced
58
- snr = 10 * np.log10(np.sum(original ** 2) / np.sum(noise ** 2) + 1e-10)
59
  return snr
60
 
61
  def vad_plot(y, sr, title):
62
- # Parameters
 
 
 
 
63
  frame_duration_ms = 30
64
- hop_length = int(sr * frame_duration_ms / 1000)
65
- n_fft = 2048
66
-
67
- # Compute STFT
68
- S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length)) ** 2
69
- freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
70
-
71
- # Focus on 80–3000 Hz (speech band)
72
- speech_band = np.where((freqs >= 80) & (freqs <= 3000))[0]
73
- speech_energy = S[speech_band, :].mean(axis=0)
74
-
75
- # Normalize energy
76
- speech_energy /= np.max(speech_energy) + 1e-6
77
-
78
- # Threshold for voice activity (tune as needed)
79
- voice_mask = speech_energy > 0.1
80
-
81
- # Time axis for plotting
82
- times = librosa.frames_to_time(np.arange(len(voice_mask)), sr=sr, hop_length=hop_length)
83
-
84
- # Merge voiced intervals
85
- intervals = []
86
- start = None
87
- for i, voiced in enumerate(voice_mask):
88
- if voiced and start is None:
89
- start = times[i]
90
- elif not voiced and start is not None:
91
- intervals.append((start, times[i]))
92
- start = None
93
- if start is not None:
94
- intervals.append((start, times[-1]))
95
-
96
- # Plot waveform + shaded voice regions
97
- plt.figure(figsize=(10, 2))
98
- librosa.display.waveshow(y, sr=sr, alpha=0.6)
99
- for (start_t, end_t) in intervals:
100
- plt.axvspan(start_t, end_t, color='green', alpha=0.3)
101
- plt.title(title + " (Voice Regions: 80–3000Hz energy)")
102
- plt.tight_layout()
103
 
 
 
 
 
 
 
104
  buf = io.BytesIO()
105
  plt.savefig(buf, format='png')
106
  plt.close()
107
  buf.seek(0)
108
  return buf
109
 
110
- def amplify_voice_fft(y, sr, gain_db=10):
111
- # Short-Time Fourier Transform
112
- hop_length = 512
113
- D = librosa.stft(y, n_fft=2048, hop_length=hop_length)
114
- mag, phase = np.abs(D), np.angle(D)
115
-
116
- freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
117
- voice_band = np.where((freqs >= 80) & (freqs <= 3000))[0]
118
-
119
- # Convert gain from dB to amplitude
120
- gain_amp = 10 ** (gain_db / 20.0)
121
-
122
- # Amplify only the voice frequency band
123
- mag[voice_band, :] *= gain_amp
124
-
125
- # Reconstruct
126
- D_new = mag * np.exp(1j * phase)
127
- y_out = librosa.istft(D_new, hop_length=hop_length)
128
- return y_out
129
-
130
- def amplify_voice(y, target_db=-20):
131
- rms = np.sqrt(np.mean(y**2))
132
- if rms > 0:
133
- current_db = 20 * np.log10(rms)
134
- gain = 10 ** ((target_db - current_db) / 20)
135
- y = y * gain
136
- return y
137
-
138
  def compute_pesq_mfcc_stoi(original_path, enhanced_path):
139
  sr = 16000
140
  original, _ = librosa.load(original_path, sr=sr)
141
  enhanced, _ = librosa.load(enhanced_path, sr=sr)
142
-
143
  pesq_score = pesq(sr, original, enhanced, 'wb')
144
  stoi_score = stoi(original, enhanced, sr, extended=False)
145
-
146
  mfcc_orig = librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
147
  mfcc_enh = librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
148
-
149
- # Compute MFCC distance (mean absolute difference)
150
  mfcc_diff = np.mean(np.abs(mfcc_orig - mfcc_enh))
151
-
152
  return pesq_score, stoi_score, mfcc_diff
153
 
154
- # Enhancement functions
155
 
156
  def noise_reduction(y, sr):
157
  return reduce_noise(y=y, sr=sr)
158
 
159
  def voice_isolation(y, sr):
160
- # Placeholder: Implement with Demucs or similar
161
- # For demo, return input
162
  return y
163
 
164
  def reverb_cleanup(y, sr):
165
- # Simple dereverberation placeholder: median filtering
166
- y_dereverb = medfilt(y, kernel_size=5)
167
- return y_dereverb
168
 
169
  def volume_normalize(y):
170
  peak = np.max(np.abs(y))
@@ -173,22 +116,16 @@ def volume_normalize(y):
173
  return y
174
 
175
  def language_aware_tuning(y, sr):
176
- # Placeholder for EQ adjustments by language
177
- # For demo, apply slight high-pass filter
178
- y_hp = librosa.effects.preemphasis(y)
179
- return y_hp
180
-
181
- # Main processing function
182
-
183
- def process_files(
184
- files,
185
- noise_reduc,
186
- voice_iso,
187
- reverb_clean,
188
- vol_norm,
189
- lang_tune,
190
- progress=gr.Progress()
191
- ):
192
  results = []
193
  metrics = []
194
  temp_dir = tempfile.mkdtemp()
@@ -202,73 +139,32 @@ def process_files(
202
  y, sr = load_audio(file_obj)
203
  original_y = y.copy()
204
 
205
- # Enhancement pipeline
206
- if noise_reduc:
207
- y = noise_reduction(y, sr)
208
- if voice_iso:
209
- y = voice_isolation(y, sr)
210
- if reverb_clean:
211
- y = reverb_cleanup(y, sr)
212
- if vol_norm:
213
- y = amplify_voice_fft(y, sr, gain_db=8)
214
- y = volume_normalize(y)
215
- if lang_tune:
216
- y = language_aware_tuning(y, sr)
217
-
218
- # Amplify voice as final step
219
- y = amplify_voice(y)
220
-
221
- # Extract extension and construct filenames
222
- base_name, ext = os.path.splitext(file_obj.name)
223
- ext = ext.lower()
224
- ext_format = ext[1:].upper() if ext.startswith('.') else ext.upper()
225
-
226
- enhanced_filename = f"{base_name}_enhanced{ext}"
227
- enhanced_path = os.path.join(temp_dir, enhanced_filename)
228
 
229
- try:
230
- sf.write(enhanced_path, y, sr, format=ext_format)
231
- except Exception:
232
- # fallback to WAV
233
- enhanced_filename = f"{base_name}_enhanced.wav"
234
- enhanced_path = os.path.join(temp_dir, enhanced_filename)
235
- sf.write(enhanced_path, y, sr)
236
 
237
- original_filename = f"{base_name}_original{ext}"
238
- original_path = os.path.join(temp_dir, original_filename)
239
 
240
- try:
241
- sf.write(original_path, original_y, sr, format=ext_format)
242
- except Exception:
243
- original_filename = f"{base_name}_original.wav"
244
- original_path = os.path.join(temp_dir, original_filename)
245
- sf.write(original_path, original_y, sr)
246
-
247
- # Generate plots
248
- waveform_orig = plot_waveform(original_y, sr, "Original Waveform")
249
- waveform_enh = plot_waveform(y, sr, "Enhanced Waveform")
250
-
251
- spectrogram_orig = plot_spectrogram(original_y, sr, "Original Spectrogram")
252
- spectrogram_enh = plot_spectrogram(y, sr, "Enhanced Spectrogram")
253
-
254
- vad_orig = vad_plot(original_y, sr, "Original VAD")
255
- vad_enh = vad_plot(y, sr, "Enhanced VAD")
256
-
257
- # Save plots and add to zip
258
- for img_buf, name in [
259
- (waveform_orig, "waveform_original.png"),
260
- (waveform_enh, "waveform_enhanced.png"),
261
- (spectrogram_orig, "spectrogram_original.png"),
262
- (spectrogram_enh, "spectrogram_enhanced.png"),
263
- (vad_orig, "vad_original.png"),
264
- (vad_enh, "vad_enhanced.png"),
265
  ]:
266
- plot_path = os.path.join(temp_dir, f"{base_name}_{name}")
267
- with open(plot_path, "wb") as f:
268
- f.write(img_buf.read())
269
- zipf.write(plot_path, arcname=os.path.basename(plot_path))
 
 
270
 
271
- # Compute metrics
272
  try:
273
  pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
274
  except Exception:
@@ -293,51 +189,45 @@ def process_files(
293
  zipf.write(csv_path, arcname="metrics.csv")
294
 
295
  zipf.close()
296
- first_enhanced = os.path.join(temp_dir, os.path.splitext(files[0].name)[0] + "_enhanced.wav")
297
- return zip_path, first_enhanced
 
298
 
299
- # Gradio UI
 
 
 
 
 
 
 
 
300
 
301
  with gr.Blocks() as demo:
302
- gr.Markdown("# AudioVoiceEnhancer.AI - Audio Enhancement for Transcription & Translation")
303
 
304
  with gr.Row():
305
- audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple", interactive=True)
306
  with gr.Row():
307
- noise_checkbox = gr.Checkbox(label="Noise Reduction", info="Reduce background noise", value=True)
308
- voice_iso_checkbox = gr.Checkbox(label="Voice Isolation", info="Isolate voice from background", value=True)
309
- reverb_checkbox = gr.Checkbox(label="Reverberation Cleanup", info="Reduce echo/reverb effects", value=True)
310
- volume_checkbox = gr.Checkbox(label="Volume Normalization", info="Normalize audio volume", value=True)
311
- lang_checkbox = gr.Checkbox(label="Language-aware Tuning", info="Tune audio clarity based on language", value=True)
 
312
 
313
  enhance_btn = gr.Button("Enhance Audio")
314
-
315
- output_zip = gr.File(label="Download ZIP of Enhanced Audio and Reports")
316
-
317
- #enhanced_audio_preview = gr.Audio(label="Preview First Enhanced Audio", interactive=False)
318
-
319
- progress_bar = gr.Label(value="Upload files and select enhancement options.")
320
-
321
- def run_enhancement(files, nr, vi, reverb, vol, lang):
322
- if not files or len(files) == 0:
323
- return None, "❌ Please upload at least one audio file."
324
-
325
- if not (nr or vi or reverb or vol or lang):
326
- return None, "⚠️ Please enable at least one enhancement option."
327
-
328
- try:
329
- zip_path, first_enhanced_audio = process_files(files, nr, vi, reverb, vol, lang)
330
- return zip_path, first_enhanced_audio, "Processing complete. Download your ZIP file below."
331
- except Exception as e:
332
- import traceback
333
- traceback.print_exc()
334
- return None, None, f"Error during enhancement: {str(e)}"
335
 
336
  enhance_btn.click(
337
  fn=run_enhancement,
338
- inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox],
339
- outputs=[output_zip, progress_bar],
340
- show_progress=True,
341
  )
342
 
343
- demo.launch()
 
 
16
  from pesq import pesq
17
  from pystoi import stoi
18
 
19
+ # --- Helper Functions ---
 
 
 
 
20
 
21
  def load_audio(file_obj):
22
  y, sr = librosa.load(file_obj, sr=16000)
 
51
 
52
  def compute_snr(original, enhanced):
53
  noise = original - enhanced
54
+ snr = 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + 1e-10))
55
  return snr
56
 
57
  def vad_plot(y, sr, title):
58
+ vad = webrtcvad.Vad(2)
59
+ if sr != 16000:
60
+ y = librosa.resample(y, orig_sr=sr, target_sr=16000)
61
+ sr = 16000
62
+
63
  frame_duration_ms = 30
64
+ frame_size = int(sr * frame_duration_ms / 1000)
65
+ if len(y) % frame_size != 0:
66
+ pad_len = frame_size - (len(y) % frame_size)
67
+ y = np.pad(y, (0, pad_len))
68
+
69
+ frames = np.split(y, len(y) // frame_size)
70
+ voiced = []
71
+ for frame in frames:
72
+ pcm = (frame * 32767).astype(np.int16).tobytes()
73
+ try:
74
+ voiced.append(vad.is_speech(pcm, sr))
75
+ except Exception:
76
+ voiced.append(False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ plt.figure(figsize=(10, 1.5))
79
+ plt.plot(voiced, drawstyle='steps-mid')
80
+ plt.title(title)
81
+ plt.xlabel("Frame Index")
82
+ plt.ylabel("Speech")
83
+ plt.tight_layout()
84
  buf = io.BytesIO()
85
  plt.savefig(buf, format='png')
86
  plt.close()
87
  buf.seek(0)
88
  return buf
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def compute_pesq_mfcc_stoi(original_path, enhanced_path):
91
  sr = 16000
92
  original, _ = librosa.load(original_path, sr=sr)
93
  enhanced, _ = librosa.load(enhanced_path, sr=sr)
 
94
  pesq_score = pesq(sr, original, enhanced, 'wb')
95
  stoi_score = stoi(original, enhanced, sr, extended=False)
 
96
  mfcc_orig = librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
97
  mfcc_enh = librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
 
 
98
  mfcc_diff = np.mean(np.abs(mfcc_orig - mfcc_enh))
 
99
  return pesq_score, stoi_score, mfcc_diff
100
 
101
+ # --- Enhancement Functions ---
102
 
103
  def noise_reduction(y, sr):
104
  return reduce_noise(y=y, sr=sr)
105
 
106
  def voice_isolation(y, sr):
 
 
107
  return y
108
 
109
  def reverb_cleanup(y, sr):
110
+ return medfilt(y, kernel_size=5)
 
 
111
 
112
  def volume_normalize(y):
113
  peak = np.max(np.abs(y))
 
116
  return y
117
 
118
  def language_aware_tuning(y, sr):
119
+ return librosa.effects.preemphasis(y)
120
+
121
def amplify(y, factor=1.5):
    """Scale ``y`` by ``factor`` and hard-limit the result to [-1.0, 1.0].

    Args:
        y: Audio samples (any array-like; plain lists are accepted).
        factor: Linear gain applied to every sample.

    Returns:
        A new NumPy array; the input is not modified.
    """
    # asarray lets plain sequences through; list * float would raise TypeError.
    boosted = np.asarray(y) * factor
    return np.clip(boosted, -1.0, 1.0)
125
+
126
+ # --- Processing Function ---
127
+
128
+ def process_files(files, noise_reduc, voice_iso, reverb_clean, vol_norm, lang_tune, amplify_audio, progress=gr.Progress()):
 
 
 
 
 
 
129
  results = []
130
  metrics = []
131
  temp_dir = tempfile.mkdtemp()
 
139
  y, sr = load_audio(file_obj)
140
  original_y = y.copy()
141
 
142
+ if noise_reduc: y = noise_reduction(y, sr)
143
+ if voice_iso: y = voice_isolation(y, sr)
144
+ if reverb_clean: y = reverb_cleanup(y, sr)
145
+ if vol_norm: y = volume_normalize(y)
146
+ if lang_tune: y = language_aware_tuning(y, sr)
147
+ if amplify_audio: y = amplify(y)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
+ base_name = os.path.splitext(file_obj.name)[0]
150
+ original_path = os.path.join(temp_dir, f"{base_name}_original.wav")
151
+ enhanced_path = os.path.join(temp_dir, f"{base_name}_enhanced.wav")
 
 
 
 
152
 
153
+ save_audio(original_y, sr, original_path)
154
+ save_audio(y, sr, enhanced_path)
155
 
156
+ for func, suffix in [
157
+ (plot_waveform, "waveform"),
158
+ (plot_spectrogram, "spectrogram"),
159
+ (vad_plot, "vad")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  ]:
161
+ for label, data in [("original", original_y), ("enhanced", y)]:
162
+ img = func(data, sr, f"{label.title()} {suffix.title()}")
163
+ img_path = os.path.join(temp_dir, f"{base_name}_{suffix}_{label}.png")
164
+ with open(img_path, "wb") as f:
165
+ f.write(img.read())
166
+ zipf.write(img_path, arcname=os.path.basename(img_path))
167
 
 
168
  try:
169
  pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
170
  except Exception:
 
189
  zipf.write(csv_path, arcname="metrics.csv")
190
 
191
  zipf.close()
192
+ return zip_path
193
+
194
+ # --- Gradio UI ---
195
 
196
def run_enhancement(files, nr, vi, reverb, vol, lang, amp):
    """Validate the request, run the enhancement pipeline and build the UI updates.

    Args:
        files: Uploaded files; an empty or missing value is rejected.
        nr, vi, reverb, vol, lang, amp: Flags for the individual enhancement
            steps; at least one must be truthy.

    Returns:
        A 4-tuple ``(zip_path, preview_wav_path, status_text, warning_update)``.
        The first two entries are ``None`` when validation fails.
    """
    if not files:
        return None, None, "Please upload at least one audio file.", gr.update(visible=False)
    if not any((nr, vi, reverb, vol, lang, amp)):
        return None, None, "Enable at least one enhancement option.", gr.update(visible=True, value="No enhancements selected!")

    zip_path = process_files(files, nr, vi, reverb, vol, lang, amp)

    out_dir = os.path.dirname(zip_path)
    # os.listdir returns entries in arbitrary order; sort so the file chosen
    # for the preview is the same on every run.
    wav_files = sorted(f for f in os.listdir(out_dir) if f.endswith("_enhanced.wav"))
    first_output_wav = os.path.join(out_dir, wav_files[0]) if wav_files else None
    return zip_path, first_output_wav, "Enhancement complete.", gr.update(visible=False)
205
 
206
  with gr.Blocks() as demo:
207
+ gr.Markdown("## AudioVoiceEnhancer.AI - Upload, Enhance, and Analyze Voice Files")
208
 
209
  with gr.Row():
210
+ audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple")
211
  with gr.Row():
212
+ noise_checkbox = gr.Checkbox(value=True, label="Noise Reduction")
213
+ voice_iso_checkbox = gr.Checkbox(value=True, label="Voice Isolation")
214
+ reverb_checkbox = gr.Checkbox(value=True, label="Reverb Cleanup")
215
+ volume_checkbox = gr.Checkbox(value=True, label="Volume Normalize")
216
+ lang_checkbox = gr.Checkbox(value=True, label="Language-Aware Tuning")
217
+ amplify_checkbox = gr.Checkbox(value=False, label="Amplify (Boost Volume)")
218
 
219
  enhance_btn = gr.Button("Enhance Audio")
220
+ warning_text = gr.Textbox(visible=False, label="Warning", interactive=False)
221
+ output_zip = gr.File(label="Download ZIP")
222
+ playback = gr.Audio(label="Preview Enhanced Audio", type="filepath")
223
+ progress_label = gr.Label("Status")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  enhance_btn.click(
226
  fn=run_enhancement,
227
+ inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox, amplify_checkbox],
228
+ outputs=[output_zip, playback, progress_label, warning_text],
229
+ show_progress=True
230
  )
231
 
232
+ if __name__ == "__main__":
233
+ demo.launch()