mfrng committed on
Commit
798cb1c
·
verified ·
1 Parent(s): 2c7a0a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -119
app.py CHANGED
@@ -16,8 +16,6 @@ import webrtcvad
16
  from pesq import pesq
17
  from pystoi import stoi
18
 
19
- # --- Helper Functions ---
20
-
21
  def load_audio(file_obj):
22
  y, sr = librosa.load(file_obj, sr=16000)
23
  return y, sr
@@ -29,59 +27,47 @@ def plot_waveform(y, sr, title):
29
  plt.figure(figsize=(10, 2))
30
  librosa.display.waveshow(y, sr=sr)
31
  plt.title(title)
32
- plt.tight_layout()
33
  buf = io.BytesIO()
 
34
  plt.savefig(buf, format='png')
35
  plt.close()
36
  buf.seek(0)
37
  return buf
38
 
39
  def plot_spectrogram(y, sr, title):
40
- plt.figure(figsize=(10, 4))
41
  D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
42
  librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
43
  plt.colorbar(format='%+2.0f dB')
44
  plt.title(title)
45
- plt.tight_layout()
46
  buf = io.BytesIO()
 
47
  plt.savefig(buf, format='png')
48
  plt.close()
49
  buf.seek(0)
50
  return buf
51
 
52
- def compute_snr(original, enhanced):
53
- noise = original - enhanced
54
- snr = 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + 1e-10))
55
- return snr
56
-
57
  def vad_plot(y, sr, title):
58
  vad = webrtcvad.Vad(2)
59
  if sr != 16000:
60
  y = librosa.resample(y, orig_sr=sr, target_sr=16000)
61
  sr = 16000
62
-
63
  frame_duration_ms = 30
64
  frame_size = int(sr * frame_duration_ms / 1000)
65
- if len(y) % frame_size != 0:
66
- pad_len = frame_size - (len(y) % frame_size)
67
- y = np.pad(y, (0, pad_len))
68
-
69
  frames = np.split(y, len(y) // frame_size)
70
  voiced = []
71
  for frame in frames:
72
  pcm = (frame * 32767).astype(np.int16).tobytes()
73
  try:
74
  voiced.append(vad.is_speech(pcm, sr))
75
- except Exception:
76
  voiced.append(False)
77
-
78
  plt.figure(figsize=(10, 1.5))
79
  plt.plot(voiced, drawstyle='steps-mid')
80
  plt.title(title)
81
- plt.xlabel("Frame Index")
82
- plt.ylabel("Speech")
83
- plt.tight_layout()
84
  buf = io.BytesIO()
 
85
  plt.savefig(buf, format='png')
86
  plt.close()
87
  buf.seek(0)
@@ -93,141 +79,117 @@ def compute_pesq_mfcc_stoi(original_path, enhanced_path):
93
  enhanced, _ = librosa.load(enhanced_path, sr=sr)
94
  pesq_score = pesq(sr, original, enhanced, 'wb')
95
  stoi_score = stoi(original, enhanced, sr, extended=False)
96
- mfcc_orig = librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
97
- mfcc_enh = librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
98
- mfcc_diff = np.mean(np.abs(mfcc_orig - mfcc_enh))
 
99
  return pesq_score, stoi_score, mfcc_diff
100
 
101
- # --- Enhancement Functions ---
102
-
103
- def noise_reduction(y, sr):
104
- return reduce_noise(y=y, sr=sr)
105
-
106
- def voice_isolation(y, sr):
107
- return y
108
-
109
- def reverb_cleanup(y, sr):
110
- return medfilt(y, kernel_size=5)
111
-
112
- def volume_normalize(y):
113
- peak = np.max(np.abs(y))
114
- if peak > 0:
115
- y = y / peak
116
- return y
117
-
118
- def language_aware_tuning(y, sr):
119
- return librosa.effects.preemphasis(y)
120
-
121
- def amplify(y, factor=1.5):
122
- y = y * factor
123
- y = np.clip(y, -1.0, 1.0)
124
- return y
125
 
126
- # --- Processing Function ---
 
 
 
 
127
 
128
- def process_files(files, noise_reduc, voice_iso, reverb_clean, vol_norm, lang_tune, amplify_audio, progress=gr.Progress()):
129
- results = []
130
- metrics = []
131
  temp_dir = tempfile.mkdtemp()
132
- zip_path = os.path.join(temp_dir, "enhanced_results.zip")
133
  zipf = zipfile.ZipFile(zip_path, 'w')
134
-
135
  total = len(files)
 
136
  for i, file_obj in enumerate(files):
137
  progress((i + 1) / total, desc=f"Processing {file_obj.name}")
138
-
139
  y, sr = load_audio(file_obj)
140
  original_y = y.copy()
141
 
142
- if noise_reduc: y = noise_reduction(y, sr)
143
- if voice_iso: y = voice_isolation(y, sr)
144
- if reverb_clean: y = reverb_cleanup(y, sr)
145
- if vol_norm: y = volume_normalize(y)
146
- if lang_tune: y = language_aware_tuning(y, sr)
147
- if amplify_audio: y = amplify(y)
148
-
149
- base_name = os.path.splitext(file_obj.name)[0]
150
- original_path = os.path.join(temp_dir, f"{base_name}_original.wav")
151
- enhanced_path = os.path.join(temp_dir, f"{base_name}_enhanced.wav")
152
-
153
- save_audio(original_y, sr, original_path)
154
- save_audio(y, sr, enhanced_path)
155
-
156
- for func, suffix in [
157
- (plot_waveform, "waveform"),
158
- (plot_spectrogram, "spectrogram"),
159
- (vad_plot, "vad")
160
- ]:
161
- for label, data in [("original", original_y), ("enhanced", y)]:
162
- img = func(data, sr, f"{label.title()} {suffix.title()}")
163
- img_path = os.path.join(temp_dir, f"{base_name}_{suffix}_{label}.png")
164
  with open(img_path, "wb") as f:
165
- f.write(img.read())
166
  zipf.write(img_path, arcname=os.path.basename(img_path))
167
 
168
- try:
169
- pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
170
- except Exception:
171
- pesq_score, stoi_score, mfcc_diff = None, None, None
 
 
 
172
 
173
  snr = compute_snr(original_y, y)
174
-
175
  metrics.append({
176
  "file": file_obj.name,
177
- "SNR (dB)": snr,
178
  "PESQ": pesq_score,
179
  "STOI": stoi_score,
180
  "MFCC Diff": mfcc_diff
181
  })
182
 
183
- zipf.write(original_path, arcname=os.path.basename(original_path))
184
- zipf.write(enhanced_path, arcname=os.path.basename(enhanced_path))
185
-
186
- metrics_df = pd.DataFrame(metrics)
187
- csv_path = os.path.join(temp_dir, "metrics.csv")
188
- metrics_df.to_csv(csv_path, index=False)
189
- zipf.write(csv_path, arcname="metrics.csv")
190
 
 
 
 
 
191
  zipf.close()
192
- return zip_path
193
 
194
- # --- Gradio UI ---
 
 
195
 
196
- def run_enhancement(files, nr, vi, reverb, vol, lang, amp):
197
  if not files:
198
- return None, None, "Please upload at least one audio file.", gr.update(visible=False)
199
- if not (nr or vi or reverb or vol or lang or amp):
200
- return None, None, "Enable at least one enhancement option.", gr.update(visible=True, value="No enhancements selected!")
201
- zip_path = process_files(files, nr, vi, reverb, vol, lang, amp)
202
- wav_files = [f for f in os.listdir(os.path.dirname(zip_path)) if f.endswith("_enhanced.wav")]
203
- first_output_wav = os.path.join(os.path.dirname(zip_path), wav_files[0]) if wav_files else None
204
- return zip_path, first_output_wav, "Enhancement complete.", gr.update(visible=False)
205
 
206
  with gr.Blocks() as demo:
207
- gr.Markdown("## AudioVoiceEnhancer.AI - Upload, Enhance, and Analyze Voice Files")
208
 
 
209
  with gr.Row():
210
- audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple")
211
- with gr.Row():
212
- noise_checkbox = gr.Checkbox(value=True, label="Noise Reduction")
213
- voice_iso_checkbox = gr.Checkbox(value=True, label="Voice Isolation")
214
- reverb_checkbox = gr.Checkbox(value=True, label="Reverb Cleanup")
215
- volume_checkbox = gr.Checkbox(value=True, label="Volume Normalize")
216
- lang_checkbox = gr.Checkbox(value=True, label="Language-Aware Tuning")
217
- amplify_checkbox = gr.Checkbox(value=False, label="Amplify (Boost Volume)")
218
-
219
- enhance_btn = gr.Button("Enhance Audio")
220
- warning_text = gr.Textbox(visible=False, label="Warning", interactive=False)
221
  output_zip = gr.File(label="Download ZIP")
222
- playback = gr.Audio(label="Preview Enhanced Audio", type="filepath")
223
- progress_label = gr.Label("Status")
224
 
225
- enhance_btn.click(
226
  fn=run_enhancement,
227
- inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox, amplify_checkbox],
228
- outputs=[output_zip, playback, progress_label, warning_text],
229
  show_progress=True
230
  )
231
 
232
- if __name__ == "__main__":
233
- demo.launch()
 
16
  from pesq import pesq
17
  from pystoi import stoi
18
 
 
 
19
def load_audio(file_obj):
    """Decode an uploaded audio file as mono float samples at 16 kHz.

    Returns:
        (samples, sample_rate) — sample_rate is always 16000 because the
        load is forced to that rate.
    """
    samples, rate = librosa.load(file_obj, sr=16000)
    return samples, rate
 
27
  plt.figure(figsize=(10, 2))
28
  librosa.display.waveshow(y, sr=sr)
29
  plt.title(title)
 
30
  buf = io.BytesIO()
31
+ plt.tight_layout()
32
  plt.savefig(buf, format='png')
33
  plt.close()
34
  buf.seek(0)
35
  return buf
36
 
37
def plot_spectrogram(y, sr, title):
    """Render a log-frequency spectrogram of `y` and return it as PNG bytes.

    Parameters:
        y: 1-D float audio samples.
        sr: sample rate in Hz.
        title: plot title string.

    Returns:
        io.BytesIO positioned at offset 0, containing the PNG image.
    """
    fig = plt.figure(figsize=(10, 3))
    try:
        # Convert linear STFT magnitudes to dB relative to the peak value.
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
        plt.colorbar(format='%+2.0f dB')
        plt.title(title)
        buf = io.BytesIO()
        plt.tight_layout()
        plt.savefig(buf, format='png')
    finally:
        # Close the figure even when rendering raises; otherwise figures
        # accumulate across requests and leak memory.
        plt.close(fig)
    buf.seek(0)
    return buf
49
 
 
 
 
 
 
50
  def vad_plot(y, sr, title):
51
  vad = webrtcvad.Vad(2)
52
  if sr != 16000:
53
  y = librosa.resample(y, orig_sr=sr, target_sr=16000)
54
  sr = 16000
 
55
  frame_duration_ms = 30
56
  frame_size = int(sr * frame_duration_ms / 1000)
57
+ y = np.pad(y, (0, frame_size - len(y) % frame_size)) if len(y) % frame_size != 0 else y
 
 
 
58
  frames = np.split(y, len(y) // frame_size)
59
  voiced = []
60
  for frame in frames:
61
  pcm = (frame * 32767).astype(np.int16).tobytes()
62
  try:
63
  voiced.append(vad.is_speech(pcm, sr))
64
+ except:
65
  voiced.append(False)
 
66
  plt.figure(figsize=(10, 1.5))
67
  plt.plot(voiced, drawstyle='steps-mid')
68
  plt.title(title)
 
 
 
69
  buf = io.BytesIO()
70
+ plt.tight_layout()
71
  plt.savefig(buf, format='png')
72
  plt.close()
73
  buf.seek(0)
 
79
  enhanced, _ = librosa.load(enhanced_path, sr=sr)
80
  pesq_score = pesq(sr, original, enhanced, 'wb')
81
  stoi_score = stoi(original, enhanced, sr, extended=False)
82
+ mfcc_diff = np.mean(np.abs(
83
+ librosa.feature.mfcc(original, sr, n_mfcc=13) -
84
+ librosa.feature.mfcc(enhanced, sr, n_mfcc=13)
85
+ ))
86
  return pesq_score, stoi_score, mfcc_diff
87
 
88
def compute_snr(original, enhanced):
    """Signal-to-noise ratio in dB, treating (original - enhanced) as noise.

    A tiny epsilon keeps the ratio finite when the two signals are identical.
    """
    residual = original - enhanced
    signal_power = np.sum(original ** 2)
    noise_power = np.sum(residual ** 2) + 1e-9
    return 10 * np.log10(signal_power / noise_power)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
def noise_reduction(y, sr):
    """Delegate denoising to the imported `reduce_noise` helper."""
    return reduce_noise(y=y, sr=sr)
94
def voice_isolation(y, sr):
    """Placeholder stage: voice isolation is not implemented yet.

    Returns `y` untouched so the pipeline shape stays uniform.
    """
    return y
95
def reverb_cleanup(y, sr):
    """Suppress short impulsive artifacts with a 5-tap median filter.

    `sr` is accepted only for signature consistency with the other
    enhancement stages; the filter itself does not use it.
    """
    return medfilt(y, kernel_size=5)
96
def volume_normalize(y):
    """Peak-normalize `y` so its maximum absolute value is 1.0.

    Returns `y` unchanged when the signal is silent (peak == 0) to avoid
    division by zero. The peak is computed once instead of twice as in
    the original conditional expression.
    """
    peak = np.max(np.abs(y))
    return y / peak if peak > 0 else y
97
def language_aware_tuning(y, sr):
    """Lightweight "tuning" pass: apply librosa pre-emphasis to `y`.

    `sr` is unused; it is kept so every enhancer shares the same signature.
    """
    return librosa.effects.preemphasis(y)
98
 
99
def process_files(files, nr, vi, reverb, vol, lang, skip_metrics=False, progress=gr.Progress()):
    """Enhance each uploaded audio file and bundle all outputs into one ZIP.

    For every file: load at 16 kHz, apply the enabled enhancement stages,
    save original/enhanced WAVs, render waveform/spectrogram/VAD plots for
    both signals, and record quality metrics (SNR always; PESQ/STOI/MFCC
    unless `skip_metrics` is set). Everything is written into a fresh temp
    directory and archived together with a metrics.csv summary.

    Returns:
        (zip_path, preview_path) where preview_path is the first enhanced
        WAV found, or None when no files were processed.
    """
    metrics = []
    temp_dir = tempfile.mkdtemp()
    zip_path = os.path.join(temp_dir, "enhanced_output.zip")
    total = len(files)

    # `with` guarantees the archive is finalized even if a file fails mid-loop.
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for i, file_obj in enumerate(files):
            progress((i + 1) / total, desc=f"Processing {file_obj.name}")
            y, sr = load_audio(file_obj)
            original_y = y.copy()

            if nr: y = noise_reduction(y, sr)
            if vi: y = voice_isolation(y, sr)
            if reverb: y = reverb_cleanup(y, sr)
            if vol: y = volume_normalize(y)
            if lang: y = language_aware_tuning(y, sr)

            # Gradio's file_obj.name is typically a full temp-file path; take
            # the basename so outputs land inside temp_dir rather than keeping
            # foreign directory components in the join.
            name = os.path.splitext(os.path.basename(file_obj.name))[0]
            orig_path = os.path.join(temp_dir, f"{name}_original.wav")
            enh_path = os.path.join(temp_dir, f"{name}_enhanced.wav")
            save_audio(original_y, sr, orig_path)
            save_audio(y, sr, enh_path)

            # Render and archive the three diagnostic plots for both signals.
            for plot_func, label in [(plot_waveform, "waveform"), (plot_spectrogram, "spectrogram"), (vad_plot, "vad")]:
                for typ, signal in [("original", original_y), ("enhanced", y)]:
                    buf = plot_func(signal, sr, f"{typ.title()} {label.title()}")
                    img_path = os.path.join(temp_dir, f"{name}_{label}_{typ}.png")
                    with open(img_path, "wb") as f:
                        f.write(buf.read())
                    zipf.write(img_path, arcname=os.path.basename(img_path))

            if skip_metrics:
                pesq_score = stoi_score = mfcc_diff = None
            else:
                try:
                    pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(orig_path, enh_path)
                except Exception:
                    # Metrics are best-effort: PESQ/STOI can raise on very short
                    # or mismatched signals; report None instead of aborting the
                    # whole batch. (Narrowed from the original bare `except:`,
                    # which also swallowed KeyboardInterrupt/SystemExit.)
                    pesq_score, stoi_score, mfcc_diff = None, None, None

            snr = compute_snr(original_y, y)
            metrics.append({
                "file": file_obj.name,
                "SNR": snr,
                "PESQ": pesq_score,
                "STOI": stoi_score,
                "MFCC Diff": mfcc_diff
            })

            zipf.write(orig_path, arcname=os.path.basename(orig_path))
            zipf.write(enh_path, arcname=os.path.basename(enh_path))

        # Summarize per-file metrics into a CSV and include it in the archive.
        df = pd.DataFrame(metrics)
        metrics_path = os.path.join(temp_dir, "metrics.csv")
        df.to_csv(metrics_path, index=False)
        zipf.write(metrics_path, arcname="metrics.csv")

    enhanced_files = [f for f in os.listdir(temp_dir) if f.endswith("_enhanced.wav")]
    preview_path = os.path.join(temp_dir, enhanced_files[0]) if enhanced_files else None
    return zip_path, preview_path
160
 
161
def run_enhancement(files, nr, vi, reverb, vol, lang, skip_metrics):
    """Validate the request, then delegate the heavy lifting to process_files.

    Returns:
        (zip_path, preview_audio, status_message, warning_update), matching
        the outputs wired to the Gradio click handler.
    """
    # Guard: nothing uploaded at all.
    if not files:
        return None, None, "Upload audio files.", gr.update(visible=False)
    # Guard: every enhancement toggle is switched off.
    enabled = (nr, vi, reverb, vol, lang)
    if not any(enabled):
        return None, None, "Select at least one enhancement.", gr.update(visible=True, value="No enhancements selected.")
    zip_path, preview = process_files(files, nr, vi, reverb, vol, lang, skip_metrics)
    return zip_path, preview, "Done!", gr.update(visible=False)
 
 
168
 
169
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("## 🎧 AudioVoiceEnhancer.AI")

    files = gr.File(label="Upload Audio", file_types=[".wav", ".mp3"], file_count="multiple")
    with gr.Row():
        nr = gr.Checkbox(label="Noise Reduction", value=True)
        vi = gr.Checkbox(label="Voice Isolation", value=True)
        reverb = gr.Checkbox(label="Reverb Cleanup", value=True)
        vol = gr.Checkbox(label="Volume Normalize", value=True)
        lang = gr.Checkbox(label="Language-Aware Tuning", value=True)
        skip_metrics = gr.Checkbox(label="🚀 Skip PESQ/STOI for Speed", value=True)

    run_btn = gr.Button("Enhance Audio")
    warning = gr.Textbox(visible=False, label="Warning")
    output_zip = gr.File(label="Download ZIP")
    output_audio = gr.Audio(label="Preview Enhanced", type="filepath")
    label = gr.Label("Status")

    run_btn.click(
        fn=run_enhancement,
        inputs=[files, nr, vi, reverb, vol, lang, skip_metrics],
        outputs=[output_zip, output_audio, label, warning],
        show_progress=True
    )

demo.queue()

# Launch only when executed as a script, so importing this module (e.g. from
# tests or another app) does not start a server.
if __name__ == "__main__":
    demo.launch()