mfrng committed on
Commit
2ed41fc
·
verified ·
1 Parent(s): a2b60a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +272 -260
app.py CHANGED
@@ -1,260 +1,272 @@
1
- import os
2
- import io
3
- import tempfile
4
- import zipfile
5
- import numpy as np
6
- import pandas as pd
7
- import librosa
8
- import librosa.display
9
- import matplotlib.pyplot as plt
10
- import soundfile as sf
11
- import gradio as gr
12
-
13
- from scipy.signal import medfilt
14
- from noisereduce import reduce_noise
15
- import webrtcvad
16
- from pesq import pesq
17
- from pystoi import stoi
18
-
19
- # Models placeholder imports
20
- # from demucs import DemucsModel # For voice isolation
21
- # from voicefixer import VoiceFixer # For audio restoration
22
-
23
- # -- Helper functions --
24
-
25
- def load_audio(file_obj):
26
- y, sr = librosa.load(file_obj, sr=16000)
27
- return y, sr
28
-
29
- def save_audio(y, sr, path):
30
- sf.write(path, y, sr)
31
-
32
- def plot_waveform(y, sr, title):
33
- plt.figure(figsize=(10, 2))
34
- librosa.display.waveshow(y, sr=sr)
35
- plt.title(title)
36
- plt.tight_layout()
37
- buf = io.BytesIO()
38
- plt.savefig(buf, format='png')
39
- plt.close()
40
- buf.seek(0)
41
- return buf
42
-
43
- def plot_spectrogram(y, sr, title):
44
- plt.figure(figsize=(10, 4))
45
- D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
46
- librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
47
- plt.colorbar(format='%+2.0f dB')
48
- plt.title(title)
49
- plt.tight_layout()
50
- buf = io.BytesIO()
51
- plt.savefig(buf, format='png')
52
- plt.close()
53
- buf.seek(0)
54
- return buf
55
-
56
- def compute_snr(original, enhanced):
57
- noise = original - enhanced
58
- snr = 10 * np.log10(np.sum(original ** 2) / np.sum(noise ** 2) + 1e-10)
59
- return snr
60
-
61
- def vad_plot(y, sr, title):
62
- vad = webrtcvad.Vad(2) # Aggressiveness 0-3
63
- frame_duration = 30 # ms
64
- frame_length = int(sr * frame_duration / 1000)
65
- frames = [y[i:i+frame_length] for i in range(0, len(y), frame_length)]
66
- voiced = [vad.is_speech((frame * 32767).astype(np.int16).tobytes(), sr) for frame in frames]
67
- times = np.arange(len(voiced)) * frame_duration / 1000
68
-
69
- plt.figure(figsize=(10, 2))
70
- plt.plot(times, voiced, drawstyle='steps-pre')
71
- plt.ylim(-0.1, 1.1)
72
- plt.title(title)
73
- plt.xlabel('Time (s)')
74
- plt.ylabel('Voiced (1) / Unvoiced (0)')
75
- plt.tight_layout()
76
- buf = io.BytesIO()
77
- plt.savefig(buf, format='png')
78
- plt.close()
79
- buf.seek(0)
80
- return buf
81
-
82
- def compute_pesq_mfcc_stoi(original_path, enhanced_path):
83
- sr = 16000
84
- original, _ = librosa.load(original_path, sr=sr)
85
- enhanced, _ = librosa.load(enhanced_path, sr=sr)
86
-
87
- pesq_score = pesq(sr, original, enhanced, 'wb')
88
- stoi_score = stoi(original, enhanced, sr, extended=False)
89
-
90
- mfcc_orig = librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
91
- mfcc_enh = librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
92
-
93
- # Compute MFCC distance (mean absolute difference)
94
- mfcc_diff = np.mean(np.abs(mfcc_orig - mfcc_enh))
95
-
96
- return pesq_score, stoi_score, mfcc_diff
97
-
98
- # Enhancement functions
99
-
100
- def noise_reduction(y, sr):
101
- return reduce_noise(y=y, sr=sr)
102
-
103
- def voice_isolation(y, sr):
104
- # Placeholder: Implement with Demucs or similar
105
- # For demo, return input
106
- return y
107
-
108
- def reverb_cleanup(y, sr):
109
- # Simple dereverberation placeholder: median filtering
110
- y_dereverb = medfilt(y, kernel_size=5)
111
- return y_dereverb
112
-
113
- def volume_normalize(y):
114
- peak = np.max(np.abs(y))
115
- if peak > 0:
116
- y = y / peak
117
- return y
118
-
119
- def language_aware_tuning(y, sr):
120
- # Placeholder for EQ adjustments by language
121
- # For demo, apply slight high-pass filter
122
- y_hp = librosa.effects.preemphasis(y)
123
- return y_hp
124
-
125
- # Main processing function
126
-
127
- def process_files(
128
- files,
129
- noise_reduc,
130
- voice_iso,
131
- reverb_clean,
132
- vol_norm,
133
- lang_tune,
134
- progress=gr.Progress()
135
- ):
136
- results = []
137
- metrics = []
138
- temp_dir = tempfile.mkdtemp()
139
- zip_path = os.path.join(temp_dir, "enhanced_results.zip")
140
- zipf = zipfile.ZipFile(zip_path, 'w')
141
-
142
- total = len(files)
143
- for i, file_obj in enumerate(files):
144
- progress((i + 1) / total, desc=f"Processing {file_obj.name}")
145
-
146
- y, sr = load_audio(file_obj)
147
- original_y = y.copy()
148
-
149
- # Enhancement pipeline
150
- if noise_reduc:
151
- y = noise_reduction(y, sr)
152
- if voice_iso:
153
- y = voice_isolation(y, sr)
154
- if reverb_clean:
155
- y = reverb_cleanup(y, sr)
156
- if vol_norm:
157
- y = volume_normalize(y)
158
- if lang_tune:
159
- y = language_aware_tuning(y, sr)
160
-
161
- # Save enhanced audio
162
- enhanced_filename = os.path.splitext(file_obj.name)[0] + "_enhanced.wav"
163
- enhanced_path = os.path.join(temp_dir, enhanced_filename)
164
- save_audio(y, sr, enhanced_path)
165
-
166
- # Save original audio for comparison
167
- original_filename = os.path.splitext(file_obj.name)[0] + "_original.wav"
168
- original_path = os.path.join(temp_dir, original_filename)
169
- save_audio(original_y, sr, original_path)
170
-
171
- # Generate plots
172
- waveform_orig = plot_waveform(original_y, sr, "Original Waveform")
173
- waveform_enh = plot_waveform(y, sr, "Enhanced Waveform")
174
-
175
- spectrogram_orig = plot_spectrogram(original_y, sr, "Original Spectrogram")
176
- spectrogram_enh = plot_spectrogram(y, sr, "Enhanced Spectrogram")
177
-
178
- vad_orig = vad_plot(original_y, sr, "Original VAD")
179
- vad_enh = vad_plot(y, sr, "Enhanced VAD")
180
-
181
- # Save plots to files and add to zip
182
- plot_files = []
183
- for img_buf, name in [
184
- (waveform_orig, "waveform_original.png"),
185
- (waveform_enh, "waveform_enhanced.png"),
186
- (spectrogram_orig, "spectrogram_original.png"),
187
- (spectrogram_enh, "spectrogram_enhanced.png"),
188
- (vad_orig, "vad_original.png"),
189
- (vad_enh, "vad_enhanced.png"),
190
- ]:
191
- path = os.path.join(temp_dir, f"{os.path.splitext(file_obj.name)[0]}_{name}")
192
- with open(path, "wb") as f:
193
- f.write(img_buf.read())
194
- zipf.write(path, arcname=os.path.basename(path))
195
- plot_files.append(path)
196
-
197
- # Compute audio quality metrics
198
- try:
199
- pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
200
- except Exception as e:
201
- pesq_score, stoi_score, mfcc_diff = None, None, None
202
-
203
- snr = compute_snr(original_y, y)
204
-
205
- # Collect metrics
206
- metrics.append({
207
- "file": file_obj.name,
208
- "SNR (dB)": snr,
209
- "PESQ": pesq_score,
210
- "STOI": stoi_score,
211
- "MFCC Diff": mfcc_diff
212
- })
213
-
214
- # Add original and enhanced audio to zip
215
- zipf.write(original_path, arcname=os.path.basename(original_path))
216
- zipf.write(enhanced_path, arcname=os.path.basename(enhanced_path))
217
-
218
- # Save metrics CSV
219
- metrics_df = pd.DataFrame(metrics)
220
- csv_path = os.path.join(temp_dir, "metrics.csv")
221
- metrics_df.to_csv(csv_path, index=False)
222
- zipf.write(csv_path, arcname="metrics.csv")
223
-
224
- zipf.close()
225
- return zip_path
226
-
227
- # Gradio UI
228
-
229
- with gr.Blocks() as demo:
230
- gr.Markdown("# AudioVoiceEnhancer.AI - Audio Enhancement for Transcription & Translation")
231
-
232
- with gr.Row():
233
- audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple", interactive=True)
234
- with gr.Row():
235
- noise_checkbox = gr.Checkbox(label="Noise Reduction", info="Reduce background noise")
236
- voice_iso_checkbox = gr.Checkbox(label="Voice Isolation", info="Isolate voice from background")
237
- reverb_checkbox = gr.Checkbox(label="Reverberation Cleanup", info="Reduce echo/reverb effects")
238
- volume_checkbox = gr.Checkbox(label="Volume Normalization", info="Normalize audio volume")
239
- lang_checkbox = gr.Checkbox(label="Language-aware Tuning", info="Tune audio clarity based on language")
240
-
241
- enhance_btn = gr.Button("Enhance Audio")
242
-
243
- output_zip = gr.File(label="Download ZIP of Enhanced Audio and Reports")
244
-
245
- progress_bar = gr.Label(value="Upload files and select enhancement options.")
246
-
247
- def run_enhancement(files, nr, vi, reverb, vol, lang):
248
- if not files or len(files) == 0:
249
- return None, "Please upload at least one audio file."
250
- path = process_files(files, nr, vi, reverb, vol, lang)
251
- return path, "Processing complete. Download your ZIP file below."
252
-
253
- enhance_btn.click(
254
- fn=run_enhancement,
255
- inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox],
256
- outputs=[output_zip, progress_bar],
257
- show_progress=True,
258
- )
259
-
260
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import tempfile
4
+ import zipfile
5
+ import numpy as np
6
+ import pandas as pd
7
+ import librosa
8
+ import librosa.display
9
+ import matplotlib.pyplot as plt
10
+ import soundfile as sf
11
+ import gradio as gr
12
+
13
+ from scipy.signal import medfilt
14
+ from noisereduce import reduce_noise
15
+ import webrtcvad
16
+ from pesq import pesq
17
+ from pystoi import stoi
18
+
19
+ # Models placeholder imports
20
+ # from demucs import DemucsModel # For voice isolation
21
+ # from voicefixer import VoiceFixer # For audio restoration
22
+
23
+ # -- Helper functions --
24
+
25
def load_audio(file_obj):
    """Load an uploaded audio file as a waveform resampled to 16 kHz.

    Args:
        file_obj: A filesystem path (``str`` / ``os.PathLike``), an upload
            object whose ``.name`` attribute is a path string, or any other
            source that ``librosa.load`` accepts directly.

    Returns:
        tuple: ``(y, sr)`` exactly as returned by ``librosa.load`` called
        with ``sr=16000``.
    """
    if isinstance(file_obj, (str, os.PathLike)):
        source = file_obj
    else:
        # Prefer the on-disk path over the wrapper object so librosa opens
        # the file itself. NOTE(review): assumes upload objects expose their
        # temp-file path via ``.name`` (process_files relies on the same).
        name = getattr(file_obj, "name", None)
        source = name if isinstance(name, str) else file_obj
    y, sr = librosa.load(source, sr=16000)
    return y, sr
28
+
29
def save_audio(y, sr, path):
    """Write the samples ``y`` to ``path`` at sample rate ``sr`` via soundfile."""
    sf.write(file=path, data=y, samplerate=sr)
31
+
32
def plot_waveform(y, sr, title):
    """Render the waveform of ``y`` as a PNG image held in memory.

    Args:
        y: Audio samples to draw.
        sr: Sample rate of ``y``, forwarded to ``librosa.display.waveshow``.
        title: Title placed above the plot.

    Returns:
        io.BytesIO: PNG data, rewound to position 0 so the caller can
        ``read()`` it straight away.
    """
    fig = plt.figure(figsize=(10, 2))
    try:
        librosa.display.waveshow(y, sr=sr)
        plt.title(title)
        plt.tight_layout()
        buf = io.BytesIO()
        plt.savefig(buf, format='png')
    finally:
        # Close the figure even if drawing/saving fails, so a bad input does
        # not leave open figures piling up in pyplot.
        plt.close(fig)
    buf.seek(0)
    return buf
42
+
43
def plot_spectrogram(y, sr, title):
    """Render a dB-scaled STFT spectrogram of ``y`` as a PNG image in memory.

    Args:
        y: Audio samples to analyse.
        sr: Sample rate of ``y``, forwarded to ``librosa.display.specshow``.
        title: Title placed above the plot.

    Returns:
        io.BytesIO: PNG data, rewound to position 0 so the caller can
        ``read()`` it straight away.
    """
    # Compute the spectrogram before opening a figure: if the STFT fails
    # there is nothing to clean up. Magnitudes are in dB relative to the max.
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)

    fig = plt.figure(figsize=(10, 4))
    try:
        # Keep the implicit pyplot API here: plt.colorbar() picks up the
        # image that specshow just drew on the current axes.
        librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
        plt.colorbar(format='%+2.0f dB')
        plt.title(title)
        plt.tight_layout()
        buf = io.BytesIO()
        plt.savefig(buf, format='png')
    finally:
        # Close the figure even if drawing/saving fails.
        plt.close(fig)
    buf.seek(0)
    return buf
55
+
56
def compute_snr(original, enhanced):
    """Return the signal-to-noise ratio in dB, treating the removed residual as noise.

    The noise estimate is ``original - enhanced``; the result is
    ``10 * log10(sum(original**2) / sum(noise**2))`` with a small epsilon
    guarding both the division and the logarithm.

    Args:
        original: Reference samples (array-like).
        enhanced: Processed samples, broadcast-compatible with ``original``.

    Returns:
        float: SNR in dB. Always finite: identical inputs give a large
        positive value instead of ``inf``/``nan``, and an all-zero
        ``original`` gives a large negative value.
    """
    eps = 1e-10
    reference = np.asarray(original, dtype=np.float64)
    noise = reference - np.asarray(enhanced, dtype=np.float64)

    signal_power = np.sum(reference ** 2)
    noise_power = np.sum(noise ** 2)

    # eps in the denominator prevents division by zero when nothing was
    # changed; eps inside the log prevents log10(0) for a silent original.
    return float(10 * np.log10(signal_power / (noise_power + eps) + eps))
60
+
61
def vad_plot(y, sr, title):
    """Plot frame-level voice-activity decisions for ``y`` as a PNG image.

    The signal is resampled to 16 kHz if needed, zero-padded to a whole
    number of 30 ms frames, converted to 16-bit PCM and classified frame by
    frame with ``webrtcvad`` (aggressiveness 2). The decisions are drawn as
    a step plot.

    Args:
        y: 1-D array of audio samples; float samples in [-1, 1] are assumed
            (values outside that range are clipped before conversion).
        sr: Sample rate of ``y``.
        title: Title placed above the plot.

    Returns:
        io.BytesIO: PNG data, rewound to position 0. The caller in
        ``process_files`` reads this buffer and stores it as an image, so an
        image buffer (not the raw decisions) must be returned.
    """
    # webrtcvad requires 16-bit mono PCM, sample rate 16000, 10/20/30 ms chunks
    vad = webrtcvad.Vad(2)

    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000

    frame_duration_ms = 30  # Can be 10, 20, or 30
    frame_size = int(sr * frame_duration_ms / 1000)  # samples per frame

    # Zero-pad so the trailing partial frame becomes a full-length frame.
    remainder = len(y) % frame_size
    if remainder:
        y = np.pad(y, (0, frame_size - remainder))

    # Clip before scaling: samples outside [-1, 1] would wrap around when
    # cast to int16 and turn loud passages into garbage.
    pcm = (np.clip(y, -1.0, 1.0) * 32767).astype(np.int16)

    voiced = []
    # reshape (unlike np.split with 0 sections) also copes with empty audio.
    for frame in pcm.reshape(-1, frame_size):
        try:
            voiced.append(vad.is_speech(frame.tobytes(), sr))
        except Exception as e:
            # Best effort: a frame the detector rejects counts as unvoiced.
            print("VAD error:", e)
            voiced.append(False)

    times = np.arange(len(voiced)) * frame_duration_ms / 1000

    fig = plt.figure(figsize=(10, 2))
    try:
        plt.plot(times, np.asarray(voiced, dtype=int), drawstyle='steps-pre')
        plt.ylim(-0.1, 1.1)
        plt.title(title)
        plt.xlabel('Time (s)')
        plt.ylabel('Voiced (1) / Unvoiced (0)')
        plt.tight_layout()
        buf = io.BytesIO()
        plt.savefig(buf, format='png')
    finally:
        # Close the figure even if drawing/saving fails.
        plt.close(fig)
    buf.seek(0)
    return buf
93
+
94
def compute_pesq_mfcc_stoi(original_path, enhanced_path):
    """Compute PESQ, STOI and an MFCC distance between two audio files.

    Both files are loaded at 16 kHz and trimmed to their common length
    before comparison, so recordings of slightly different duration do not
    break the sample-aligned metrics.

    Args:
        original_path: Path to the reference recording.
        enhanced_path: Path to the processed recording.

    Returns:
        tuple: ``(pesq_score, stoi_score, mfcc_diff)`` where ``pesq_score``
        comes from ``pesq(..., 'wb')``, ``stoi_score`` from
        ``stoi(..., extended=False)`` and ``mfcc_diff`` is the mean absolute
        difference between the two 13-coefficient MFCC matrices.

    Raises:
        Whatever ``librosa``, ``pesq`` or ``stoi`` raise for unreadable or
        unsuitable input; nothing is caught here.
    """
    sr = 16000
    original, _ = librosa.load(original_path, sr=sr)
    enhanced, _ = librosa.load(enhanced_path, sr=sr)

    # Compare over the shared duration only: the MFCC subtraction below needs
    # matching shapes, which differing lengths would not give.
    common = min(len(original), len(enhanced))
    original = original[:common]
    enhanced = enhanced[:common]

    pesq_score = pesq(sr, original, enhanced, 'wb')
    stoi_score = stoi(original, enhanced, sr, extended=False)

    mfcc_orig = librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
    mfcc_enh = librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)

    # MFCC distance: mean absolute difference over all coefficients/frames.
    mfcc_diff = np.mean(np.abs(mfcc_orig - mfcc_enh))

    return pesq_score, stoi_score, mfcc_diff
109
+
110
+ # Enhancement functions
111
+
112
def noise_reduction(y, sr):
    """Run noisereduce's ``reduce_noise`` over ``y`` and return its output."""
    denoised = reduce_noise(y=y, sr=sr)
    return denoised
114
+
115
def voice_isolation(y, sr):
    """Placeholder voice-isolation stage: hands ``y`` back untouched.

    ``sr`` is accepted but unused. A real separation model (Demucs or
    similar) is meant to be plugged in here later.
    """
    return y
119
+
120
def reverb_cleanup(y, sr):
    """Placeholder dereverberation: median-filter ``y`` with a 5-sample kernel.

    ``sr`` is accepted but unused.
    """
    return medfilt(y, kernel_size=5)
124
+
125
def volume_normalize(y):
    """Peak-normalize ``y`` so that its largest absolute sample becomes 1.

    Args:
        y: Audio samples (array-like).

    Returns:
        The scaled samples as a new array, or ``y`` itself unchanged when it
        is empty or entirely zero (there is no peak to scale by).
    """
    samples = np.asarray(y)
    # np.max raises on a zero-size array, so bail out early for empty audio.
    if samples.size == 0:
        return y

    peak = np.max(np.abs(samples))
    if peak > 0:
        return samples / peak
    return y
130
+
131
def language_aware_tuning(y, sr):
    """Placeholder for per-language EQ: applies librosa's pre-emphasis to ``y``.

    ``sr`` is accepted but unused; no language information is consulted yet.
    """
    return librosa.effects.preemphasis(y)
136
+
137
+ # Main processing function
138
+
139
def _upload_path(file_obj):
    """Return the on-disk path of an upload given as a path or an object with ``.name``."""
    if isinstance(file_obj, (str, os.PathLike)):
        return os.fspath(file_obj)
    return file_obj.name


def process_files(
    files,
    noise_reduc,
    voice_iso,
    reverb_clean,
    vol_norm,
    lang_tune,
    progress=gr.Progress()
):
    """Enhance every uploaded file and bundle audio, plots and metrics in a ZIP.

    For each file the enabled stages run in a fixed order (noise reduction,
    voice isolation, reverb cleanup, volume normalization, language tuning).
    The ZIP receives, per file, six PNG plots plus the original and enhanced
    WAVs, and finally one ``metrics.csv`` covering all files.

    Args:
        files: Uploaded files; each item is a path or an object whose
            ``.name`` is the file's path.
        noise_reduc: Enable the noise-reduction stage.
        voice_iso: Enable the voice-isolation stage.
        reverb_clean: Enable the reverb-cleanup stage.
        vol_norm: Enable volume normalization.
        lang_tune: Enable language-aware tuning.
        progress: Callable invoked as ``progress(fraction, desc=...)`` once
            per file.

    Returns:
        str: Path of the ZIP archive inside a fresh temporary directory.
    """
    metrics = []
    temp_dir = tempfile.mkdtemp()
    zip_path = os.path.join(temp_dir, "enhanced_results.zip")
    total = len(files)

    # The context manager finalizes the archive even if a file fails midway.
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for i, file_obj in enumerate(files):
            source_path = _upload_path(file_obj)
            # Use only the base name: joining temp_dir with an absolute
            # upload path would discard temp_dir and write the outputs next
            # to the upload instead.
            display_name = os.path.basename(source_path)
            stem = os.path.splitext(display_name)[0]

            progress((i + 1) / total, desc=f"Processing {display_name}")

            y, sr = load_audio(source_path)
            original_y = y.copy()

            # Enhancement pipeline
            if noise_reduc:
                y = noise_reduction(y, sr)
            if voice_iso:
                y = voice_isolation(y, sr)
            if reverb_clean:
                y = reverb_cleanup(y, sr)
            if vol_norm:
                y = volume_normalize(y)
            if lang_tune:
                y = language_aware_tuning(y, sr)

            # Save enhanced and original audio (the metrics read them back)
            enhanced_path = os.path.join(temp_dir, f"{stem}_enhanced.wav")
            save_audio(y, sr, enhanced_path)
            original_path = os.path.join(temp_dir, f"{stem}_original.wav")
            save_audio(original_y, sr, original_path)

            # Generate plots
            plots = [
                (plot_waveform(original_y, sr, "Original Waveform"), "waveform_original.png"),
                (plot_waveform(y, sr, "Enhanced Waveform"), "waveform_enhanced.png"),
                (plot_spectrogram(original_y, sr, "Original Spectrogram"), "spectrogram_original.png"),
                (plot_spectrogram(y, sr, "Enhanced Spectrogram"), "spectrogram_enhanced.png"),
                (vad_plot(original_y, sr, "Original VAD"), "vad_original.png"),
                (vad_plot(y, sr, "Enhanced VAD"), "vad_enhanced.png"),
            ]
            for img_buf, name in plots:
                read = getattr(img_buf, "read", None)
                if read is None:
                    # Not a readable image buffer, so there is nothing to
                    # store; skip it rather than abort the whole batch.
                    continue
                zipf.writestr(f"{stem}_{name}", read())

            # Compute audio quality metrics (best effort: a failure leaves
            # the three scores empty but is reported instead of hidden).
            try:
                pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
            except Exception as e:
                print(f"Metric computation failed for {display_name}: {e}")
                pesq_score, stoi_score, mfcc_diff = None, None, None

            snr = compute_snr(original_y, y)

            # Collect metrics
            metrics.append({
                "file": display_name,
                "SNR (dB)": snr,
                "PESQ": pesq_score,
                "STOI": stoi_score,
                "MFCC Diff": mfcc_diff
            })

            # Add original and enhanced audio to zip
            zipf.write(original_path, arcname=os.path.basename(original_path))
            zipf.write(enhanced_path, arcname=os.path.basename(enhanced_path))

        # Save metrics CSV
        metrics_df = pd.DataFrame(metrics)
        csv_path = os.path.join(temp_dir, "metrics.csv")
        metrics_df.to_csv(csv_path, index=False)
        zipf.write(csv_path, arcname="metrics.csv")

    return zip_path
238
+
239
+ # Gradio UI
240
+
241
# Gradio UI: upload files, pick enhancement stages, download the result ZIP.
with gr.Blocks() as demo:
    gr.Markdown("# AudioVoiceEnhancer.AI - Audio Enhancement for Transcription & Translation")

    with gr.Row():
        audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple", interactive=True)
    with gr.Row():
        noise_checkbox = gr.Checkbox(label="Noise Reduction", info="Reduce background noise")
        voice_iso_checkbox = gr.Checkbox(label="Voice Isolation", info="Isolate voice from background")
        reverb_checkbox = gr.Checkbox(label="Reverberation Cleanup", info="Reduce echo/reverb effects")
        volume_checkbox = gr.Checkbox(label="Volume Normalization", info="Normalize audio volume")
        lang_checkbox = gr.Checkbox(label="Language-aware Tuning", info="Tune audio clarity based on language")

    enhance_btn = gr.Button("Enhance Audio")

    output_zip = gr.File(label="Download ZIP of Enhanced Audio and Reports")

    # Despite its name this is a status label, not a progress widget.
    progress_bar = gr.Label(value="Upload files and select enhancement options.")

    def run_enhancement(files, nr, vi, reverb, vol, lang, progress=gr.Progress()):
        """Click handler: validate the upload, run the pipeline, report status.

        ``progress`` is declared on the handler itself so Gradio can attach
        its tracker to this event; it is then forwarded to ``process_files``.

        Returns:
            tuple: ``(zip_path_or_None, status_message)`` matching the
            ``outputs`` of the click binding below.
        """
        if not files:
            return None, "Please upload at least one audio file."
        path = process_files(files, nr, vi, reverb, vol, lang, progress=progress)
        return path, "Processing complete. Download your ZIP file below."

    enhance_btn.click(
        fn=run_enhancement,
        inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox],
        outputs=[output_zip, progress_bar],
        show_progress=True,
    )

demo.launch()