mfrng committed on
Commit
40b42fc
·
verified ·
1 Parent(s): 9beb94a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +260 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import tempfile
4
+ import zipfile
5
+ import numpy as np
6
+ import pandas as pd
7
+ import librosa
8
+ import librosa.display
9
+ import matplotlib.pyplot as plt
10
+ import soundfile as sf
11
+ import gradio as gr
12
+
13
+ from scipy.signal import medfilt
14
+ from noisereduce import reduce_noise
15
+ import webrtcvad
16
+ from pesq import pesq
17
+ from pystoi import stoi
18
+
19
+ # Models placeholder imports
20
+ # from demucs import DemucsModel # For voice isolation
21
+ # from voicefixer import VoiceFixer # For audio restoration
22
+
23
+ # -- Helper functions --
24
+
25
def load_audio(file_obj):
    """Decode an uploaded audio file, resampled to a fixed 16 kHz rate.

    Args:
        file_obj: path or file-like object accepted by ``librosa.load``.

    Returns:
        (samples, rate): float waveform array and the 16000 Hz rate.
    """
    target_rate = 16000
    samples, rate = librosa.load(file_obj, sr=target_rate)
    return samples, rate
28
+
29
def save_audio(y, sr, path):
    """Persist waveform ``y`` at sample rate ``sr`` to ``path`` via soundfile."""
    sf.write(path, y, sr)
31
+
32
def plot_waveform(y, sr, title):
    """Render the waveform of ``y`` to a PNG held in an in-memory buffer.

    Args:
        y: waveform samples.
        sr: sample rate in Hz.
        title: figure title.

    Returns:
        ``io.BytesIO`` positioned at 0, containing the PNG bytes.
    """
    plt.figure(figsize=(10, 2))
    librosa.display.waveshow(y, sr=sr)
    plt.title(title)
    plt.tight_layout()
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
    png_buffer.seek(0)
    return png_buffer
42
+
43
def plot_spectrogram(y, sr, title):
    """Render a log-frequency dB spectrogram of ``y`` as an in-memory PNG.

    Args:
        y: waveform samples.
        sr: sample rate in Hz.
        title: figure title.

    Returns:
        ``io.BytesIO`` positioned at 0, containing the PNG bytes.
    """
    plt.figure(figsize=(10, 4))
    # STFT magnitude converted to dB relative to the peak.
    db_spec = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    librosa.display.specshow(db_spec, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.tight_layout()
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
    png_buffer.seek(0)
    return png_buffer
55
+
56
def compute_snr(original, enhanced):
    """Signal-to-noise ratio in dB, treating ``original - enhanced`` as noise.

    Args:
        original: reference waveform (numpy array).
        enhanced: processed waveform, same shape as ``original``.

    Returns:
        SNR in dB; large when the two signals are nearly identical.
    """
    noise = original - enhanced
    # The epsilon must guard the DENOMINATOR: the original code added it to
    # the ratio (a/b + eps), which still divided by zero when the enhanced
    # signal equals the original.
    snr = 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + 1e-10))
    return snr
60
+
61
def vad_plot(y, sr, title):
    """Plot a frame-level voiced/unvoiced track computed with WebRTC VAD.

    Args:
        y: float waveform in [-1, 1].
        sr: sample rate in Hz. NOTE(review): webrtcvad only accepts
            8/16/32/48 kHz — true for the 16 kHz loader in this app,
            but verify if callers change.
        title: figure title.

    Returns:
        ``io.BytesIO`` positioned at 0, containing the PNG bytes.
    """
    vad = webrtcvad.Vad(2)  # Aggressiveness 0 (permissive) - 3 (strict)
    frame_duration = 30  # ms; webrtcvad accepts only 10/20/30 ms frames
    frame_length = int(sr * frame_duration / 1000)
    # Keep only complete frames: webrtcvad raises on a trailing partial
    # frame, which the original code produced whenever len(y) was not an
    # exact multiple of frame_length.
    n_frames = len(y) // frame_length
    frames = [y[i * frame_length:(i + 1) * frame_length] for i in range(n_frames)]
    # Convert each float frame to 16-bit PCM bytes as webrtcvad expects.
    voiced = [
        vad.is_speech((frame * 32767).astype(np.int16).tobytes(), sr)
        for frame in frames
    ]
    times = np.arange(len(voiced)) * frame_duration / 1000

    plt.figure(figsize=(10, 2))
    plt.plot(times, voiced, drawstyle='steps-pre')
    plt.ylim(-0.1, 1.1)
    plt.title(title)
    plt.xlabel('Time (s)')
    plt.ylabel('Voiced (1) / Unvoiced (0)')
    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    return buf
81
+
82
def compute_pesq_mfcc_stoi(original_path, enhanced_path):
    """Score enhancement quality by comparing two audio files at 16 kHz.

    Args:
        original_path: path to the reference (unprocessed) WAV.
        enhanced_path: path to the processed WAV.

    Returns:
        (pesq_score, stoi_score, mfcc_diff): wideband PESQ, non-extended
        STOI, and the mean absolute difference of 13 MFCC coefficients.
    """
    target_sr = 16000
    reference, _ = librosa.load(original_path, sr=target_sr)
    degraded, _ = librosa.load(enhanced_path, sr=target_sr)

    pesq_score = pesq(target_sr, reference, degraded, 'wb')
    stoi_score = stoi(reference, degraded, target_sr, extended=False)

    # Mean absolute MFCC difference as a cheap spectral-envelope distance.
    mfcc_ref = librosa.feature.mfcc(y=reference, sr=target_sr, n_mfcc=13)
    mfcc_deg = librosa.feature.mfcc(y=degraded, sr=target_sr, n_mfcc=13)
    mfcc_diff = np.mean(np.abs(mfcc_ref - mfcc_deg))

    return pesq_score, stoi_score, mfcc_diff
97
+
98
+ # Enhancement functions
99
+
100
def noise_reduction(y, sr):
    """Suppress background noise in ``y`` using the noisereduce library."""
    denoised = reduce_noise(y=y, sr=sr)
    return denoised
102
+
103
def voice_isolation(y, sr):
    """Isolate the voice track from background audio.

    NOTE: currently a pass-through placeholder — a source-separation
    model (e.g. Demucs) would go here. Returns ``y`` unchanged.
    """
    return y
107
+
108
def reverb_cleanup(y, sr):
    """Crude dereverberation placeholder: a 5-tap median filter.

    A real dereverberation algorithm would replace this; the median
    filter merely smooths short transients. ``sr`` is unused but kept
    for pipeline-stage signature consistency.
    """
    return medfilt(y, kernel_size=5)
112
+
113
def volume_normalize(y):
    """Peak-normalize ``y`` into [-1, 1]; all-zero input is returned as-is."""
    peak = np.max(np.abs(y))
    return y / peak if peak > 0 else y
118
+
119
def language_aware_tuning(y, sr):
    """Language-aware clarity tuning (placeholder).

    Intended home for per-language EQ adjustments; for now it applies
    pre-emphasis, a mild high-frequency boost. ``sr`` is unused but kept
    for pipeline-stage signature consistency.
    """
    return librosa.effects.preemphasis(y)
124
+
125
+ # Main processing function
126
+
127
def process_files(
    files,
    noise_reduc,
    voice_iso,
    reverb_clean,
    vol_norm,
    lang_tune,
    progress=gr.Progress()
):
    """Run the selected enhancement pipeline over the uploaded files.

    For each file: saves original and enhanced WAVs, renders comparison
    plots (waveform, spectrogram, VAD), computes quality metrics, and
    bundles everything plus a metrics CSV into one ZIP archive.

    Args:
        files: list of uploaded file objects, each exposing a ``.name`` path.
        noise_reduc, voice_iso, reverb_clean, vol_norm, lang_tune:
            booleans toggling the corresponding pipeline stage.
        progress: gradio progress reporter.

    Returns:
        Filesystem path of the resulting ZIP archive.
    """
    metrics = []
    temp_dir = tempfile.mkdtemp()
    zip_path = os.path.join(temp_dir, "enhanced_results.zip")

    # Context manager guarantees the archive is finalized/closed even if a
    # file fails mid-loop (the original leaked the handle on error).
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        total = len(files)
        for i, file_obj in enumerate(files):
            progress((i + 1) / total, desc=f"Processing {file_obj.name}")

            y, sr = load_audio(file_obj)
            original_y = y.copy()

            # Enhancement pipeline; each stage is independently optional.
            if noise_reduc:
                y = noise_reduction(y, sr)
            if voice_iso:
                y = voice_isolation(y, sr)
            if reverb_clean:
                y = reverb_cleanup(y, sr)
            if vol_norm:
                y = volume_normalize(y)
            if lang_tune:
                y = language_aware_tuning(y, sr)

            # basename() guards against file_obj.name being an absolute temp
            # path (gradio uploads are): splitext on an absolute path made
            # os.path.join ignore temp_dir entirely in the original code.
            base = os.path.splitext(os.path.basename(file_obj.name))[0]

            # Save enhanced and original audio for side-by-side comparison.
            enhanced_path = os.path.join(temp_dir, base + "_enhanced.wav")
            save_audio(y, sr, enhanced_path)

            original_path = os.path.join(temp_dir, base + "_original.wav")
            save_audio(original_y, sr, original_path)

            # Render before/after plots and stream them into the archive.
            plots = [
                (plot_waveform(original_y, sr, "Original Waveform"), "waveform_original.png"),
                (plot_waveform(y, sr, "Enhanced Waveform"), "waveform_enhanced.png"),
                (plot_spectrogram(original_y, sr, "Original Spectrogram"), "spectrogram_original.png"),
                (plot_spectrogram(y, sr, "Enhanced Spectrogram"), "spectrogram_enhanced.png"),
                (vad_plot(original_y, sr, "Original VAD"), "vad_original.png"),
                (vad_plot(y, sr, "Enhanced VAD"), "vad_enhanced.png"),
            ]
            for img_buf, name in plots:
                path = os.path.join(temp_dir, f"{base}_{name}")
                with open(path, "wb") as f:
                    f.write(img_buf.read())
                zipf.write(path, arcname=os.path.basename(path))

            # Objective quality metrics. PESQ/STOI may fail (e.g. on very
            # short or silent clips), so record None instead of aborting
            # the whole batch.
            try:
                pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(
                    original_path, enhanced_path)
            except Exception:
                pesq_score, stoi_score, mfcc_diff = None, None, None

            snr = compute_snr(original_y, y)

            metrics.append({
                "file": file_obj.name,
                "SNR (dB)": snr,
                "PESQ": pesq_score,
                "STOI": stoi_score,
                "MFCC Diff": mfcc_diff,
            })

            # Include both audio versions in the archive.
            zipf.write(original_path, arcname=os.path.basename(original_path))
            zipf.write(enhanced_path, arcname=os.path.basename(enhanced_path))

        # Per-file metrics table, included in the archive as CSV.
        metrics_df = pd.DataFrame(metrics)
        csv_path = os.path.join(temp_dir, "metrics.csv")
        metrics_df.to_csv(csv_path, index=False)
        zipf.write(csv_path, arcname="metrics.csv")

    return zip_path
226
+
227
+ # Gradio UI
228
+
229
# Gradio Blocks UI: file upload, per-stage toggles, and a ZIP download.
with gr.Blocks() as demo:
    gr.Markdown("# AudioVoiceEnhancer.AI - Audio Enhancement for Transcription & Translation")

    with gr.Row():
        # Multiple uploads allowed; restricted to common audio formats.
        audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple", interactive=True)
    with gr.Row():
        # One checkbox per optional pipeline stage in process_files().
        noise_checkbox = gr.Checkbox(label="Noise Reduction", info="Reduce background noise")
        voice_iso_checkbox = gr.Checkbox(label="Voice Isolation", info="Isolate voice from background")
        reverb_checkbox = gr.Checkbox(label="Reverberation Cleanup", info="Reduce echo/reverb effects")
        volume_checkbox = gr.Checkbox(label="Volume Normalization", info="Normalize audio volume")
        lang_checkbox = gr.Checkbox(label="Language-aware Tuning", info="Tune audio clarity based on language")

    enhance_btn = gr.Button("Enhance Audio")

    output_zip = gr.File(label="Download ZIP of Enhanced Audio and Reports")

    # NOTE(review): despite the name this is a status text Label, not a
    # progress bar; actual progress comes from gr.Progress in process_files.
    progress_bar = gr.Label(value="Upload files and select enhancement options.")

    def run_enhancement(files, nr, vi, reverb, vol, lang):
        # Returns (zip_path_or_None, status_message) matching the two outputs.
        if not files or len(files) == 0:
            return None, "Please upload at least one audio file."
        path = process_files(files, nr, vi, reverb, vol, lang)
        return path, "Processing complete. Download your ZIP file below."

    # Wire the button to the pipeline: checkbox states map positionally to
    # run_enhancement's stage flags.
    enhance_btn.click(
        fn=run_enhancement,
        inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox],
        outputs=[output_zip, progress_bar],
        show_progress=True,
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.37.0
2
+ librosa==0.9.2
3
+ matplotlib==3.7.1
4
+ soundfile==0.12.1
5
+ numpy==1.24.3
6
+ pandas==1.5.3
7
+ scipy==1.10.1
8
+ noisereduce==2.0.1
9
+ webrtcvad==2.0.10
10
+ pesq==0.0.3
11
+ pystoi==0.3.5