SaoYear
committed on
Commit
·
2547be7
1
Parent(s):
785ef15
add demo page application
Browse files
app.py
CHANGED
|
@@ -23,13 +23,16 @@ import librosa as lb
|
|
| 23 |
import yaml
|
| 24 |
import numpy as np
|
| 25 |
import matplotlib.pyplot as plt
|
|
|
|
| 26 |
from model.cleanmel import CleanMel
|
| 27 |
from model.vocos.pretrained import Vocos
|
| 28 |
from model.stft import InputSTFT, TargetMel
|
| 29 |
|
| 30 |
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 31 |
def read_audio(file_path):
|
|
|
|
| 32 |
audio, sample_rate = sf.read(file_path)
|
|
|
|
| 33 |
if audio.ndim > 1:
|
| 34 |
# select the loudest channel if stereo
|
| 35 |
audio = audio[:, np.argmax(np.abs(audio).mean(axis=0))]
|
|
@@ -176,29 +179,39 @@ def reset_everything():
|
|
| 176 |
demo = gr.Blocks()
|
| 177 |
with gr.Blocks(title="CleanMel Demo") as demo:
|
| 178 |
gr.Markdown("## CleanMel Demo")
|
| 179 |
-
gr.Markdown("This demo showcases the CleanMel model for speech enhancement. \
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
with gr.Row():
|
| 184 |
-
audio_input = gr.Audio(label="Input Audio", type="filepath", sources="upload")
|
| 185 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
|
|
|
| 192 |
clear_btn = gr.Button(
|
| 193 |
"🗑️ Clear All",
|
| 194 |
variant="secondary",
|
| 195 |
size="lg"
|
| 196 |
-
)
|
| 197 |
-
|
| 198 |
output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
|
| 199 |
output_mel = gr.Image(label="Output LogMel Spectrogram", type="filepath", visible=True)
|
| 200 |
output_np = gr.File(label="Enhanced LogMel Spec. (.npy)", type="filepath")
|
| 201 |
-
|
| 202 |
enhance_button_map_L.click(
|
| 203 |
enhance_cleanmel_L_map,
|
| 204 |
inputs=audio_input,
|
|
@@ -212,14 +225,37 @@ with gr.Blocks(title="CleanMel Demo") as demo:
|
|
| 212 |
)
|
| 213 |
|
| 214 |
enhance_button_map_S.click(
|
| 215 |
-
enhance_cleanmel_S_map,
|
| 216 |
-
inputs=audio_input,
|
| 217 |
outputs=[output_audio, output_mel, output_np]
|
| 218 |
)
|
| 219 |
|
| 220 |
enhance_button_mask_S.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
enhance_cleanmel_S_mask,
|
| 222 |
-
inputs=
|
| 223 |
outputs=[output_audio, output_mel, output_np]
|
| 224 |
)
|
| 225 |
|
|
|
|
| 23 |
import yaml
|
| 24 |
import numpy as np
|
| 25 |
import matplotlib.pyplot as plt
|
| 26 |
+
from pydub import AudioSegment
|
| 27 |
from model.cleanmel import CleanMel
|
| 28 |
from model.vocos.pretrained import Vocos
|
| 29 |
from model.stft import InputSTFT, TargetMel
|
| 30 |
|
| 31 |
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 32 |
def read_audio(file_path):
|
| 33 |
+
assert file_path.endswith(('.wav', '.flac')), "Unsupported audio format. Please upload a .wav, .flac file."
|
| 34 |
audio, sample_rate = sf.read(file_path)
|
| 35 |
+
|
| 36 |
if audio.ndim > 1:
|
| 37 |
# select the loudest channel if stereo
|
| 38 |
audio = audio[:, np.argmax(np.abs(audio).mean(axis=0))]
|
|
|
|
| 179 |
demo = gr.Blocks()
|
| 180 |
with gr.Blocks(title="CleanMel Demo") as demo:
|
| 181 |
gr.Markdown("## CleanMel Demo")
|
| 182 |
+
gr.Markdown("This demo showcases the CleanMel model for speech enhancement. <br> \
|
| 183 |
+
Only **.wav** and **.flac** files are supported. <br> \
|
| 184 |
+
--- <br> \
|
| 185 |
+
The model is running on CPU. Please be patient and wait for the result. <br> \
|
| 186 |
+
Inference time reference: <br> \
|
| 187 |
+
- CleanMel_L: **10 mins** for **10-second** audio <br> \
|
| 188 |
+
- CleanMel_S: **4 mins** for **10-second** audio <br> ")
|
| 189 |
|
| 190 |
with gr.Row():
|
|
|
|
| 191 |
with gr.Column():
|
| 192 |
+
audio_input = gr.Audio(label="Input Audio", type="filepath", sources="upload")
|
| 193 |
+
audio_input_record = gr.Audio(label="Input Audio (Record)", type="filepath", sources="microphone")
|
| 194 |
+
with gr.Row():
|
| 195 |
+
with gr.Column():
|
| 196 |
+
enhance_button_map_S = gr.Button("Enhance File (offline CleanMel_S_map)")
|
| 197 |
+
enhance_button_mask_S = gr.Button("Enhance File (offline CleanMel_S_mask)")
|
| 198 |
+
enhance_button_map_L = gr.Button("Enhance File (offline CleanMel_L_map)")
|
| 199 |
+
enhance_button_mask_L = gr.Button("Enhance File (offline CleanMel_L_mask)")
|
| 200 |
|
| 201 |
+
with gr.Column():
|
| 202 |
+
enhance_button_map_Sr = gr.Button("Enhance Recorded Audio (offline CleanMel_S_map)")
|
| 203 |
+
enhance_button_mask_Sr = gr.Button("Enhance Recorded Audio (offline CleanMel_S_mask)")
|
| 204 |
+
enhance_button_map_Lr = gr.Button("Enhance Recorded Audio (offline CleanMel_L_map)")
|
| 205 |
+
enhance_button_mask_Lr = gr.Button("Enhance Recorded Audio (offline CleanMel_L_mask)")
|
| 206 |
+
with gr.Row():
|
| 207 |
clear_btn = gr.Button(
|
| 208 |
"🗑️ Clear All",
|
| 209 |
variant="secondary",
|
| 210 |
size="lg"
|
| 211 |
+
)
|
|
|
|
| 212 |
output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
|
| 213 |
output_mel = gr.Image(label="Output LogMel Spectrogram", type="filepath", visible=True)
|
| 214 |
output_np = gr.File(label="Enhanced LogMel Spec. (.npy)", type="filepath")
|
|
|
|
| 215 |
enhance_button_map_L.click(
|
| 216 |
enhance_cleanmel_L_map,
|
| 217 |
inputs=audio_input,
|
|
|
|
| 225 |
)
|
| 226 |
|
| 227 |
enhance_button_map_S.click(
|
| 228 |
+
enhance_cleanmel_S_map,
|
| 229 |
+
inputs=audio_input,
|
| 230 |
outputs=[output_audio, output_mel, output_np]
|
| 231 |
)
|
| 232 |
|
| 233 |
enhance_button_mask_S.click(
|
| 234 |
+
enhance_cleanmel_S_mask,
|
| 235 |
+
inputs=audio_input,
|
| 236 |
+
outputs=[output_audio, output_mel, output_np]
|
| 237 |
+
)
|
| 238 |
+
enhance_button_map_Lr.click(
|
| 239 |
+
enhance_cleanmel_L_map,
|
| 240 |
+
inputs=audio_input_record,
|
| 241 |
+
outputs=[output_audio, output_mel, output_np]
|
| 242 |
+
)
|
| 243 |
+
|
| 244 |
+
enhance_button_mask_Lr.click(
|
| 245 |
+
enhance_cleanmel_L_mask,
|
| 246 |
+
inputs=audio_input_record,
|
| 247 |
+
outputs=[output_audio, output_mel, output_np]
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
enhance_button_map_Sr.click(
|
| 251 |
+
enhance_cleanmel_S_map,
|
| 252 |
+
inputs=audio_input_record,
|
| 253 |
+
outputs=[output_audio, output_mel, output_np]
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
enhance_button_mask_Sr.click(
|
| 257 |
enhance_cleanmel_S_mask,
|
| 258 |
+
inputs=audio_input_record,
|
| 259 |
outputs=[output_audio, output_mel, output_np]
|
| 260 |
)
|
| 261 |
|