SaoYear committed on
Commit
651ebfd
·
1 Parent(s): 8521c95

+Small models

Browse files
app.py CHANGED
@@ -14,7 +14,7 @@ def install_mamba():
14
  subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.4.0/causal_conv1d-1.4.0+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
15
  subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v1.2.0.post1/mamba_ssm-1.2.0.post1+cu122torch2.2cxx11abiTRUE-cp310-cp310-linux_x86_64.whl"))
16
 
17
- # install_mamba()
18
 
19
  import torch
20
  import spaces
@@ -30,11 +30,11 @@ from model.vocos.pretrained import Vocos
30
  from model.stft import InputSTFT, TargetMel
31
 
32
  DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
33
-
34
  def read_audio(file_path):
35
  audio, sample_rate = sf.read(file_path)
36
  if audio.ndim > 1:
37
- audio = audio[:, 0]
 
38
  if sample_rate != 16000:
39
  audio = lb.resample(audio, orig_sr=sample_rate, target_sr=16000)
40
  sample_rate = 16000
@@ -178,7 +178,9 @@ def reset_everything():
178
  demo = gr.Blocks()
179
  with gr.Blocks(title="CleanMel Demo") as demo:
180
  gr.Markdown("## CleanMel Demo")
181
- gr.Markdown("This demo showcases the CleanMel model for speech enhancement.")
 
 
182
 
183
  with gr.Row():
184
  audio_input = gr.Audio(label="Input Audio", type="filepath", sources="upload")
 
14
  subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.4.0/causal_conv1d-1.4.0+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
15
  subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v1.2.0.post1/mamba_ssm-1.2.0.post1+cu122torch2.2cxx11abiTRUE-cp310-cp310-linux_x86_64.whl"))
16
 
17
+ install_mamba()
18
 
19
  import torch
20
  import spaces
 
30
  from model.stft import InputSTFT, TargetMel
31
 
32
  DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 
33
  def read_audio(file_path):
34
  audio, sample_rate = sf.read(file_path)
35
  if audio.ndim > 1:
36
+ # select the loudest channel if stereo
37
+ audio = audio[:, np.argmax(np.abs(audio).mean(axis=0))]
38
  if sample_rate != 16000:
39
  audio = lb.resample(audio, orig_sr=sample_rate, target_sr=16000)
40
  sample_rate = 16000
 
178
  demo = gr.Blocks()
179
  with gr.Blocks(title="CleanMel Demo") as demo:
180
  gr.Markdown("## CleanMel Demo")
181
+ gr.Markdown("This demo showcases the CleanMel model for speech enhancement. \n \
182
+ Since the model is running on CPU, it may take a while to process the audio. \n \
183
+ Please be patient and wait for the result. \n")
184
 
185
  with gr.Row():
186
  audio_input = gr.Audio(label="Input Audio", type="filepath", sources="upload")
ckpts/CleanMel/offline_CleanMel_S_map.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76eab8ff6944d7a3402901d57ef0bf1b7ef8e22e34849457ceaee20c37d35e4
3
+ size 10101102
ckpts/CleanMel/offline_CleanMel_S_mask.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2209e32bc37cba6c901ec3f04a31b0d12f60074b34816edca4bdb5d58ce33a72
3
+ size 10101704