add model select
- app.py +22 -5
- decode.py +2 -2
- model/spex_plus.py +5 -1
- model/spex_plus_plus.py +1 -1
app.py
CHANGED
@@ -8,19 +8,30 @@ from decode import InferencePipeline
 from datahandler import AudioMixer, fix_audio_format
 from omegaconf import OmegaConf
 
+MODEL_CACHE = {
+    "base_model": InferencePipeline(OmegaConf.load("config/config.yaml")),
+    "iter_model": InferencePipeline(OmegaConf.load("config/config_ira.yaml"))
+}
 
-
-cfg = OmegaConf.load("config/config_ira.yaml")
-inter = InferencePipeline(cfg)
+# cfg = OmegaConf.load("config/config_ira.yaml")
+# inter = InferencePipeline(cfg)
 datamix = AudioMixer()
 
 
 def gradio_TSE(input_audio_path, enroll_audio_path1, enroll_audio_path2, audio_type):
-
+    print(f"Model selection: {model_select}")
     print(f"User uploaded audio path: {input_audio_path}")
     print(f"User enroll audio path: {enroll_audio_path1}")
     print(f"User enroll audio path: {enroll_audio_path2}")
 
+    # if model_select == "base_model":
+    #     cfg_path = "config/config_base.yaml"
+    # elif model_select == "iter_model":
+    #     cfg_path = "config/config_iter.yaml"
+    # else:
+    #     raise ValueError("Unknown model type")
+
+    inter = MODEL_CACHE[model_select]
 
     audio_info = sf.info(input_audio_path)
     print(f"Sample rate: {audio_info.samplerate} Hz")

@@ -85,6 +96,12 @@ with gr.Blocks() as demo:
         value="clean",
         label="Input audio type?"
     )
+
+    model_select = gr.Radio(
+        choices=["base_model", "iter_model"],
+        value="iter_model",
+        label="Select Model Type"
+    )
     with gr.Row():
 
         enroll_audio1 = gr.Audio(label="Upload your first enroll audio", type="filepath")

@@ -101,7 +118,7 @@
     convert_button = gr.Button("Extract")
     convert_button.click(
         fn=gradio_TSE,
-        inputs=[input_audio, enroll_audio1, enroll_audio2, audio_type],
+        inputs=[input_audio, enroll_audio1, enroll_audio2, audio_type, model_select],
        outputs=[noisy_audio_output, extracted_audio_output1, extracted_audio_output2]
     )
 
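Note on the app.py hunks above: MODEL_CACHE constructs both InferencePipeline objects once at startup, so switching models in the UI is a dict lookup instead of a config reload, at the cost of keeping both checkpoints resident in memory. One loose end: convert_button.click now passes five inputs, but the gradio_TSE signature shown as unchanged context still takes four parameters while its body reads model_select. A minimal sketch of the signature the handler would need (the parameter name is assumed to match the body; this line is not part of the commit):

def gradio_TSE(input_audio_path, enroll_audio_path1, enroll_audio_path2,
               audio_type, model_select):
    # Gradio passes component values positionally, so the Radio value
    # arrives as the fifth argument and selects the cached pipeline.
    inter = MODEL_CACHE[model_select]
    ...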
decode.py
CHANGED
@@ -31,7 +31,7 @@ class NnetComputer(object):
         aux = aux.unsqueeze(0)
         print("raw",raw.shape)
         print("aux",aux.shape)
-        sps
+        sps = self.nnet(raw, aux, aux_len)
         sp_samps = np.squeeze(sps.detach().cpu().numpy())
         return sp_samps
 

@@ -58,7 +58,7 @@ class InferencePipeline:
         return out_wav
 
 if __name__ == "__main__":
-    cfg = OmegaConf.load("config/
+    cfg = OmegaConf.load("config/config.yaml")
     pipeline = InferencePipeline(cfg)
 
     mix_path = "test_output_mixture.wav"
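The first decode.py hunk completes the truncated forward call: the network now also receives the enrollment length, i.e. a three-argument forward(mix, aux, aux_len) in the SpEx style. A shape sketch of that call, with all sizes and the 16 kHz rate assumed rather than taken from the repo:

import torch

raw = torch.randn(1, 16000)              # (1, T) mixture, one second at an assumed 16 kHz
aux = torch.randn(1, 32000)              # (1, T_aux) enrollment utterance
aux_len = torch.tensor([aux.shape[-1]])  # per-utterance length the model consumes
# sps = self.nnet(raw, aux, aux_len)     # the call the diff restores inside NnetComputer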
model/spex_plus.py
CHANGED
@@ -122,8 +122,12 @@ class SpEx_Plus(nn.Module):
         S1 = w1 * m1
         S2 = w2 * m2
         S3 = w3 * m3
+
+        out1 = self.decoder_1d_short(S1)
+        # out2 = self.decoder_1d_middle(S2)[:, :xlen1]
+        # out3 = self.decoder_1d_long(S3)[:, :xlen1]
 
-        return self.decoder_1d_short(S1)
+        return self.decoder_1d_short(S1)
 
 class Extractor(nn.Module):
     def __init__(self,
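In the model/spex_plus.py hunk, out1 already holds the decoded short-filter branch, yet the return statement runs decoder_1d_short on S1 a second time. If the middle and long decoders stay commented out, a small follow-up (not part of this commit) would reuse the tensor:

out1 = self.decoder_1d_short(S1)
# out2 = self.decoder_1d_middle(S2)[:, :xlen1]
# out3 = self.decoder_1d_long(S3)[:, :xlen1]

return out1  # reuse the already-decoded output instead of decoding S1 twice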
model/spex_plus_plus.py
CHANGED
@@ -206,7 +206,7 @@ class SpEx_Plus_Double(nn.Module):
 
         est3 = self.ira(est2, aux, aux_len,xlen1, xlen2, xlen3, w1, w2, w3)
 
-        return est3
+        return est3
 
 class Extractor(nn.Module):
     def __init__(self,