CLAPSep

Runtime error

App Files Community

AisakaMikoto committed on Feb 29, 2024

Commit

33171a6

verified ·

1 Parent(s): f5e7d93

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -5

app.py CHANGED Viewed

@@ -25,7 +25,20 @@ model.eval()
-def inference(audio_file_path: str, text_p: str, text_n: str):
     print(f"Separate audio from [{audio_file_path}] with textual query p: [{text_p}] and n: [{text_n}]")
     mixture, _ = librosa.load(audio_file_path, sr=32000)
@@ -38,7 +51,7 @@ def inference(audio_file_path: str, text_p: str, text_n: str):
     sep_segments = []
     for chunk in mixture_chunks:
         with torch.no_grad():
-            sep_segments.append(model.inference_from_data(chunk.unsqueeze(0), [text_p], [text_n]))
     sep_segment = torch.concat(sep_segments, dim=1)
@@ -49,8 +62,10 @@ with gr.Blocks(title="CLAPSep") as demo:
     with gr.Row():
         with gr.Column():
             input_audio = gr.Audio(label="Mixture", type="filepath")
-            text_p = gr.Textbox(label="Positive Query")
-            text_n = gr.Textbox(label="Negative Query")
         with gr.Column():
             with gr.Column():
                 output_audio = gr.Audio(label="Separation Result", scale=10)
@@ -62,7 +77,7 @@ with gr.Blocks(title="CLAPSep") as demo:
                     interactive=True,
                 )
                 button.click(
-                    fn=inference, inputs=[input_audio, text_p, text_n], outputs=[output_audio]
                 )

+def inference(audio_file_path: str, text_p: str, audio_file_path_p: str, text_n: str, audio_file_path_n: str):
+    # handling queries
+    with torch.no_grad():
+        embed_pos, embed_neg = torch.chunk(model.clap_model.get_text_embedding([text_p, text_n],
+                                                                              use_tensor=True), dim=0, chunks=2)
+        embed_pos = torch.zeros_like(embed_pos) if text_p == '' else embed_pos
+        embed_neg = torch.zeros_like(embed_neg) if text_n == '' else embed_neg
+        embed_pos += (model.clap_model.get_audio_embedding_from_filelist(
+            [audio_file_path_p]) if audio_file_path_p is not None else torch.zeros_like(embed_pos))
+        embed_neg += (model.clap_model.get_audio_embedding_from_filelist(
+            [audio_file_path_n]) if audio_file_path_n is not None else torch.zeros_like(embed_neg))
     print(f"Separate audio from [{audio_file_path}] with textual query p: [{text_p}] and n: [{text_n}]")
     mixture, _ = librosa.load(audio_file_path, sr=32000)
     sep_segments = []
     for chunk in mixture_chunks:
         with torch.no_grad():
+            sep_segments.append(model.inference_from_data(chunk.unsqueeze(0), embed_pos, embed_neg))
     sep_segment = torch.concat(sep_segments, dim=1)
     with gr.Row():
         with gr.Column():
             input_audio = gr.Audio(label="Mixture", type="filepath")
+            text_p = gr.Textbox(label="Positive Query Text")
+            text_n = gr.Textbox(label="Negative Query Text")
+            query_audio_p = gr.Audio(label="Positive Query Audio (optional)", type="filepath")
+            query_audio_n = gr.Audio(label="Negative Query Audio (optional)", type="filepath")
         with gr.Column():
             with gr.Column():
                 output_audio = gr.Audio(label="Separation Result", scale=10)
                     interactive=True,
                 )
                 button.click(
+                    fn=inference, inputs=[input_audio, text_p, query_audio_p, text_n, query_audio_n], outputs=[output_audio]
                 )