Update app.py
Browse files
app.py
CHANGED
|
@@ -25,7 +25,20 @@ model.eval()
|
|
| 25 |
|
| 26 |
|
| 27 |
|
| 28 |
-
def inference(audio_file_path: str, text_p: str, text_n: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
print(f"Separate audio from [{audio_file_path}] with textual query p: [{text_p}] and n: [{text_n}]")
|
| 30 |
|
| 31 |
mixture, _ = librosa.load(audio_file_path, sr=32000)
|
|
@@ -38,7 +51,7 @@ def inference(audio_file_path: str, text_p: str, text_n: str):
|
|
| 38 |
sep_segments = []
|
| 39 |
for chunk in mixture_chunks:
|
| 40 |
with torch.no_grad():
|
| 41 |
-
sep_segments.append(model.inference_from_data(chunk.unsqueeze(0),
|
| 42 |
|
| 43 |
sep_segment = torch.concat(sep_segments, dim=1)
|
| 44 |
|
|
@@ -49,8 +62,10 @@ with gr.Blocks(title="CLAPSep") as demo:
|
|
| 49 |
with gr.Row():
|
| 50 |
with gr.Column():
|
| 51 |
input_audio = gr.Audio(label="Mixture", type="filepath")
|
| 52 |
-
text_p = gr.Textbox(label="Positive Query")
|
| 53 |
-
text_n = gr.Textbox(label="Negative Query")
|
|
|
|
|
|
|
| 54 |
with gr.Column():
|
| 55 |
with gr.Column():
|
| 56 |
output_audio = gr.Audio(label="Separation Result", scale=10)
|
|
@@ -62,7 +77,7 @@ with gr.Blocks(title="CLAPSep") as demo:
|
|
| 62 |
interactive=True,
|
| 63 |
)
|
| 64 |
button.click(
|
| 65 |
-
fn=inference, inputs=[input_audio, text_p, text_n], outputs=[output_audio]
|
| 66 |
)
|
| 67 |
|
| 68 |
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
|
| 28 |
+
def inference(audio_file_path: str, text_p: str, audio_file_path_p: str, text_n: str, audio_file_path_n: str):
|
| 29 |
+
# handling queries
|
| 30 |
+
with torch.no_grad():
|
| 31 |
+
embed_pos, embed_neg = torch.chunk(model.clap_model.get_text_embedding([text_p, text_n],
|
| 32 |
+
use_tensor=True), dim=0, chunks=2)
|
| 33 |
+
embed_pos = torch.zeros_like(embed_pos) if text_p == '' else embed_pos
|
| 34 |
+
embed_neg = torch.zeros_like(embed_neg) if text_n == '' else embed_neg
|
| 35 |
+
embed_pos += (model.clap_model.get_audio_embedding_from_filelist(
|
| 36 |
+
[audio_file_path_p]) if audio_file_path_p is not None else torch.zeros_like(embed_pos))
|
| 37 |
+
embed_neg += (model.clap_model.get_audio_embedding_from_filelist(
|
| 38 |
+
[audio_file_path_n]) if audio_file_path_n is not None else torch.zeros_like(embed_neg))
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
print(f"Separate audio from [{audio_file_path}] with textual query p: [{text_p}] and n: [{text_n}]")
|
| 43 |
|
| 44 |
mixture, _ = librosa.load(audio_file_path, sr=32000)
|
|
|
|
| 51 |
sep_segments = []
|
| 52 |
for chunk in mixture_chunks:
|
| 53 |
with torch.no_grad():
|
| 54 |
+
sep_segments.append(model.inference_from_data(chunk.unsqueeze(0), embed_pos, embed_neg))
|
| 55 |
|
| 56 |
sep_segment = torch.concat(sep_segments, dim=1)
|
| 57 |
|
|
|
|
| 62 |
with gr.Row():
|
| 63 |
with gr.Column():
|
| 64 |
input_audio = gr.Audio(label="Mixture", type="filepath")
|
| 65 |
+
text_p = gr.Textbox(label="Positive Query Text")
|
| 66 |
+
text_n = gr.Textbox(label="Negative Query Text")
|
| 67 |
+
query_audio_p = gr.Audio(label="Positive Query Audio (optional)", type="filepath")
|
| 68 |
+
query_audio_n = gr.Audio(label="Negative Query Audio (optional)", type="filepath")
|
| 69 |
with gr.Column():
|
| 70 |
with gr.Column():
|
| 71 |
output_audio = gr.Audio(label="Separation Result", scale=10)
|
|
|
|
| 77 |
interactive=True,
|
| 78 |
)
|
| 79 |
button.click(
|
| 80 |
+
fn=inference, inputs=[input_audio, text_p, query_audio_p, text_n, query_audio_n], outputs=[output_audio]
|
| 81 |
)
|
| 82 |
|
| 83 |
|