music-source-separation-com

Running

App Files Community

csukuangfj committed on Aug 24, 2023

Commit

48092b3

1 Parent(s): cbd589e

small fixes

Browse files

Files changed (1) hide show

app.py +23 -70

app.py CHANGED Viewed

@@ -21,6 +21,7 @@
 import logging
 import os
 import tempfile
 import time
 from datetime import datetime
@@ -90,7 +91,7 @@ def process_microphone(in_filename: str):
 def process(in_filename: str):
     logging.info(f"in_filename: {in_filename}")
-    waveform = load_audio(waveform)
     duration = waveform.shape[0] / 44100  # in seconds
     vocals = load_model("vocals.pt")
@@ -107,49 +108,40 @@ def process(in_filename: str):
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
-    metadata = torchaudio.info(filename)
-    duration = metadata.num_frames / sample_rate
     rtf = (end - start) / duration
     logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
     info = f"""
-    Wave duration  : {duration: .3f} s <br/>
     Processing time: {end - start: .3f} s <br/>
     RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f} <br/>
     """
-    if rtf > 1:
-        info += (
-            "<br/>We are loading the model for the first run. "
-            "Please run again to measure the real RTF.<br/>"
-        )
     logging.info(info)
     logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")
-    return text, build_html_output(info)
-title = "# Automatic Speech Recognition with Next-gen Kaldi"
-description = """
-This space shows how to do automatic speech recognition with Next-gen Kaldi.
-Please visit
-<https://huggingface.co/spaces/k2-fsa/streaming-automatic-speech-recognition>
-for streaming speech recognition with **Next-gen Kaldi**.
-It is running on CPU within a docker container provided by Hugging Face.
-See more information by visiting the following links:
-- <https://github.com/k2-fsa/icefall>
-- <https://github.com/k2-fsa/sherpa>
-- <https://github.com/k2-fsa/k2>
-- <https://github.com/lhotse-speech/lhotse>
-If you want to deploy it locally, please see
-<https://k2-fsa.github.io/sherpa/>
-"""
 # css style is copied from
 # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
@@ -161,50 +153,11 @@ css = """
 """
-def update_model_dropdown(language: str):
-    if language in language_to_models:
-        choices = language_to_models[language]
-        return gr.Dropdown.update(choices=choices, value=choices[0])
-    raise ValueError(f"Unsupported language: {language}")
 demo = gr.Blocks(css=css)
 with demo:
     gr.Markdown(title)
-    language_choices = list(language_to_models.keys())
-    language_radio = gr.Radio(
-        label="Language",
-        choices=language_choices,
-        value=language_choices[0],
-    )
-    model_dropdown = gr.Dropdown(
-        choices=language_to_models[language_choices[0]],
-        label="Select a model",
-        value=language_to_models[language_choices[0]][0],
-    )
-    language_radio.change(
-        update_model_dropdown,
-        inputs=language_radio,
-        outputs=model_dropdown,
-    )
-    decoding_method_radio = gr.Radio(
-        label="Decoding method",
-        choices=["greedy_search", "modified_beam_search"],
-        value="greedy_search",
-    )
-    num_active_paths_slider = gr.Slider(
-        minimum=1,
-        value=4,
-        step=1,
-        label="Number of active paths for modified_beam_search",
-    )
     with gr.Tabs():
         with gr.TabItem("Upload from disk"):

 import logging
 import os
+from pydub import AudioSegment
 import tempfile
 import time
 from datetime import datetime
 def process(in_filename: str):
     logging.info(f"in_filename: {in_filename}")
+    waveform = load_audio(in_filename)
     duration = waveform.shape[0] / 44100  # in seconds
     vocals = load_model("vocals.pt")
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
+    vocals_wave = (vocals_wave.t() * 32768).to(torch.int16)
+    accompaniment_wave = (accompaniment_wave.t() * 32768).to(torch.int16)
+    vocals_sound = AudioSegment(
+        data=vocals_wave.numpy().tobytes(), sample_width=2, frame_rate=44100, channels=2
+    )
+    vocals_filename = in_filename + "-vocals.mp3"
+    vocals_sound.export(vocals_filename, format="mp3", bitrate="128k")
+    accompaniment_sound = AudioSegment(
+        data=accompaniment_wave.numpy().tobytes(),
+        sample_width=2,
+        frame_rate=44100,
+        channels=2,
+    )
+    accompaniment_filename = in_filename + "-accompaniment.mp3"
+    accompaniment_sound.export(accompaniment_filename, format="mp3", bitrate="128k")
     rtf = (end - start) / duration
     logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
     info = f"""
+    Input duration  : {duration: .3f} s <br/>
     Processing time: {end - start: .3f} s <br/>
     RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f} <br/>
     """
     logging.info(info)
     logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")
+    return vocals_filename, accompaniment_filename, build_html_output(info)
+title = "# Music source separation with Spleeter in PyTorch"
 # css style is copied from
 # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
 """
 demo = gr.Blocks(css=css)
 with demo:
     gr.Markdown(title)
     with gr.Tabs():
         with gr.TabItem("Upload from disk"):