swc2 committed on
Commit
d72375d
·
1 Parent(s): 679330d

app revise test

Browse files
Files changed (1) hide show
  1. app.py +26 -85
app.py CHANGED
@@ -1,87 +1,28 @@
1
  import gradio as gr
2
- import torch as th
3
- import numpy as np
4
- from nnet.spex_plus import SpEx_Plus
5
- from utils.logger import get_logger
6
- from utils.audio import WaveReader, write_wav
7
-
8
- logger = get_logger(__name__)
9
-
10
class NnetComputer(object):
    """Run a trained SpEx+ network for target-speaker extraction.

    Loads a checkpoint from a directory and exposes :meth:`compute`, which
    extracts the target speaker from a mixture waveform given an auxiliary
    (reference) utterance of that speaker.
    """

    def __init__(self, cpt_dir, gpuid, nnet_conf, cpt_name="59.pt.tar"):
        """
        Args:
            cpt_dir: Directory containing the checkpoint file.
            gpuid: CUDA device index; a negative value selects the CPU.
            nnet_conf: Keyword configuration forwarded to ``SpEx_Plus``.
            cpt_name: Checkpoint filename inside ``cpt_dir``. Previously
                hard-coded to ``"59.pt.tar"``; now a backward-compatible
                parameter with the same default.
        """
        self.device = th.device("cuda:{}".format(gpuid)) if gpuid >= 0 else th.device("cpu")
        nnet = self._load_nnet(cpt_dir, nnet_conf, cpt_name)
        self.nnet = nnet.to(self.device) if gpuid >= 0 else nnet
        # Inference only: put the network in eval mode (disables dropout /
        # batch-norm statistics updates).
        self.nnet.eval()

    def _load_nnet(self, cpt_dir, nnet_conf, cpt_name):
        """Build the network and restore its weights from the checkpoint."""
        import os  # local import: `os` is not imported at module level

        nnet = SpEx_Plus(**nnet_conf)
        cpt_fname = os.path.join(cpt_dir, cpt_name)
        cpt = th.load(cpt_fname, map_location="cpu")
        nnet.load_state_dict(cpt["model_state_dict"])
        logger.info("Load checkpoint from {}, epoch {:d}".format(
            cpt_fname, cpt["epoch"]))
        return nnet

    def compute(self, samps, aux_samps, aux_samps_len):
        """Extract the target speaker from a mixture.

        Args:
            samps: Mixture waveform samples (1-D float sequence).
            aux_samps: Reference-utterance samples of the target speaker.
            aux_samps_len: Length of the reference utterance in samples.

        Returns:
            ``numpy.ndarray`` containing the first (short-window) extracted
            waveform estimate.
        """
        with th.no_grad():
            raw = th.tensor(samps, dtype=th.float32, device=self.device)
            aux = th.tensor(aux_samps, dtype=th.float32, device=self.device)
            # NOTE(review): the length is encoded as float32 here; if the
            # network expects an integer length tensor, confirm upstream.
            aux_len = th.tensor(aux_samps_len, dtype=th.float32, device=self.device)
            # Add a batch dimension to the reference utterance.
            aux = aux.unsqueeze(0)
            # The network returns three multi-scale estimates plus a speaker
            # prediction; only the first estimate is used downstream.
            sps, sps2, sps3, spk_pred = self.nnet(raw, aux, aux_len)
            return np.squeeze(sps.detach().cpu().numpy())
36
-
37
def compute_output(input_audio, use_gpu, checkpoint, output_dir):
    """Run speaker extraction over all utterances and write WAV outputs.

    Args:
        input_audio: Gradio audio input (currently unused; the mix/aux
            dictionaries below are placeholders to be wired to it).
        use_gpu: If True, run on CUDA device 0; otherwise run on the CPU.
        checkpoint: Directory holding the trained SpEx+ checkpoint.
        output_dir: Directory where the separated WAV files are written.
    """
    import os  # local import: `os` is not imported at module level

    # BUG FIX: the original referenced an undefined `args.sample_rate`.
    # 16 kHz matches the frame sizes computed in nnet_conf below.
    sample_rate = 16000

    # Prepare mix_input and aux_input based on the input_audio
    mix_input = {}  # TODO: populate key -> mixture samples from input_audio
    aux_input = {}  # TODO: populate key -> reference samples per utterance

    # Set GPU index based on the user's choice
    gpu_index = 0 if use_gpu else -1

    # SpEx+ hyper-parameters (L1/L2/L3 are window sizes in samples @16 kHz).
    nnet_conf = {
        "L1": int(0.0025 * 16000),
        "L2": int(0.01 * 16000),
        "L3": int(0.02 * 16000),
        "N": 256,
        "B": 8,
        "O": 256,
        "P": 512,
        "Q": 3,
        "num_spks": 395,
        "spk_embed_dim": 256,
        "causal": False
    }
    computer = NnetComputer(checkpoint, gpu_index, nnet_conf)
    # BUG FIX: iterating a dict yields keys only; .items() yields the
    # (key, samples) pairs this loop unpacks.
    for key, mix_samps in mix_input.items():
        aux_samps = aux_input[key]
        logger.info("Compute on utterance {}...".format(key))
        samps = computer.compute(mix_samps, aux_samps, len(aux_samps))
        # Peak level of the mixture, used to rescale the separated output.
        norm = np.linalg.norm(mix_samps, np.inf)
        # Trim to the mixture length (assumes mix_samps is a numpy array —
        # `.size` is an ndarray attribute; confirm against the reader).
        samps = samps[:mix_samps.size]
        # Normalize the output to the mixture's peak amplitude.
        samps = samps * norm / np.max(np.abs(samps))
        # Write output to the specified directory.
        write_wav(os.path.join(output_dir, "{}.wav".format(key)), samps,
                  sample_rate=sample_rate)
    logger.info("Compute over {:d} utterances".format(len(mix_input)))
71
-
72
# Define the Gradio interface.
inputs = [
    # BUG FIX: Gradio components take `label`, not `name`, and the text
    # component is `gr.Textbox` — `gr.TextInput` does not exist.
    gr.Audio(label="Input Audio"),
    gr.Checkbox(label="Use GPU"),
    gr.Textbox(label="Checkpoint Directory"),
    gr.Textbox(label="Output Directory"),
]
output = gr.Interface(
    fn=compute_output,
    inputs=inputs,
    outputs=None,
    title="Audio Processing with Neural Network",
    description="Process audio input using a neural network model.",
    theme="compact"
)
output.launch()
 
 
 
1
  import gradio as gr
2
+ from inference import InferencePipeline
3
+
4
# Module-level pipeline instance: the model is loaded once at import time
# so every request reuses the same loaded weights.
i = InferencePipeline()

def gradio_voice_conversion(audio_file_path):
    """Run voice conversion on a recorded or uploaded audio file.

    Args:
        audio_file_path: Path to the input audio file. Because the interface
            declares ``gr.Audio(type="filepath")``, Gradio passes the temp
            file path directly — NOT the ``(path, sample_rate)`` tuple used
            by other audio input types (the original docstring claimed the
            tuple form, which contradicted the actual input configuration).

    Returns:
        The result of ``InferencePipeline.voice_conversion`` — presumably a
        file path or waveform Gradio can render; confirm against the
        pipeline implementation.
    """
    print(f"Here is the audio_file_path: {audio_file_path}")
    return i.voice_conversion(audio_file_path)
16
+
17
# Gradio interface definition: a single audio-in / audio-out demo that
# routes the uploaded file through the voice-conversion wrapper.
_voice_input = gr.Audio(label="Record or upload your voice", type="filepath")
_voice_output = gr.Audio(label="Converted Voice")

demo = gr.Interface(
    fn=gradio_voice_conversion,
    inputs=_voice_input,
    outputs=_voice_output,
    title="Voice Conversion Demo",
    description="Voice Conversion: Transform the input voice to a target voice.",
    allow_flagging="never",
)

if __name__ == "__main__":
    # Launch the web app only when run as a script, not when imported.
    demo.launch()