Approximetal committed
Commit 0c63007 · verified · 1 Parent(s): be1395e

Update inference_gradio.py

Files changed (1)
  1. inference_gradio.py +43 -18
inference_gradio.py CHANGED
@@ -75,14 +75,18 @@ class UVR5:
     """Small wrapper around the bundled uvr5 implementation for denoising."""
 
     def __init__(self, model_dir):
+        # Code directory is always the local `uvr5` folder in this repo
         code_dir = os.path.join(os.path.dirname(__file__), "uvr5")
         self.model = self.load_model(model_dir, code_dir)
 
     def load_model(self, model_dir, code_dir):
-        import sys, json
+        import sys, json, os
         if code_dir not in sys.path:
             sys.path.append(code_dir)
         from multiprocess_cuda_infer import ModelData, Inference
+        # In the minimal LEMAS-TTS layout, UVR5 weights live under:
+        #   <pretrained_models>/uvr5/models/MDX_Net_Models/model_data/
+        # Here `model_dir` points to that `model_data` directory.
         model_path = os.path.join(model_dir, "Kim_Vocal_1.onnx")
         config_path = os.path.join(model_dir, "MDX-Net-Kim-Vocal1.json")
         with open(config_path, "r", encoding="utf-8") as f:
@@ -93,7 +97,7 @@ class UVR5:
             result_path = model_dir,
             device = 'cpu',
             process_method = "MDX-Net",
-            base_dir=model_dir,
+            base_dir=code_dir,
             **configs
         )
 
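For context (illustrative only, not in this commit): given the layout spelled out in the new comments, a minimal sanity check before constructing the wrapper could look like the sketch below. The root assumes the default MODELS_PATH of ./pretrained_models; the file names are taken from this diff.

    import os

    # Assumed default root; MODELS_PATH may be configured differently at runtime.
    model_data_dir = os.path.join(
        "./pretrained_models", "uvr5", "models", "MDX_Net_Models", "model_data"
    )
    for fname in ("Kim_Vocal_1.onnx", "MDX-Net-Kim-Vocal1.json"):
        if not os.path.isfile(os.path.join(model_data_dir, fname)):
            raise FileNotFoundError(os.path.join(model_data_dir, fname))
    denoiser = UVR5(model_data_dir)  # UVR5 is the wrapper class shown above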
 
@@ -390,11 +394,12 @@ class MMSAlignModel:
 
 class WhisperxModel:
     def __init__(self, model_name):
-        from whisperx import load_model
         from pathlib import Path
+        import whisperx.vad as wx_vad
+        from whisperx import load_model
         prompt = None # "This might be a blend of Simplified Chinese and English speech, do not translate, only transcription be allowed."
 
-        # Prefer a local VAD model (to avoid network download / 301 issues)
+        # Prefer a local VAD model (to avoid network download / checksum issues)
         vad_fp = Path(MODELS_PATH) / "whisperx-vad-segmentation.bin"
         if not vad_fp.is_file():
             logging.warning(
@@ -402,6 +407,30 @@ class WhisperxModel:
                 vad_fp,
             )
             vad_fp = None
+        else:
+            # Monkey-patch whisperx.vad.load_vad_model so it loads our local
+            # segmentation model without enforcing the baked-in SHA256 check.
+            def _patched_load_vad_model(device, vad_onset=0.500, vad_offset=0.363, use_auth_token=None, model_fp=None):
+                import torch
+                from pyannote.audio import Model
+                from pyannote.audio.pipelines import VoiceActivitySegmentation
+
+                model_path = str(model_fp) if model_fp is not None else str(vad_fp)
+                model = Model.from_pretrained(model_path, use_auth_token=use_auth_token)
+                hyperparameters = {
+                    "onset": vad_onset,
+                    "offset": vad_offset,
+                    "min_duration_on": 0.1,
+                    "min_duration_off": 0.1,
+                }
+                vad_pipeline = VoiceActivitySegmentation(
+                    segmentation=model,
+                    device=torch.device(device),
+                )
+                vad_pipeline.instantiate(hyperparameters)
+                return vad_pipeline
+
+            wx_vad.load_vad_model = _patched_load_vad_model
 
         self.model = load_model(
             model_name,
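For context (illustrative only, not in this commit): the patch only takes effect once WhisperxModel(...) has been constructed, since __init__ is what assigns _patched_load_vad_model onto whisperx.vad. Assuming pyannote.audio is installed and the checkpoint sits under the default ./pretrained_models, the patched loader can then be exercised directly:

    import torch
    import whisperx.vad as wx_vad  # already patched by WhisperxModel.__init__

    vad = wx_vad.load_vad_model(
        "cuda" if torch.cuda.is_available() else "cpu",
        model_fp="./pretrained_models/whisperx-vad-segmentation.bin",
    )
    # Returns an instantiated pyannote VoiceActivitySegmentation pipeline,
    # bypassing whisperx's usual download-and-checksum path.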
@@ -515,21 +544,17 @@ def get_audio_slice(audio, words_info, start_time, end_time, max_len=10, sr=1600
 def load_models(lemas_model_name, whisper_model_name, alignment_model_name, denoise_model_name): # , audiosr_name):
 
     global transcribe_model, align_model, denoise_model, text_norm, tts_edit_model
-    # if voicecraft_model:
-    #     del denoise_model
-    #     del transcribe_model
-    #     del align_model
-    #     del voicecraft_model
-    #     del audiosr
     torch.cuda.empty_cache()
     gc.collect()
 
     if denoise_model_name == "UVR5":
-        # Prefer the generic MODELS_PATH root for denoiser assets so that
-        # HF Spaces (where pretrained models are often mounted separately)
-        # and local runs share the same layout.
-        denoise_root = MODELS_PATH # e.g. "./pretrained_models" or env override
-        denoise_model = UVR5(os.path.join(denoise_root, "uvr5"))
+        # Follow LEMAS-TTS layout but resolve from MODELS_PATH (./pretrained_models by default),
+        # so that only the main TTS checkpoints can live in hf:// mounts while all
+        # auxiliary models (UVR5, vocoder, prosody encoder, etc.) are loaded from
+        # the local `pretrained_models` folder.
+        from pathlib import Path
+        uv_root = Path(MODELS_PATH) / "uvr5" / "models" / "MDX_Net_Models" / "model_data"
+        denoise_model = UVR5(str(uv_root))
     elif denoise_model_name == "DeepFilterNet":
         denoise_model = DeepFilterNet("./pretrained_models/denoiser_model.onnx")
 
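Putting the pieces above together, the local layout this commit appears to expect under MODELS_PATH (./pretrained_models by default) is roughly:

    pretrained_models/
    ├── whisperx-vad-segmentation.bin
    ├── denoiser_model.onnx
    └── uvr5/
        └── models/
            └── MDX_Net_Models/
                └── model_data/
                    ├── Kim_Vocal_1.onnx
                    └── MDX-Net-Kim-Vocal1.json

The demo and tmp directories referenced by the argparse defaults in the last hunk sit alongside these.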
 
@@ -1177,10 +1202,10 @@ def get_app():
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(description="VoiceCraft gradio app.")
+    parser = argparse.ArgumentParser(description="LEMAS-Edit gradio app.")
 
-    parser.add_argument("--demo-path", default="./demo", help="Path to demo directory")
-    parser.add_argument("--tmp-path", default="./pretrained_models/demo/tmp", help="Path to tmp directory")
+    parser.add_argument("--demo-path", default="./pretrained_models/demo", help="Path to demo directory")
+    parser.add_argument("--tmp-path", default="./pretrained_models/tmp", help="Path to tmp directory")
     parser.add_argument("--port", default=41020, type=int, help="App port")
     parser.add_argument("--share", action="store_true", help="Launch with public url")
     parser.add_argument("--server_name", default="0.0.0.0", type=str, help="Server name for launching the app. 127.0.0.1 for localhost; 0.0.0.0 to allow access from other machines in the local network. Might also give access to external users depends on the firewall settings.")
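With these defaults, running the script without path arguments now resolves demo assets from ./pretrained_models/demo and writes temporary files to ./pretrained_models/tmp instead of the old ./demo and ./pretrained_models/demo/tmp locations; both can still be overridden via --demo-path and --tmp-path.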