Approximetal committed
Commit fe9a2a5 · verified · 1 Parent(s): ec61e4c

Update gradio_mix.py

Files changed (1): gradio_mix.py (+50 −31)
gradio_mix.py CHANGED
@@ -42,11 +42,27 @@ langid.set_languages(['es','pt','zh','en','de','fr','it', 'ru', 'id', 'vi'])
 
 
 os.environ['CURL_CA_BUNDLE'] = ''
-DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
-TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
+DEMO_PATH = os.getenv("DEMO_PATH", "./pretrained_models/demo")
+TMP_PATH = os.getenv("TMP_PATH", "./pretrained_models/demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# Pick device for the TTS editing model. By default we try CUDA, but fall
+# back to CPU if the CUDA stack is not actually usable (e.g. kernel image
+# mismatch on older GPUs). You can override via the LEMAS_DEVICE env var
+# (e.g. "cpu" or "cuda").
+def _pick_device():
+    forced = os.getenv("LEMAS_DEVICE")
+    if forced:
+        return forced
+    if torch.cuda.is_available():
+        try:
+            torch.zeros(1).to("cuda")
+            return "cuda"
+        except Exception as e:
+            logging.warning("CUDA appears available but failed (%s); falling back to CPU.", e)
+    return "cpu"
+
+device = _pick_device()
 ASR_DEVICE = "cpu" # force whisperx/pyannote to CPU to avoid cuDNN issues
 whisper_model, align_model = None, None
 tts_edit_model = None
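Note on the probe above: torch.cuda.is_available() only checks that a driver and device are visible; the tiny torch.zeros(1).to("cuda") allocation forces a real kernel launch, which is what actually fails on an arch mismatch. A minimal standalone sketch of the same idea (not part of the commit; assumes only torch and logging):

    import logging
    import torch

    def cuda_actually_works() -> bool:
        """Return True only if a CUDA kernel can really run, not just enumerate."""
        if not torch.cuda.is_available():
            return False
        try:
            torch.zeros(1, device="cuda")  # forces allocation + kernel launch
            return True
        except RuntimeError as e:  # e.g. "no kernel image is available"
            logging.warning("CUDA present but unusable: %s", e)
            return False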
@@ -75,14 +91,18 @@ class UVR5:
     """Small wrapper around the bundled uvr5 implementation for denoising."""
 
     def __init__(self, model_dir):
+        # Code directory is always the local `uvr5` folder in this repo
         code_dir = os.path.join(os.path.dirname(__file__), "uvr5")
         self.model = self.load_model(model_dir, code_dir)
 
     def load_model(self, model_dir, code_dir):
-        import sys, json
+        import sys, json, os
         if code_dir not in sys.path:
             sys.path.append(code_dir)
         from multiprocess_cuda_infer import ModelData, Inference
+        # In the minimal LEMAS-TTS layout, the UVR5 weights and configs live
+        # directly under <MODELS_PATH>/uvr5 (see load_models below);
+        # `model_dir` points to that directory.
         model_path = os.path.join(model_dir, "Kim_Vocal_1.onnx")
         config_path = os.path.join(model_dir, "MDX-Net-Kim-Vocal1.json")
         with open(config_path, "r", encoding="utf-8") as f:
@@ -93,6 +113,9 @@ class UVR5:
             result_path = model_dir,
             device = 'cpu',
             process_method = "MDX-Net",
+            # Keep base_dir and model_dir the same so all UVR5 metadata
+            # (model_data.json, model_name_mapper.json, etc.) is resolved
+            # under `pretrained_models/uvr5`, matching LEMAS-TTS inference.
             base_dir=model_dir,
             **configs
         )
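Since Kim_Vocal_1.onnx and MDX-Net-Kim-Vocal1.json (plus the metadata JSONs this commit names later) are all expected side by side in `model_dir`, a fail-fast check keeps a missing file from surfacing as an opaque ONNX error. A hedged sketch, not part of the commit:

    from pathlib import Path

    REQUIRED_UVR5_FILES = [
        "Kim_Vocal_1.onnx",
        "MDX-Net-Kim-Vocal1.json",
        "model_data.json",
        "model_name_mapper.json",
    ]

    def check_uvr5_layout(model_dir: str) -> None:
        """Raise early if any expected UVR5 asset is missing from model_dir."""
        root = Path(model_dir)
        missing = [f for f in REQUIRED_UVR5_FILES if not (root / f).is_file()]
        if missing:
            raise FileNotFoundError(f"UVR5 assets missing under {root}: {missing}")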
@@ -332,7 +355,10 @@ class MMSAlignModel:
     def __init__(self):
         from torchaudio.pipelines import MMS_FA as bundle
         self.mms_model = bundle.get_model()
-        self.mms_model.to(device)
+        # MMS forced alignment is relatively light; keep it on CPU to avoid
+        # CUDA kernel / arch mismatches on environments where the main TTS
+        # model still uses GPU.
+        self.mms_model.to("cpu")
         self.mms_tokenizer = bundle.get_tokenizer()
         self.mms_aligner = bundle.get_aligner()
         self.text_normalizer = ur.Uroman()
@@ -354,7 +380,7 @@
 
     def compute_alignments(self, waveform: torch.Tensor, tokens):
         with torch.inference_mode():
-            emission, _ = self.mms_model(waveform.to(device))
+            emission, _ = self.mms_model(waveform.to("cpu"))
             token_spans = self.mms_aligner(emission[0], tokens)
         return emission, token_spans
 
@@ -373,7 +399,7 @@
         assert len(text_normed) == len(raw_text), f"normalized text len != raw text len: {len(text_normed)} != {len(raw_text)}"
         tokens = self.mms_tokenizer(text_normed)
         with torch.inference_mode():
-            emission, _ = self.mms_model(waveform.to(device))
+            emission, _ = self.mms_model(waveform.to("cpu"))
             token_spans = self.mms_aligner(emission[0], tokens)
         num_frames = emission.size(1)
         ratio = waveform.size(1) / num_frames
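The `ratio = waveform.size(1) / num_frames` line maps emission frames back to audio samples; dividing by the sample rate then gives seconds. A sketch of that conversion, assuming `token_spans` is the list-of-TokenSpan-lists returned by torchaudio's MMS_FA aligner (each span carries `start`/`end` frame indices) and that MMS_FA operates on 16 kHz audio:

    def spans_to_seconds(token_spans, ratio, sample_rate=16000):
        """Map aligner frame spans to (start_sec, end_sec) per token group."""
        times = []
        for spans in token_spans:  # one list of TokenSpan per word
            start = spans[0].start * ratio / sample_rate
            end = spans[-1].end * ratio / sample_rate
            times.append((start, end))
        return times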
@@ -391,18 +417,10 @@
 class WhisperxModel:
     def __init__(self, model_name):
         from whisperx import load_model
-        from pathlib import Path
         prompt = None # "This might be a blend of Simplified Chinese and English speech, do not translate, only transcription be allowed."
 
-        # Prefer a local VAD model (to avoid network download / 301 issues)
-        vad_fp = Path(MODELS_PATH) / "whisperx-vad-segmentation.bin"
-        if not vad_fp.is_file():
-            logging.warning(
-                "Local whisperx VAD not found at %s, falling back to default download path.",
-                vad_fp,
-            )
-            vad_fp = None
-
+        # Use the lighter Silero VAD backend to avoid pyannote checkpoints
+        # and their PyTorch 2.6 `weights_only` pickling issues.
         self.model = load_model(
             model_name,
             ASR_DEVICE,
@@ -417,7 +435,7 @@
                 "multilingual": True,
                 "hotwords": None
             },
-            vad_model_fp=str(vad_fp) if vad_fp is not None else None,
+            vad_method="silero",
         )
 
     def transcribe(self, audio_info, lang=None):
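vad_method="silero" is accepted by whisperx.load_model in recent whisperx releases and swaps the pyannote segmentation checkpoint for Silero VAD. A usage sketch under that assumption (the clip path is just an example, matching the demo default set elsewhere in this commit):

    import whisperx

    audio = whisperx.load_audio("pretrained_models/demo/test.wav")  # example clip
    model = whisperx.load_model(
        "large-v2", "cpu",
        compute_type="int8",   # CPU-friendly quantization
        vad_method="silero",   # skip pyannote checkpoint loading
    )
    result = model.transcribe(audio, batch_size=4)
    print(result["segments"])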
@@ -515,17 +533,20 @@ def get_audio_slice(audio, words_info, start_time, end_time, max_len=10, sr=1600
 def load_models(lemas_model_name, whisper_model_name, alignment_model_name, denoise_model_name): # , audiosr_name):
 
     global transcribe_model, align_model, denoise_model, text_norm, tts_edit_model
-    # if voicecraft_model:
-    #     del denoise_model
-    #     del transcribe_model
-    #     del align_model
-    #     del voicecraft_model
-    #     del audiosr
     torch.cuda.empty_cache()
     gc.collect()
 
     if denoise_model_name == "UVR5":
-        denoise_model = UVR5(os.path.join(str(PRETRAINED_ROOT), "uvr5"))
+        # Simple layout: UVR5 assets live directly under:
+        #   <MODELS_PATH>/uvr5
+        # with files:
+        #   Kim_Vocal_1.onnx
+        #   MDX-Net-Kim-Vocal1.json
+        #   model_data.json
+        #   model_name_mapper.json
+        from pathlib import Path
+        uv_root = Path(MODELS_PATH) / "uvr5"
+        denoise_model = UVR5(str(uv_root))
     elif denoise_model_name == "DeepFilterNet":
         denoise_model = DeepFilterNet("./pretrained_models/denoiser_model.onnx")
 
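Because load_models hard-selects the denoiser by name, a missing <MODELS_PATH>/uvr5 directory aborts startup. A hedged sketch of a softer fallback (not in the commit; it reuses the module's UVR5 and DeepFilterNet classes and the file list above):

    import logging
    from pathlib import Path

    def pick_denoiser(name: str, models_path: str):
        """Prefer UVR5 when its assets exist, else fall back to DeepFilterNet."""
        if name == "UVR5":
            root = Path(models_path) / "uvr5"
            if (root / "Kim_Vocal_1.onnx").is_file():
                return UVR5(str(root))
            logging.warning("UVR5 assets missing under %s; using DeepFilterNet.", root)
        return DeepFilterNet("./pretrained_models/denoiser_model.onnx")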
@@ -943,8 +964,7 @@ def get_app():
     # InvalidPathError with local filesystem paths.
     _demo_value = None
     demo_candidates = [
-        os.path.join(DEMO_PATH, "V-00013_en-US.wav"),
-        os.path.join(os.path.dirname(__file__), "..", "VoiceCraft", "demo", "V-00013_en-US.wav"),
+        os.path.join(DEMO_PATH, "test.wav"),
     ]
     for demo_path in demo_candidates:
         try:
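The try body falls outside this hunk, but the surrounding comment about InvalidPathError with local filesystem paths suggests it validates each candidate before handing it to Gradio. One plausible shape, purely an assumption about the unshown code:

    for demo_path in demo_candidates:
        try:
            if os.path.isfile(demo_path):
                _demo_value = os.path.abspath(demo_path)
                break
        except OSError:
            continue  # unreadable path: try the next candidate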
@@ -1174,11 +1194,10 @@ def get_app():
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(description="VoiceCraft gradio app.")
+    parser = argparse.ArgumentParser(description="LEMAS-Edit gradio app.")
 
-    parser.add_argument("--demo-path", default="./demo", help="Path to demo directory")
-    parser.add_argument("--tmp-path", default="/cto_labs/vistring/zhaozhiyuan/outputs/voicecraft/tmp", help="Path to tmp directory")
-    parser.add_argument("--models-path", default="/cto_labs/vistring/zhaozhiyuan/outputs/voicecraft/pretrain/VoiceCraft", help="Path to voicecraft models directory")
+    parser.add_argument("--demo-path", default="./pretrained_models/demo", help="Path to demo directory")
+    parser.add_argument("--tmp-path", default="./pretrained_models/tmp", help="Path to tmp directory")
     parser.add_argument("--port", default=41020, type=int, help="App port")
     parser.add_argument("--share", action="store_true", help="Launch with public url")
     parser.add_argument("--server_name", default="0.0.0.0", type=str, help="Server name for launching the app. 127.0.0.1 for localhost; 0.0.0.0 to allow access from other machines in the local network. Might also give access to external users depending on the firewall settings.")
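With --models-path removed in this commit, the models directory is now controlled solely by the MODELS_PATH environment variable (default ./pretrained_models, per the hunk at the top). A typical launch under the new defaults:

    MODELS_PATH=./pretrained_models python gradio_mix.py --demo-path ./pretrained_models/demo --port 41020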
 