Spaces:

LEMAS-Project
/

LEMAS-Edit

Running on Zero

App Files Files Community

Approximetal committed 12 days ago

Commit

39c9309

verified ·

1 Parent(s): d56376a

Update gradio_mix.py

Browse files

Files changed (1) hide show

gradio_mix.py +49 -18

gradio_mix.py CHANGED Viewed

@@ -15,6 +15,7 @@ import jieba, zhconv
 from pypinyin.core import Pinyin
 from pypinyin import Style
 from lemas_tts.api import TTS, PRETRAINED_ROOT, CKPTS_ROOT
 from lemas_tts.infer.edit_multilingual import gen_wav_multilingual
 from lemas_tts.infer.text_norm.txt2pinyin import (
@@ -46,21 +47,18 @@ DEMO_PATH = os.getenv("DEMO_PATH", "./pretrained_models/demo")
 TMP_PATH = os.getenv("TMP_PATH", "./pretrained_models/demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
-# Pick device for the TTS editing model. By default we try CUDA, but fall
-# back to CPU if the CUDA stack is not actually usable (e.g. kernel image
-# mismatch on older GPUs). You can override via LEMAS_DEVICE env (e.g. "cpu"
-# or "cuda").
 def _pick_device():
     forced = os.getenv("LEMAS_DEVICE")
     if forced:
         return forced
-    if torch.cuda.is_available():
-        try:
-            torch.zeros(1).to("cuda")
-            return "cuda"
-        except Exception as e:
-            logging.warning("CUDA appears available but failed (%s); falling back to CPU.", e)
-    return "cpu"
 device = _pick_device()
 ASR_DEVICE = "cpu"  # force whisperx/pyannote to CPU to avoid cuDNN issues
@@ -355,10 +353,8 @@ class MMSAlignModel:
     def __init__(self):
         from torchaudio.pipelines import MMS_FA as bundle
         self.mms_model = bundle.get_model()
-        # MMS forced alignment is relatively light; keep it on CPU to avoid
-        # CUDA kernel / arch mismatches on environments where the main TTS
-        # model still uses GPU.
-        self.mms_model.to("cpu")
         self.mms_tokenizer = bundle.get_tokenizer()
         self.mms_aligner = bundle.get_aligner()
         self.text_normalizer = ur.Uroman()
@@ -380,7 +376,7 @@ class MMSAlignModel:
     def compute_alignments(self, waveform: torch.Tensor, tokens):
         with torch.inference_mode():
-            emission, _ = self.mms_model(waveform.to("cpu"))
             token_spans = self.mms_aligner(emission[0], tokens)
         return emission, token_spans
@@ -399,7 +395,7 @@ class MMSAlignModel:
         assert len(text_normed) == len(raw_text), f"normalized text len != raw text len: {len(text_normed)} != {len(raw_text)}"
         tokens = self.mms_tokenizer(text_normed)
         with torch.inference_mode():
-            emission, _ = self.mms_model(waveform.to("cpu"))
             token_spans = self.mms_aligner(emission[0], tokens)
         num_frames = emission.size(1)
         ratio = waveform.size(1) / num_frames
@@ -562,12 +558,41 @@ def load_models(lemas_model_name, whisper_model_name, alignment_model_name, deno
     # Load LEMAS-TTS editing model (selected multilingual variant)
     from pathlib import Path
     ckpt_dir = Path(CKPTS_ROOT) / lemas_model_name
     ckpt_candidates = sorted(
         list(ckpt_dir.glob("*.safetensors")) + list(ckpt_dir.glob("*.pt"))
     )
     if not ckpt_candidates:
-        raise gr.Error(f"No LEMAS-TTS ckpt found under {ckpt_dir}")
     ckpt_file = str(ckpt_candidates[-1])
     vocab_file = Path(PRETRAINED_ROOT) / "data" / lemas_model_name / "vocab.txt"
@@ -1201,6 +1226,12 @@ if __name__ == "__main__":
     parser.add_argument("--port", default=41020, type=int, help="App port")
     parser.add_argument("--share", action="store_true", help="Launch with public url")
     parser.add_argument("--server_name", default="0.0.0.0", type=str, help="Server name for launching the app. 127.0.0.1 for localhost; 0.0.0.0 to allow access from other machines in the local network. Might also give access to external users depends on the firewall settings.")
     os.environ["USER"] = os.getenv("USER", "user")
     args = parser.parse_args()

 from pypinyin.core import Pinyin
 from pypinyin import Style
+from cached_path import cached_path
 from lemas_tts.api import TTS, PRETRAINED_ROOT, CKPTS_ROOT
 from lemas_tts.infer.edit_multilingual import gen_wav_multilingual
 from lemas_tts.infer.text_norm.txt2pinyin import (
 TMP_PATH = os.getenv("TMP_PATH", "./pretrained_models/demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
+# HF location for large TTS checkpoints (too big for Space storage).
+# Mirrors LEMAS-TTS `inference_gradio.py`.
+HF_PRETRAINED_ROOT = "hf://LEMAS-Project/LEMAS-TTS/pretrained_models"
+# Pick device for the TTS editing model.
+# - Default: "cuda" if available, else "cpu"
+# - You can override via LEMAS_DEVICE env (e.g. "cpu" or "cuda").
 def _pick_device():
     forced = os.getenv("LEMAS_DEVICE")
     if forced:
         return forced
+    return "cuda" if torch.cuda.is_available() else "cpu"
 device = _pick_device()
 ASR_DEVICE = "cpu"  # force whisperx/pyannote to CPU to avoid cuDNN issues
     def __init__(self):
         from torchaudio.pipelines import MMS_FA as bundle
         self.mms_model = bundle.get_model()
+        # Keep MMS on the same device as the main edit model unless overridden.
+        self.mms_model.to(device)
         self.mms_tokenizer = bundle.get_tokenizer()
         self.mms_aligner = bundle.get_aligner()
         self.text_normalizer = ur.Uroman()
     def compute_alignments(self, waveform: torch.Tensor, tokens):
         with torch.inference_mode():
+            emission, _ = self.mms_model(waveform.to(device))
             token_spans = self.mms_aligner(emission[0], tokens)
         return emission, token_spans
         assert len(text_normed) == len(raw_text), f"normalized text len != raw text len: {len(text_normed)} != {len(raw_text)}"
         tokens = self.mms_tokenizer(text_normed)
         with torch.inference_mode():
+            emission, _ = self.mms_model(waveform.to(device))
             token_spans = self.mms_aligner(emission[0], tokens)
         num_frames = emission.size(1)
         ratio = waveform.size(1) / num_frames
     # Load LEMAS-TTS editing model (selected multilingual variant)
     from pathlib import Path
+    # Local ckpt search under the standard CKPTS_ROOT layout
     ckpt_dir = Path(CKPTS_ROOT) / lemas_model_name
     ckpt_candidates = sorted(
         list(ckpt_dir.glob("*.safetensors")) + list(ckpt_dir.glob("*.pt"))
     )
+    # Fallbacks for simpler layouts: allow ckpts directly under CKPTS_ROOT,
+    # e.g. ./pretrained_models/ckpts/multilingual_grl.safetensors
+    if not ckpt_candidates:
+        root_candidates = sorted(
+            list(Path(CKPTS_ROOT).glob(f"{lemas_model_name}*.safetensors"))
+            + list(Path(CKPTS_ROOT).glob(f"{lemas_model_name}*.pt"))
+        )
+        ckpt_candidates = root_candidates
+    # If no local ckpt is found, fall back to remote HF checkpoints
+    # (using the same mapping as LEMAS-TTS `inference_gradio.py`).
     if not ckpt_candidates:
+        remote_ckpts = {
+            "multilingual_grl": f"{HF_PRETRAINED_ROOT}/ckpts/multilingual_grl/multilingual_grl.safetensors",
+            "multilingual_prosody": f"{HF_PRETRAINED_ROOT}/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
+        }
+        remote_path = remote_ckpts.get(lemas_model_name)
+        if remote_path is not None:
+            try:
+                resolved = cached_path(remote_path)
+                ckpt_candidates = [Path(resolved)]
+                logging.info("Resolved remote ckpt %s -> %s", remote_path, resolved)
+            except Exception as e:
+                raise gr.Error(f"Failed to download remote ckpt {remote_path}: {e}")
+    if not ckpt_candidates:
+        raise gr.Error(
+            f"No LEMAS-TTS ckpt found for '{lemas_model_name}' under {ckpt_dir} "
+            f"or {CKPTS_ROOT}"
+        )
     ckpt_file = str(ckpt_candidates[-1])
     vocab_file = Path(PRETRAINED_ROOT) / "data" / lemas_model_name / "vocab.txt"
     parser.add_argument("--port", default=41020, type=int, help="App port")
     parser.add_argument("--share", action="store_true", help="Launch with public url")
     parser.add_argument("--server_name", default="0.0.0.0", type=str, help="Server name for launching the app. 127.0.0.1 for localhost; 0.0.0.0 to allow access from other machines in the local network. Might also give access to external users depends on the firewall settings.")
+    parser.add_argument(
+        "--models-path",
+        default="./pretrained_models",
+        dest="models_path",
+        help="Path to pretrained_models root (mirrors LEMAS-TTS layout).",
+    )
     os.environ["USER"] = os.getenv("USER", "user")
     args = parser.parse_args()