BlueV2

Running

notmax123 committed on Apr 25

Commit

615a636

1 Parent(s): 35eb04b

Use v2 model repositories for Space runtime

Fetch ONNX assets from the v2 bundle and voice-export safetensors from blue-v2 so uploaded reference voices are not mixed with old v1 checkpoints or voice JSONs.

Made-with: Cursor

Files changed (3) hide show

app.py +37 -12
download_models.py +14 -20
export_new_voice.py +16 -16

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ from num2words import num2words
 import gradio as gr
 import onnxruntime as ort
-from download_models import download_blue_models, download_default_voices, download_renikud
 # ------------------------------------------------------------------
 # Paths
@@ -42,6 +42,12 @@ VOCAB_PATH = next(
 def _needs_download() -> bool:
     required = ["text_encoder.onnx", "vector_estimator.onnx", "vocoder.onnx",
                 "duration_predictor.onnx"]
     for fn in required:
         p = os.path.join(ONNX_DIR, fn)
         if not os.path.exists(p) or os.path.getsize(p) < 1000:
@@ -572,13 +578,25 @@ TTS = BlueTTS(ONNX_DIR, CONFIG_PATH, VOCAB_PATH, RENIKUD_PATH)
 def discover_voices() -> Dict[str, str]:
     out: Dict[str, str] = {}
     for p in sorted(glob.glob(os.path.join(VOICES_DIR, "*.json"))):
         label = os.path.splitext(os.path.basename(p))[0]
         pretty = label.replace("_", " ").replace("spk ", "Speaker ").title()
         out[pretty] = p
     return out
-VOICES: Dict[str, str] = discover_voices() or {"Default": next(iter(discover_voices().values()), "")}
 VOICE_STYLES: Dict[str, Style] = {name: load_voice_style([path]) for name, path in VOICES.items()}
@@ -603,19 +621,20 @@ def _hash_file(path: str) -> str:
 def _ensure_pt_weights() -> dict[str, str]:
-    """Make sure pt checkpoints are on disk; download from notmax123/blue if missing."""
     needed: dict[str, Optional[str]] = {k: _find_pt_weight(v) for k, v in PT_WEIGHT_ALIASES.items()}
     if any(v is None for v in needed.values()):
         from huggingface_hub import hf_hub_download
         import shutil
         os.makedirs("pt_weights", exist_ok=True)
-        for fn in ("blue_codec.safetensors", "duration_predictor_final.pt",
-                   "vf_estimetor.pt", "stats_multilingual.pt"):
             dest = os.path.join("pt_weights", fn)
             if not os.path.exists(dest):
-                print(f"[INFO] Fetching notmax123/blue/{fn} …")
                 cached = hf_hub_download(
-                    repo_id="notmax123/blue", filename=fn, repo_type="model",
                     token=os.environ.get("HF_TOKEN") or None,
                 )
                 shutil.copy2(cached, dest)
@@ -695,6 +714,12 @@ def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps:
             err = f'<div class="stats-bar"><span class="stat-pill">❌ voice clone failed: {e}</span></div>'
             return None, err
     else:
         style = VOICE_STYLES[voice]
     wav, sr = TTS.synthesize(
         expand_numbers(text, lang=lang), lang=lang, style=style,
@@ -726,10 +751,10 @@ PT_WEIGHTS_SEARCH = [
     "pt_weights",
 ]
 PT_WEIGHT_ALIASES: dict[str, list[str]] = {
-    "ae_ckpt":  ["blue_codec.safetensors", "blue_codec.pt"],
-    "ttl_ckpt": ["vf_estimetor.pt", "vf_estimator.pt"],
-    "dp_ckpt":  ["duration_predictor_final.pt", "duration_predictor.pt"],
-    "stats":    ["stats_multilingual.pt", "stats.pt"],
 }
@@ -870,7 +895,7 @@ with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
         with gr.Column(elem_classes="ref-panel"):
             voice_source_input = gr.Radio(
                 choices=[("Saved voice", "saved"), ("Uploaded reference", "upload")],
-                value="saved",
                 label="Voice source",
             )
             ref_wav_input = gr.Audio(

 import gradio as gr
 import onnxruntime as ort
+from download_models import BLUE_REPO, download_blue_models, download_default_voices, download_renikud
 # ------------------------------------------------------------------
 # Paths
 def _needs_download() -> bool:
     required = ["text_encoder.onnx", "vector_estimator.onnx", "vocoder.onnx",
                 "duration_predictor.onnx"]
+    repo_marker = os.path.join(ONNX_DIR, ".repo_id")
+    if not os.path.exists(repo_marker):
+        return True
+    with open(repo_marker) as f:
+        if f.read().strip() != BLUE_REPO:
+            return True
     for fn in required:
         p = os.path.join(ONNX_DIR, fn)
         if not os.path.exists(p) or os.path.getsize(p) < 1000:
 def discover_voices() -> Dict[str, str]:
     out: Dict[str, str] = {}
     for p in sorted(glob.glob(os.path.join(VOICES_DIR, "*.json"))):
+        try:
+            with open(p) as f:
+                payload = json.load(f)
+            ttl = payload.get("style_ttl")
+            if ttl:
+                arr = np.array(ttl["data"], dtype=np.float32)
+                if float(arr.std()) > 0.3:
+                    print(f"[INFO] Skipping incompatible voice JSON {p} (style_ttl std={arr.std():.3f})")
+                    continue
+        except Exception as e:
+            print(f"[WARN] Skipping unreadable voice JSON {p}: {e}")
+            continue
         label = os.path.splitext(os.path.basename(p))[0]
         pretty = label.replace("_", " ").replace("spk ", "Speaker ").title()
         out[pretty] = p
     return out
+VOICES: Dict[str, str] = discover_voices()
 VOICE_STYLES: Dict[str, Style] = {name: load_voice_style([path]) for name, path in VOICES.items()}
 def _ensure_pt_weights() -> dict[str, str]:
+    """Make sure v2 PyTorch/safetensors checkpoints are on disk."""
     needed: dict[str, Optional[str]] = {k: _find_pt_weight(v) for k, v in PT_WEIGHT_ALIASES.items()}
     if any(v is None for v in needed.values()):
         from huggingface_hub import hf_hub_download
         import shutil
         os.makedirs("pt_weights", exist_ok=True)
+        repo_id = os.environ.get("BLUE_PT_REPO", "notmax123/blue-v2")
+        for fn in ("blue_codec.safetensors", "duration_predictor_final.safetensors",
+                   "vf_estimetor.safetensors", "stats_multilingual.safetensors"):
             dest = os.path.join("pt_weights", fn)
             if not os.path.exists(dest):
+                print(f"[INFO] Fetching {repo_id}/{fn} …")
                 cached = hf_hub_download(
+                    repo_id=repo_id, filename=fn, repo_type="model",
                     token=os.environ.get("HF_TOKEN") or None,
                 )
                 shutil.copy2(cached, dest)
             err = f'<div class="stats-bar"><span class="stat-pill">❌ voice clone failed: {e}</span></div>'
             return None, err
     else:
+        if not VOICE_STYLES:
+            err = (
+                '<div class="stats-bar"><span class="stat-pill">'
+                'No saved v2 voices are installed. Choose "Uploaded reference" and upload audio.</span></div>'
+            )
+            return None, err
         style = VOICE_STYLES[voice]
     wav, sr = TTS.synthesize(
         expand_numbers(text, lang=lang), lang=lang, style=style,
     "pt_weights",
 ]
 PT_WEIGHT_ALIASES: dict[str, list[str]] = {
+    "ae_ckpt":  ["blue_codec.safetensors"],
+    "ttl_ckpt": ["vf_estimetor.safetensors"],
+    "dp_ckpt":  ["duration_predictor_final.safetensors"],
+    "stats":    ["stats_multilingual.safetensors"],
 }
         with gr.Column(elem_classes="ref-panel"):
             voice_source_input = gr.Radio(
                 choices=[("Saved voice", "saved"), ("Uploaded reference", "upload")],
+                value="saved" if VOICE_STYLES else "upload",
                 label="Voice source",
             )
             ref_wav_input = gr.Audio(

download_models.py CHANGED Viewed

@@ -1,35 +1,30 @@
-"""Download the slim BlueTTS ONNX bundle + a couple of sample voices."""
 import os
 import shutil
-from huggingface_hub import hf_hub_download, list_repo_files
-BLUE_REPO = "notmax123/blue-onnx"
 RENIKUD_REPO = "thewh1teagle/renikud"
-# Core slim bundle: 4 ONNX files + tts config.
 BLUE_FILES = [
     "text_encoder.onnx",
     "vector_estimator.onnx",
     "vocoder.onnx",
     "duration_predictor.onnx",
-    "tts.json",
 ]
-# Default voices fetched for the UI. Users can drop additional voice JSONs into
-# the same directory (e.g. by exporting with ``export_new_voice.py``) and they
-# will be picked up automatically.
-DEFAULT_VOICES: dict[str, str] = {
-    "Female": "voices/all_voices/female/spk_00014.json",
-    "Male":   "voices/all_voices/male/spk_00017.json",
-}
 def _is_valid(path: str, min_bytes: int = 100) -> bool:
     return os.path.exists(path) and os.path.getsize(path) >= min_bytes
-def _fetch(repo_id: str, filename: str, dest: str, min_bytes: int = 100) -> None:
-    if _is_valid(dest, min_bytes):
         print(f"Already present: {dest} ({os.path.getsize(dest):,} bytes)")
         return
     os.makedirs(os.path.dirname(dest) or ".", exist_ok=True)
@@ -44,14 +39,13 @@ def _fetch(repo_id: str, filename: str, dest: str, min_bytes: int = 100) -> None
 def download_blue_models(dest_dir: str = "onnx_slim") -> None:
     os.makedirs(dest_dir, exist_ok=True)
     for filename in BLUE_FILES:
         dest = os.path.join(dest_dir, filename)
-        try:
-            _fetch(BLUE_REPO, filename, dest, min_bytes=100)
-        except Exception as e:
-            print(f"  FAILED {filename}: {e}")
-            if filename.endswith(".onnx"):
-                raise
 def download_default_voices(dest_dir: str = "voices") -> dict[str, str]:

+"""Download the slim BlueTTS ONNX bundle + matching sample voices."""
 import os
 import shutil
+from huggingface_hub import hf_hub_download
+BLUE_REPO = os.environ.get("BLUE_ONNX_REPO", "notmax123/blue-onnx-v2")
 RENIKUD_REPO = "thewh1teagle/renikud"
+# Core slim bundle. Config is kept in the Space repo as root tts.json.
 BLUE_FILES = [
     "text_encoder.onnx",
     "vector_estimator.onnx",
     "vocoder.onnx",
     "duration_predictor.onnx",
 ]
+# Users can drop matching v2 voice JSONs into ./voices. The v2 ONNX repo does
+# not ship default voices, and old v1 voices are not compatible.
+DEFAULT_VOICES: dict[str, str] = {}
 def _is_valid(path: str, min_bytes: int = 100) -> bool:
     return os.path.exists(path) and os.path.getsize(path) >= min_bytes
+def _fetch(repo_id: str, filename: str, dest: str, min_bytes: int = 100, *, force: bool = False) -> None:
+    if not force and _is_valid(dest, min_bytes):
         print(f"Already present: {dest} ({os.path.getsize(dest):,} bytes)")
         return
     os.makedirs(os.path.dirname(dest) or ".", exist_ok=True)
 def download_blue_models(dest_dir: str = "onnx_slim") -> None:
     os.makedirs(dest_dir, exist_ok=True)
+    marker = os.path.join(dest_dir, ".repo_id")
+    force = not os.path.exists(marker) or open(marker).read().strip() != BLUE_REPO
     for filename in BLUE_FILES:
         dest = os.path.join(dest_dir, filename)
+        _fetch(BLUE_REPO, filename, dest, min_bytes=100, force=force)
+    with open(marker, "w") as f:
+        f.write(BLUE_REPO + "\n")
 def download_default_voices(dest_dir: str = "voices") -> dict[str, str]:

export_new_voice.py CHANGED Viewed

@@ -5,17 +5,17 @@ Build a *voice style* JSON for Blue (BlueTTS) from one reference WAV.
 See repo README for usage. Requires the BlueTTS training codebase on
 ``PYTHONPATH`` and the PyTorch checkpoints (``blue_codec.safetensors``,
-``vf_estimator.safetensors``, ``duration_predictor.safetensors``,
-``stats_multilingual.pt``).
     PYTHONPATH=training uv run python export_new_voice.py \
         --ref_wav /path/to/ref.wav \
         --out voices/mine.json \
         --config tts.json \
         --ae_ckpt pt_weights/blue_codec.safetensors \
-        --ttl_ckpt pt_weights/vf_estimator.safetensors \
-        --dp_ckpt pt_weights/duration_predictor.safetensors \
-        --stats pt_weights/stats_multilingual.pt
 """
 from __future__ import annotations
@@ -40,12 +40,12 @@ if _TRAINING not in sys.path:
 from bluecodec.autoencoder.latent_encoder import LatentEncoder  # noqa: E402
 from models.utils import LinearMelSpectrogram, compress_latents, load_ttl_config  # noqa: E402
-HF_REPO_ID = "notmax123/blue"
 HF_WEIGHT_SIZES: dict[str, int] = {
     "blue_codec.safetensors": 245_114_104,
-    "duration_predictor.safetensors": 2_040_512,
-    "stats_multilingual.pt": 3_133,
-    "vf_estimator.safetensors": 179_313_224,
 }
@@ -93,7 +93,7 @@ def load_stats(device: str, preferred: str, fallback: str = "stats.pt"):
     stats_path = preferred if os.path.exists(preferred) else fallback
     if not os.path.exists(stats_path):
         raise FileNotFoundError(f"Missing stats file: tried {preferred} and {fallback}")
-    stats = torch.load(stats_path, map_location=device, weights_only=False)
     mean = stats["mean"].to(device).view(1, -1, 1)
     std = stats["std"].to(device).view(1, -1, 1)
     return mean, std, stats_path
@@ -144,9 +144,9 @@ def export_voice_style(
     *,
     config: str = "tts.json",
     ae_ckpt: str = "blue_codec.safetensors",
-    ttl_ckpt: str = "vf_estimator.safetensors",
-    dp_ckpt: str = "duration_predictor.safetensors",
-    stats: str = "stats_multilingual.pt",
     device: str = "cpu",
     out_pt: str | None = None,
     verify_hf_sizes_flag: bool = False,
@@ -320,9 +320,9 @@ def main() -> None:
     ap.add_argument("--out", type=str, default="voice.json")
     ap.add_argument("--out_pt", type=str, default=None)
     ap.add_argument("--ae_ckpt", type=str, default="blue_codec.safetensors")
-    ap.add_argument("--stats", type=str, default="stats_multilingual.pt")
-    ap.add_argument("--ttl_ckpt", type=str, default="vf_estimator.safetensors")
-    ap.add_argument("--dp_ckpt", type=str, default="duration_predictor.safetensors")
     ap.add_argument("--verify_hf_sizes", action="store_true")
     ap.add_argument("--device", type=str, default="cpu")
     ap.add_argument("--config", type=str, default="tts.json")

 See repo README for usage. Requires the BlueTTS training codebase on
 ``PYTHONPATH`` and the PyTorch checkpoints (``blue_codec.safetensors``,
+``vf_estimetor.safetensors``, ``duration_predictor_final.safetensors``,
+``stats_multilingual.safetensors``).
     PYTHONPATH=training uv run python export_new_voice.py \
         --ref_wav /path/to/ref.wav \
         --out voices/mine.json \
         --config tts.json \
         --ae_ckpt pt_weights/blue_codec.safetensors \
+        --ttl_ckpt pt_weights/vf_estimetor.safetensors \
+        --dp_ckpt pt_weights/duration_predictor_final.safetensors \
+        --stats pt_weights/stats_multilingual.safetensors
 """
 from __future__ import annotations
 from bluecodec.autoencoder.latent_encoder import LatentEncoder  # noqa: E402
 from models.utils import LinearMelSpectrogram, compress_latents, load_ttl_config  # noqa: E402
+HF_REPO_ID = "notmax123/blue-v2"
 HF_WEIGHT_SIZES: dict[str, int] = {
     "blue_codec.safetensors": 245_114_104,
+    "duration_predictor_final.safetensors": 2_040_744,
+    "stats_multilingual.safetensors": 1_416,
+    "vf_estimetor.safetensors": 174_487_392,
 }
     stats_path = preferred if os.path.exists(preferred) else fallback
     if not os.path.exists(stats_path):
         raise FileNotFoundError(f"Missing stats file: tried {preferred} and {fallback}")
+    stats = load_torch_or_safetensors(stats_path, map_location=device)
     mean = stats["mean"].to(device).view(1, -1, 1)
     std = stats["std"].to(device).view(1, -1, 1)
     return mean, std, stats_path
     *,
     config: str = "tts.json",
     ae_ckpt: str = "blue_codec.safetensors",
+    ttl_ckpt: str = "vf_estimetor.safetensors",
+    dp_ckpt: str = "duration_predictor_final.safetensors",
+    stats: str = "stats_multilingual.safetensors",
     device: str = "cpu",
     out_pt: str | None = None,
     verify_hf_sizes_flag: bool = False,
     ap.add_argument("--out", type=str, default="voice.json")
     ap.add_argument("--out_pt", type=str, default=None)
     ap.add_argument("--ae_ckpt", type=str, default="blue_codec.safetensors")
+    ap.add_argument("--stats", type=str, default="stats_multilingual.safetensors")
+    ap.add_argument("--ttl_ckpt", type=str, default="vf_estimetor.safetensors")
+    ap.add_argument("--dp_ckpt", type=str, default="duration_predictor_final.safetensors")
     ap.add_argument("--verify_hf_sizes", action="store_true")
     ap.add_argument("--device", type=str, default="cpu")
     ap.add_argument("--config", type=str, default="tts.json")