SoundImage-VoiceClone

Runtime error

App Files Files Community

Steveeeeeeen HF Staff committed on Feb 12

Commit

b8a3553

verified ·

1 Parent(s): 22bde2c

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -18

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import gradio as gr
 import spaces
 from zonos.model import Zonos
-from zonos.conditioning import make_cond_dict, supported_language_codes
 # We'll keep a global dictionary of loaded models to avoid reloading
 MODELS_CACHE = {}
@@ -13,6 +13,15 @@ device = "cuda"
 banner_url = "https://huggingface.co/datasets/Steveeeeeeen/random_images/resolve/main/ZonosHeader.png"
 BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 150px; max-width: 300px;"> </div>'
 def load_model(model_name: str):
     """
     Loads or retrieves a cached Zonos model, sets it to eval and bfloat16.
@@ -28,15 +37,20 @@ def load_model(model_name: str):
     return MODELS_CACHE[model_name]
 @spaces.GPU(duration=90)
-def tts(text, speaker_audio, selected_language, model_choice):
     """
     text: str (Text prompt to synthesize)
     speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
-    selected_language: str (language code)
     model_choice: str (which Zonos model to use, e.g., "Zyphra/Zonos-v0.1-hybrid")
     Returns (sr_out, wav_out_numpy).
     """
     model = load_model(model_choice)
     if not text:
@@ -52,12 +66,11 @@ def tts(text, speaker_audio, selected_language, model_choice):
     # Convert to Torch tensor
     wav_tensor = torch.from_numpy(wav_np).float()
-    # If stereo (shape [channels, samples]) or multi-channel, downmix to mono
-    # e.g. shape (2, samples) -> shape (samples,) by averaging
     if wav_tensor.ndim == 2 and wav_tensor.shape[0] > 1:
-        wav_tensor = wav_tensor.mean(dim=0)  # shape => (samples,)
-    # Now add a batch dimension => shape (1, samples)
     wav_tensor = wav_tensor.unsqueeze(0)
     # Get speaker embedding
@@ -66,12 +79,12 @@ def tts(text, speaker_audio, selected_language, model_choice):
         spk_embedding = spk_embedding.to(device, dtype=torch.bfloat16)
     # Prepare conditioning dictionary
-    cond_dict = make_cond_dict(
-        text=text,
-        speaker=spk_embedding,
-        language=selected_language,
-        device=device,
-    )
     conditioning = model.prepare_conditioning(cond_dict)
     # Generate codes
@@ -106,8 +119,6 @@ def build_demo():
             ref_audio_input = gr.Audio(
                 label="Reference Audio (Speaker Cloning)",
                 type="numpy"
-                # Optionally add mono=True if you want Gradio to always downmix automatically:
-                # mono=True
             )
         model_dropdown = gr.Dropdown(
@@ -116,10 +127,12 @@ def build_demo():
             value="Zyphra/Zonos-v0.1-hybrid",
             interactive=True,
         )
         language_dropdown = gr.Dropdown(
-            label="Language Code",
-            choices=supported_language_codes,
-            value="en-us",
             interactive=True,
         )

 import spaces
 from zonos.model import Zonos
+from zonos.conditioning import make_cond_dict  # Keep this; remove supported_language_codes
 # We'll keep a global dictionary of loaded models to avoid reloading
 MODELS_CACHE = {}
 banner_url = "https://huggingface.co/datasets/Steveeeeeeen/random_images/resolve/main/ZonosHeader.png"
 BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 150px; max-width: 300px;"> </div>'
+# Define a list of tuples: (Display Label, Language Code)
+LANGUAGES = [
+    ("English",  "en-us"),
+    ("Japanese", "ja"),
+    ("Chinese",  "cmn"),
+    ("French",   "fr-fr"),
+    ("German",   "de"),
+]
 def load_model(model_name: str):
     """
     Loads or retrieves a cached Zonos model, sets it to eval and bfloat16.
     return MODELS_CACHE[model_name]
 @spaces.GPU(duration=90)
+def tts(text, speaker_audio, selected_language_label, model_choice):
     """
     text: str (Text prompt to synthesize)
     speaker_audio: (sample_rate, numpy_array) from Gradio if type="numpy"
+    selected_language_label: str (the display name from the dropdown, e.g. "Chinese")
     model_choice: str (which Zonos model to use, e.g., "Zyphra/Zonos-v0.1-hybrid")
     Returns (sr_out, wav_out_numpy).
     """
+    # Map from label -> actual language code
+    label_to_code = dict(LANGUAGES)
+    # Convert the human-readable label back to the code
+    selected_language = label_to_code[selected_language_label]
     model = load_model(model_choice)
     if not text:
     # Convert to Torch tensor
     wav_tensor = torch.from_numpy(wav_np).float()
+    # If stereo or multi-channel, downmix to mono
     if wav_tensor.ndim == 2 and wav_tensor.shape[0] > 1:
+        wav_tensor = wav_tensor.mean(dim=0)  # => (samples,)
+    # Add batch dimension => (1, samples)
     wav_tensor = wav_tensor.unsqueeze(0)
     # Get speaker embedding
         spk_embedding = spk_embedding.to(device, dtype=torch.bfloat16)
     # Prepare conditioning dictionary
+    cond_dict = {
+        "text": text,
+        "speaker": spk_embedding,
+        "language": selected_language,  # Use the code here
+        "device": device,
+    }
     conditioning = model.prepare_conditioning(cond_dict)
     # Generate codes
             ref_audio_input = gr.Audio(
                 label="Reference Audio (Speaker Cloning)",
                 type="numpy"
             )
         model_dropdown = gr.Dropdown(
             value="Zyphra/Zonos-v0.1-hybrid",
             interactive=True,
         )
+        # For the language dropdown, we display only the friendly label
         language_dropdown = gr.Dropdown(
+            label="Language",
+            choices=[label for (label, code) in LANGUAGES],
+            value="English",  # default display
             interactive=True,
         )