Chatterbox-Multilingual-TTS-API

Running

App Files Community

rahul7star committed on Jan 5

Commit

aaaab74

verified ·

1 Parent(s): 91fdade

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -41

app.py CHANGED Viewed

@@ -149,11 +149,10 @@ def get_or_load_model():
                 map_location="cpu"
             )
-            # Extra safety: force CPU
             if hasattr(MODEL, "to"):
                 MODEL = MODEL.to("cpu")
-            # Disable gradients (CPU optimization)
             MODEL.eval()
             for p in MODEL.parameters():
                 p.requires_grad = False
@@ -177,13 +176,11 @@ except Exception as e:
     )
 def set_seed(seed: int):
-    """Sets the random seed for reproducibility across torch, numpy, and random."""
     torch.manual_seed(seed)
-    if DEVICE == "cuda":
-        torch.cuda.manual_seed(seed)
-        torch.cuda.manual_seed_all(seed)
     random.seed(seed)
     np.random.seed(seed)
 def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None:
     """
@@ -206,37 +203,14 @@ def generate_tts_audio(
     seed_num_input: int = 0,
     cfgw_input: float = 0.5
 ) -> tuple[int, np.ndarray]:
-    """
-    Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
-    Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
-    This tool synthesizes natural-sounding speech from input text. When a reference audio file
-    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
-    maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
-    Args:
-        text_input (str): The text to synthesize into speech (maximum 300 characters)
-        language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi)
-        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
-        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
-        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
-        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
-        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
-    Returns:
-        tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
-    """
-    current_model = get_or_load_model()
     if current_model is None:
         raise RuntimeError("TTS model is not loaded.")
     if seed_num_input != 0:
         set_seed(int(seed_num_input))
-    print(f"Generating audio for text: '{text_input[:50]}...'")
-    # Handle optional audio prompt
     chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
     generate_kwargs = {
@@ -244,19 +218,22 @@ def generate_tts_audio(
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
     }
     if chosen_prompt:
         generate_kwargs["audio_prompt_path"] = chosen_prompt
-        print(f"Using audio prompt: {chosen_prompt}")
-    else:
-        print("No audio prompt provided; using default voice.")
-    wav = current_model.generate(
-        text_input[:300],  # Truncate text to max chars
-        language_id=language_id,
-        **generate_kwargs
-    )
-    print("Audio generation complete.")
-    return (current_model.sr, wav.squeeze(0).numpy())
 with gr.Blocks() as demo:
     gr.Markdown(

                 map_location="cpu"
             )
+            # Absolute safety
             if hasattr(MODEL, "to"):
                 MODEL = MODEL.to("cpu")
             MODEL.eval()
             for p in MODEL.parameters():
                 p.requires_grad = False
     )
 def set_seed(seed: int):
+    """CPU-only reproducibility."""
     torch.manual_seed(seed)
     random.seed(seed)
     np.random.seed(seed)
 def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None:
     """
     seed_num_input: int = 0,
     cfgw_input: float = 0.5
 ) -> tuple[int, np.ndarray]:
+    current_model = get_or_load_model()
     if current_model is None:
         raise RuntimeError("TTS model is not loaded.")
     if seed_num_input != 0:
         set_seed(int(seed_num_input))
     chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
     generate_kwargs = {
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
     }
     if chosen_prompt:
         generate_kwargs["audio_prompt_path"] = chosen_prompt
+    # 🔒 CPU-safe inference
+    with torch.no_grad():
+        wav = current_model.generate(
+            text_input[:300],
+            language_id=language_id,
+            **generate_kwargs
+        )
+    # Ensure CPU numpy conversion
+    wav = wav.squeeze(0).detach().cpu().numpy()
+    return (current_model.sr, wav)
 with gr.Blocks() as demo:
     gr.Markdown(