chatterbox-tts-dhivehi

Build error

App Files Files Community

alakxender committed on Oct 11, 2025

Commit

71b9145

1 Parent(s): 281d8d1

t

Browse files

Files changed (2) hide show

app.py +101 -28
chatterbox_dhivehi.py +1 -1

app.py CHANGED Viewed

@@ -59,16 +59,18 @@ def download_model():
         print(f"Warning: Could not download model files: {e}")
         print("=" * 60)
-def load_model(checkpoint=f"{_target}/kn_cbox"):
     """Load the TTS model"""
     global MODEL
     try:
-        print(f"Loading model with checkpoint: {checkpoint}")
         MODEL = ChatterboxTTS.from_dhivehi(
-            ckpt_dir=Path(checkpoint),
-            device="cuda" if torch.cuda.is_available() else "cpu"
         )
-        print("Model loaded successfully!")
     except Exception as e:
         print(f"Error loading model: {e}")
         raise e
@@ -82,14 +84,14 @@ def set_seed(seed: int):
     random.seed(seed)
     np.random.seed(seed)
-@spaces.GPU(duration=60)
-def generate_speech(text,
-                   reference_audio,
-                   exaggeration=0.5,
-                   temperature=0.1,
-                   cfg_weight=0.5,
-                   seed=42):
-    """Generate speech from text using voice cloning"""
     global MODEL
     # Clean the input text
@@ -161,6 +163,25 @@ def generate_speech(text,
         print(error_msg)
         return None, error_msg
 def clean_text(text):
     """Clean text by removing newlines at start/end, double spaces, and extra whitespace"""
     import re
@@ -224,14 +245,15 @@ def split_sentences(text):
     return final_sentences
-@spaces.GPU
-def generate_speech_multi_sentence(text,
-                                   reference_audio,
-                                   exaggeration=0.5,
-                                   temperature=0.1,
-                                   cfg_weight=0.5,
-                                   seed=42):
-    """Generate speech from text with multi-sentence support and progress tracking"""
     global MODEL
     # Clean the input text
@@ -251,7 +273,7 @@ def generate_speech_multi_sentence(text,
     # If only one sentence or no periods, use regular method
     if len(sentences) <= 1:
         yield None, "Generating single sentence..."
-        result_audio, result_status = generate_speech(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
         yield result_audio, result_status
         return
@@ -360,12 +382,32 @@ def generate_speech_multi_sentence(text,
         print(error_msg)
         yield None, error_msg
 def create_interface():
     """Create the Gradio interface"""
-    # Load the model
-    load_model()
     # Sample texts in Dhivehi
     sample_texts = [
         "ކާޑު ނުލައި ފައިސާ ދެއްކޭ ނެޝަނަލް ކިއުއާރް ކޯޑް އެމްއެމްއޭ އިން ތައާރަފްކުރަނީ",
@@ -456,6 +498,21 @@ The ministry handed over the land reclamation, replacement of the port canal and
                     label="Seed",
                     info="For reproducible results"
                 )
         # Row 4: Generate button
         generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
@@ -473,6 +530,15 @@ The ministry handed over the land reclamation, replacement of the port canal and
         def set_reference_audio(audio_file):
             return audio_file
         sample_btn1.click(lambda: set_sample_text(0), outputs=[text_input])
         sample_btn2.click(lambda: set_sample_text(1), outputs=[text_input])
         sample_btn3.click(lambda: set_sample_text(2), outputs=[text_input])
@@ -483,17 +549,24 @@ The ministry handed over the land reclamation, replacement of the port canal and
         ref_btn3.click(lambda: set_reference_audio("m1.wav"), outputs=[reference_audio])
         ref_btn4.click(lambda: set_reference_audio("m2.wav"), outputs=[reference_audio])
-        def generate_with_progress(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
             """Generate speech with streaming progress updates"""
             # Use the streaming generator
             for result_audio, result_status in generate_speech_multi_sentence(
-                text, reference_audio, exaggeration, temperature, cfg_weight, seed
             ):
                 yield result_audio, result_status
         generate_btn.click(
             fn=generate_with_progress,
-            inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed],
             outputs=[output_audio, status_message]
         )

         print(f"Warning: Could not download model files: {e}")
         print("=" * 60)
+def load_model(checkpoint="kn_cbox", device="cuda"):
     """Load a Dhivehi ChatterboxTTS checkpoint into the global MODEL.

     Args:
         checkpoint: Checkpoint directory name; it is joined onto `_target`
             to form the path handed to `ChatterboxTTS.from_dhivehi`.
         device: Device string forwarded unchanged to `from_dhivehi`.

     Raises:
         Exception: Any error raised while loading is printed and re-raised.

     NOTE(review): `device` defaults to "cuda" and no availability check is
     made in this function -- confirm callers pass "cpu" on CPU-only hosts.
     """
     global MODEL
     try:
+        checkpoint_path = f"{_target}/{checkpoint}"
+        print(f"Loading model with checkpoint: {checkpoint_path}")
+        print(f"Target device: {device}")
         MODEL = ChatterboxTTS.from_dhivehi(
+            ckpt_dir=Path(checkpoint_path),
+            device=device
         )
+        print(f"Model loaded successfully on {device}!")
     except Exception as e:
         print(f"Error loading model: {e}")
         raise e  # logged above; the caller decides how to report the failure
     random.seed(seed)
     np.random.seed(seed)
+# Internal implementation without decorator
+def _generate_speech_impl(text,
+                         reference_audio,
+                         exaggeration=0.5,
+                         temperature=0.1,
+                         cfg_weight=0.5,
+                         seed=42):
+    """Internal implementation of generate speech"""
     global MODEL
     # Clean the input text
         print(error_msg)
         return None, error_msg
+# GPU version with decorator: identical call to the CPU variant, but wrapped in @spaces.GPU(duration=60)
+@spaces.GPU(duration=60)
+def _generate_speech_gpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
+    """Forward all arguments to `_generate_speech_impl` under the `spaces.GPU` decorator and return its result unchanged."""
+    return _generate_speech_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
+# CPU version without decorator: same call as the GPU variant, with no `spaces.GPU` wrapping
+def _generate_speech_cpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
+    """Forward all arguments to `_generate_speech_impl` directly and return its result unchanged."""
+    return _generate_speech_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
+# Router function: picks the decorated (GPU) or undecorated (CPU) wrapper based on `use_gpu`
+def generate_speech(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42, use_gpu=True):
+    """Generate speech from text using voice cloning.
+
+    Dispatches to `_generate_speech_gpu` when `use_gpu` is truthy and to
+    `_generate_speech_cpu` otherwise; the remaining arguments are passed
+    through positionally and the chosen wrapper's return value is returned
+    as-is (callers in this file unpack it as an `(audio, status)` pair).
+    """
+    if use_gpu:
+        return _generate_speech_gpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
+    else:
+        return _generate_speech_cpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
 def clean_text(text):
     """Clean text by removing newlines at start/end, double spaces, and extra whitespace"""
     import re
     return final_sentences
+# Internal implementation without decorator
+def _generate_speech_multi_sentence_impl(text,
+                                        reference_audio,
+                                        exaggeration=0.5,
+                                        temperature=0.1,
+                                        cfg_weight=0.5,
+                                        seed=42,
+                                        use_gpu=True):
+    """Internal implementation of multi-sentence speech generation"""
     global MODEL
     # Clean the input text
     # If only one sentence or no periods, use regular method
     if len(sentences) <= 1:
         yield None, "Generating single sentence..."
+        result_audio, result_status = generate_speech(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu)
         yield result_audio, result_status
         return
         print(error_msg)
         yield None, error_msg
+# GPU version with decorator: streams the shared implementation's results from inside @spaces.GPU
+@spaces.GPU
+def _generate_speech_multi_sentence_gpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
+    """Re-yield every item produced by `_generate_speech_multi_sentence_impl` called with `use_gpu=True`.
+
+    NOTE(review): with `use_gpu=True` the implementation's single-sentence
+    path calls `generate_speech(..., use_gpu)`, which routes to another
+    `@spaces.GPU`-decorated function while this one is still running --
+    confirm that nesting decorated calls is supported.
+    """
+    for result in _generate_speech_multi_sentence_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu=True):
+        yield result
+# CPU version without decorator: streams the shared implementation's results with no `spaces.GPU` wrapping
+def _generate_speech_multi_sentence_cpu(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42):
+    """Re-yield every item produced by `_generate_speech_multi_sentence_impl` called with `use_gpu=False`."""
+    for result in _generate_speech_multi_sentence_impl(text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu=False):
+        yield result
+# Router function: picks the decorated (GPU) or undecorated (CPU) generator based on `use_gpu`
+def generate_speech_multi_sentence(text, reference_audio, exaggeration=0.5, temperature=0.1, cfg_weight=0.5, seed=42, use_gpu=True):
+    """Generate speech from text with multi-sentence support and progress tracking.
+
+    This is a generator: it re-yields, unchanged and in order, every item
+    produced by `_generate_speech_multi_sentence_gpu` when `use_gpu` is
+    truthy, or by `_generate_speech_multi_sentence_cpu` otherwise. The caller
+    in this file unpacks each item as an `(audio, status)` pair.
+    """
+    if use_gpu:
+        for result in _generate_speech_multi_sentence_gpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
+            yield result
+    else:
+        for result in _generate_speech_multi_sentence_cpu(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
+            yield result
 def create_interface():
     """Create the Gradio interface"""
     # Sample texts in Dhivehi
     sample_texts = [
         "ކާޑު ނުލައި ފައިސާ ދެއްކޭ ނެޝަނަލް ކިއުއާރް ކޯޑް އެމްއެމްއޭ އިން ތައާރަފްކުރަނީ",
                     label="Seed",
                     info="For reproducible results"
                 )
+            with gr.Row():
+                model_select = gr.Dropdown(
+                    choices=["kn_cbox", "f01_cbox"],
+                    value="kn_cbox",
+                    label="Model",
+                    info="Select TTS model"
+                )
+                device_select = gr.Dropdown(
+                    choices=["GPU", "CPU"],
+                    value="GPU",
+                    label="Device",
+                    info="Select computation device"
+                )
+                reload_btn = gr.Button("🔄 Reload Model", size="sm")
+            reload_status = gr.Textbox(label="Model Status", value="Model not loaded", interactive=False)
         # Row 4: Generate button
         generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
         def set_reference_audio(audio_file):
             return audio_file
+        def reload_model_handler(model_name, device_name):
+            """Reload model with selected checkpoint and device.
+
+            Maps the device label "GPU" to "cuda" and any other value to
+            "cpu", calls `load_model`, and returns a status string. Any
+            exception from loading is caught and returned as an error
+            string rather than raised.
+            """
+            try:
+                device = "cuda" if device_name == "GPU" else "cpu"
+                load_model(checkpoint=model_name, device=device)
+                return f"✅ Model '{model_name}' loaded successfully on {device_name}!"
+            except Exception as e:
+                return f"❌ Error loading model: {str(e)}"
         sample_btn1.click(lambda: set_sample_text(0), outputs=[text_input])
         sample_btn2.click(lambda: set_sample_text(1), outputs=[text_input])
         sample_btn3.click(lambda: set_sample_text(2), outputs=[text_input])
         ref_btn3.click(lambda: set_reference_audio("m1.wav"), outputs=[reference_audio])
         ref_btn4.click(lambda: set_reference_audio("m2.wav"), outputs=[reference_audio])
+        reload_btn.click(
+            fn=reload_model_handler,
+            inputs=[model_select, device_select],
+            outputs=[reload_status]
+        )
+        def generate_with_progress(text, reference_audio, exaggeration, temperature, cfg_weight, seed, device_name):
             """Generate speech with streaming progress updates.

             Yields each `(audio, status)` pair produced by
             `generate_speech_multi_sentence`, choosing the GPU path only
             when `device_name` equals "GPU".

             NOTE(review): `device_name` only selects the code path here; this
             function does not reload the model -- confirm the model was
             loaded on the matching device.
             """
+            use_gpu = (device_name == "GPU")
             # Use the streaming generator
             for result_audio, result_status in generate_speech_multi_sentence(
+                text, reference_audio, exaggeration, temperature, cfg_weight, seed, use_gpu
             ):
                 yield result_audio, result_status
         generate_btn.click(
             fn=generate_with_progress,
+            inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed, device_select],
             outputs=[output_audio, status_message]
         )

chatterbox_dhivehi.py CHANGED Viewed

@@ -156,7 +156,7 @@ def from_dhivehi(
     *,
     ckpt_dir: Union[str, Path],
     device: str = "cpu",
-    force_vocab_size: int = 2500,
 ):
     """
     Construct a Dhivehi-extended ChatterboxTTS from a checkpoint directory.

     *,
     ckpt_dir: Union[str, Path],
     device: str = "cpu",
+    force_vocab_size: int = 2000,
 ):
     """
     Construct a Dhivehi-extended ChatterboxTTS from a checkpoint directory.