Spaces:

bobsackett
/

ChatterboxTTS-DNXS-Spokenwordv1

Sleeping

App Files Files Community

danneauxs committed on Aug 12, 2025

Commit

3aa3268

1 Parent(s): 131e31b

added random seed setting

Browse files

Files changed (4) hide show

config/config.py +42 -1
gradio_tabs/tab1_convert_book.py +61 -3
modules/tts_engine.py +27 -2
src/chatterbox/tts.py +2 -0

config/config.py CHANGED Viewed

@@ -22,7 +22,7 @@ MIN_CHUNK_WORDS = 4
 # ============================================================================
 # WORKER AND PERFORMANCE SETTINGS
 # ============================================================================
-MAX_WORKERS = 1
 TEST_MAX_WORKERS = 6                  # For experimentation
 USE_DYNAMIC_WORKERS = False           # Toggle for testing
 VRAM_SAFETY_THRESHOLD = 6.5           # GB
@@ -140,6 +140,7 @@ CYAN = "\033[96m"
 DEFAULT_EXAGGERATION = 0.5
 DEFAULT_CFG_WEIGHT = 0.5
 DEFAULT_TEMPERATURE = 0.85
 # Advanced Sampling Parameters (Min_P Sampler Support)
 DEFAULT_MIN_P = 0.05                   # Min probability threshold (0.0 disables)
@@ -185,6 +186,46 @@ TTS_PARAM_MAX_TOP_P = 1.0              # MAX 1.0 disables top_p
 TTS_PARAM_MIN_REPETITION_PENALTY = 1.0 # 1.0 = no penalty
 TTS_PARAM_MAX_REPETITION_PENALTY = 2.0 # Higher values too restrictive MAX 2
 # ============================================================================
 # BATCH PROCESSING SETTINGS
 # ============================================================================

 # ============================================================================
 # WORKER AND PERFORMANCE SETTINGS
 # ============================================================================
+MAX_WORKERS = 2
 TEST_MAX_WORKERS = 6                  # For experimentation
 USE_DYNAMIC_WORKERS = False           # Toggle for testing
 VRAM_SAFETY_THRESHOLD = 6.5           # GB
 DEFAULT_EXAGGERATION = 0.5
 DEFAULT_CFG_WEIGHT = 0.5
 DEFAULT_TEMPERATURE = 0.85
+DEFAULT_SEED = 0 # Random seed for generation. 0 means random.
 # Advanced Sampling Parameters (Min_P Sampler Support)
 DEFAULT_MIN_P = 0.05                   # Min probability threshold (0.0 disables)
 TTS_PARAM_MIN_REPETITION_PENALTY = 1.0 # 1.0 = no penalty
 TTS_PARAM_MAX_REPETITION_PENALTY = 2.0 # Higher values too restrictive MAX 2
+# ============================================================================
+# TTS_PRESETS
+# ============================================================================
# Named bundles of TTS/sentiment settings that the UI can load in one click.
# Every preset defines the same set of keys so consumers can index any preset
# directly instead of each carrying its own fallback values.
TTS_PRESETS = {
    "Narration": {
        "exaggeration": 0.5,
        "cfg_weight": 0.5,
        "temperature": 0.85,
        "min_p": 0.05,
        "top_p": 1.0,
        "repetition_penalty": 1.2,
        "vader_enabled": True,  # Default to VADER on for nuanced presets
        "sentiment_smoothing": True,
        "smoothing_window": 3,
        "smoothing_method": "rolling",
    },
    "Expressive": {
        "exaggeration": 0.65,
        "cfg_weight": 0.7,
        "temperature": 0.95,
        "min_p": 0.05,
        "top_p": 1.0,
        "repetition_penalty": 1.2,
        "vader_enabled": True,
        "sentiment_smoothing": True,
        "smoothing_window": 3,
        "smoothing_method": "rolling",
    },
    "Exposition": {
        "exaggeration": 0.4,
        "cfg_weight": 0.6,
        "temperature": 0.75,
        "min_p": 0.05,
        "top_p": 1.0,
        "repetition_penalty": 1.2,
        "vader_enabled": False,  # VADER off for consistent, clear delivery
        "sentiment_smoothing": False,
        # Smoothing is switched off for this preset; these two keys are kept
        # only so the schema matches the other presets. The values are the
        # same ones the preset loader falls back to when the keys are absent.
        "smoothing_window": 3,
        "smoothing_method": "rolling",
    },
}
 # ============================================================================
 # BATCH PROCESSING SETTINGS
 # ============================================================================

gradio_tabs/tab1_convert_book.py CHANGED Viewed

@@ -355,6 +355,17 @@ def create_convert_book_tab():
             with gr.Column(scale=1):
                 gr.Markdown("### ⚙️ Quick Settings")
                 # VADER and ASR
                 vader_enabled = gr.Checkbox(
                     label="Use VADER sentiment analysis",
@@ -545,6 +556,14 @@ def create_convert_book_tab():
                     value=16, # Default value
                     info="Number of chunks to process simultaneously when VADER is disabled for speed."
                 )
         # Action Buttons and Status
         with gr.Row():
@@ -792,7 +811,7 @@ def create_convert_book_tab():
                         sentiment_smooth_val, smooth_window_val, smooth_method_val,
                         mfcc_val, output_val, spectral_thresh_val, output_thresh_val,
                         exag_val, cfg_val, temp_val, min_p_val, top_p_val, rep_penalty_val,
-                        tts_batch_size_val):
         """Start the actual book conversion - file upload version"""
         # Validation
@@ -896,7 +915,8 @@ def create_convert_book_tab():
             'asr_enabled': asr_val,
             'asr_config': asr_config,
             'add_to_batch': add_to_batch_val,
-            'tts_batch_size': tts_batch_size_val
         }
         # Set conversion state
@@ -987,7 +1007,7 @@ def create_convert_book_tab():
             sentiment_smoothing, smoothing_window, smoothing_method,
             mfcc_validation, output_validation, spectral_threshold, output_threshold,
             exaggeration, cfg_weight, temperature, min_p, top_p, repetition_penalty,
-            tts_batch_size
         ],
         outputs=[status_display, progress_display, audio_player, audiobook_selector, m4b_file_selector]
     )
@@ -1019,6 +1039,44 @@ def create_convert_book_tab():
         inputs=[m4b_file_selector, playback_speed],
         outputs=[status_display, audio_player, audiobook_selector, m4b_file_selector]
     )
     # Progress monitoring with file-based approach
     def get_current_stats():

             with gr.Column(scale=1):
                 gr.Markdown("### ⚙️ Quick Settings")
+                # NEW: Presets
+                with gr.Row():
+                    preset_dropdown = gr.Dropdown(
+                        label="Load Preset",
+                        choices=list(TTS_PRESETS.keys()),
+                        value="Narration",
+                        interactive=True,
+                        info="Apply predefined TTS parameter settings."
+                    )
+                    apply_preset_btn = gr.Button("Apply Preset", size="sm", variant="secondary")
                 # VADER and ASR
                 vader_enabled = gr.Checkbox(
                     label="Use VADER sentiment analysis",
                     value=16, # Default value
                     info="Number of chunks to process simultaneously when VADER is disabled for speed."
                 )
+                # NEW: Random Seed
+                seed = gr.Number(
+                    label="Random Seed (0 for random)",
+                    minimum=0, maximum=999999999, step=1,
+                    value=0, # Default value
+                    info="Set a seed for reproducible generation. 0 means random."
+                )
         # Action Buttons and Status
         with gr.Row():
                         sentiment_smooth_val, smooth_window_val, smooth_method_val,
                         mfcc_val, output_val, spectral_thresh_val, output_thresh_val,
                         exag_val, cfg_val, temp_val, min_p_val, top_p_val, rep_penalty_val,
+                        tts_batch_size_val, seed_val):
         """Start the actual book conversion - file upload version"""
         # Validation
             'asr_enabled': asr_val,
             'asr_config': asr_config,
             'add_to_batch': add_to_batch_val,
+            'tts_batch_size': tts_batch_size_val,
+            'seed': seed_val
         }
         # Set conversion state
             sentiment_smoothing, smoothing_window, smoothing_method,
             mfcc_validation, output_validation, spectral_threshold, output_threshold,
             exaggeration, cfg_weight, temperature, min_p, top_p, repetition_penalty,
+            tts_batch_size, seed
         ],
         outputs=[status_display, progress_display, audio_player, audiobook_selector, m4b_file_selector]
     )
         inputs=[m4b_file_selector, playback_speed],
         outputs=[status_display, audio_player, audiobook_selector, m4b_file_selector]
     )
+    # NEW: Apply Preset Function
+    def apply_preset(preset_name):
+        if preset_name not in TTS_PRESETS:
+            return gr.update() # No change if preset not found
+        preset = TTS_PRESETS[preset_name]
+        return (
+            gr.update(value=preset.get("vader_enabled", True)),
+            gr.update(value=preset.get("sentiment_smoothing", True)),
+            gr.update(value=preset.get("smoothing_window", 3)),
+            gr.update(value=preset.get("smoothing_method", "rolling")),
+            gr.update(value=preset.get("exaggeration", DEFAULT_EXAGGERATION)),
+            gr.update(value=preset.get("cfg_weight", DEFAULT_CFG_WEIGHT)),
+            gr.update(value=preset.get("temperature", DEFAULT_TEMPERATURE)),
+            gr.update(value=preset.get("min_p", DEFAULT_MIN_P)),
+            gr.update(value=preset.get("top_p", DEFAULT_TOP_P)),
+            gr.update(value=preset.get("repetition_penalty", DEFAULT_REPETITION_PENALTY)),
+        )
+    # Connect apply_preset_btn
+    apply_preset_btn.click(
+        apply_preset,
+        inputs=[preset_dropdown],
+        outputs=[
+            vader_enabled,
+            sentiment_smoothing,
+            smoothing_window,
+            smoothing_method,
+            exaggeration,
+            cfg_weight,
+            temperature,
+            min_p,
+            top_p,
+            repetition_penalty,
+        ]
+    )
     # Progress monitoring with file-based approach
     def get_current_stats():

modules/tts_engine.py CHANGED Viewed

@@ -64,6 +64,27 @@ YELLOW = '\033[93m'
 CYAN = '\033[96m'
 RESET = '\033[0m'
 # ============================================================================
 # MEMORY AND MODEL MANAGEMENT
 # ============================================================================
@@ -234,9 +255,11 @@ def process_batch(
     batch, text_chunks_dir, audio_chunks_dir,
     voice_path, tts_params, start_time, total_chunks,
     punc_norm, basename, log_run_func, log_path, device,
-    model, asr_model, all_chunks,
     enable_asr=None
 ):
     """
     Process a batch of chunks using the batch-enabled TTS model.
     """
@@ -309,9 +332,11 @@ def process_one_chunk(
     i, chunk, text_chunks_dir, audio_chunks_dir,
     voice_path, tts_params, start_time, total_chunks,
     punc_norm, basename, log_run_func, log_path, device,
-    model, asr_model, boundary_type="none",
     enable_asr=None
 ):
     """Enhanced chunk processing with quality control, contextual silence, and deep cleanup"""
     import difflib
     from pydub import AudioSegment

 CYAN = '\033[96m'
 RESET = '\033[0m'
+import random
+import numpy as np
+import torch
def set_seed(seed_value: int):
    """
    Sets the seed for torch, random, and numpy for reproducibility.

    The chunk/batch processors in this module call it only when a non-zero
    seed is provided (0 is treated as "random" by those callers).

    Args:
        seed_value: Seed to apply. Integral floats such as ``42.0`` (what a
            UI number field may deliver) are accepted and converted to int.

    Raises:
        TypeError / ValueError: If ``seed_value`` cannot be converted to int.
    """
    # numpy's legacy seeder raises TypeError on floats, so normalise first.
    seed_value = int(seed_value)

    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  # if using multi-GPU

    # Neither torch.backends.mps nor the torch.mps helpers exist in every
    # torch release, so probe for them instead of assuming they are there.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        mps_module = getattr(torch, "mps", None)
        if hasattr(mps_module, "manual_seed"):
            mps_module.manual_seed(seed_value)

    random.seed(seed_value)
    # np.random.seed only accepts values in [0, 2**32); fold larger or
    # negative seeds into that range rather than raising.
    np.random.seed(seed_value % (2 ** 32))
    logging.info(f"Global seed set to: {seed_value}")
 # ============================================================================
 # MEMORY AND MODEL MANAGEMENT
 # ============================================================================
     batch, text_chunks_dir, audio_chunks_dir,
     voice_path, tts_params, start_time, total_chunks,
     punc_norm, basename, log_run_func, log_path, device,
+    model, asr_model, seed=0,
     enable_asr=None
 ):
+    if seed != 0:
+        set_seed(seed)
     """
     Process a batch of chunks using the batch-enabled TTS model.
     """
     i, chunk, text_chunks_dir, audio_chunks_dir,
     voice_path, tts_params, start_time, total_chunks,
     punc_norm, basename, log_run_func, log_path, device,
+    model, asr_model, seed=0, boundary_type="none",
     enable_asr=None
 ):
+    if seed != 0:
+        set_seed(seed)
     """Enhanced chunk processing with quality control, contextual silence, and deep cleanup"""
     import difflib
     from pydub import AudioSegment

src/chatterbox/tts.py CHANGED Viewed

@@ -294,6 +294,7 @@ class ChatterboxTTS:
         min_p=0.05,
         top_p=0.8,
         repetition_penalty=2.0,
     ):
         if audio_prompt_path:
             self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
@@ -360,6 +361,7 @@ class ChatterboxTTS:
         min_p=0.05,
         top_p=0.8,
         repetition_penalty=2.0,
     ):
         if audio_prompt_path:
             self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)

         min_p=0.05,
         top_p=0.8,
         repetition_penalty=2.0,
+        seed=0,
     ):
         if audio_prompt_path:
             self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
         min_p=0.05,
         top_p=0.8,
         repetition_penalty=2.0,
+        seed=0,
     ):
         if audio_prompt_path:
             self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)