import random
import numpy as np
import torch
from chatterbox.src.chatterbox.tts import ChatterboxTTS
import gradio as gr
import spaces
import os
import re
import torchaudio
import threading
import time
from queue import Queue
from dataclasses import dataclass
from typing import Optional, Callable

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Running on device: {DEVICE}")

# Directory for saving audio files
OUTPUT_DIR = "generated_audio"
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Global variables for tracking sections and paragraphs
SECTION_INFO = []  # Will contain (section_number, paragraph_number, text) tuples
GENERATION_COUNTS = {}  # Track generation count for each paragraph (keyed by paragraph index)

# --- Global Model Initialization ---
MODEL = None


def get_or_load_model():
    """Load the ChatterboxTTS model lazily (once) and ensure it sits on DEVICE.

    Returns:
        The shared ChatterboxTTS model instance.

    Raises:
        Exception: re-raises whatever ``ChatterboxTTS.from_pretrained`` raises.
    """
    global MODEL
    if MODEL is None:
        print("Model not loaded, initializing...")
        try:
            MODEL = ChatterboxTTS.from_pretrained(DEVICE)
            # Some model objects track their own device; move if it disagrees.
            if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
                MODEL.to(DEVICE)
            print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise
    return MODEL


# Attempt to load the model at startup so the first request is fast.
try:
    get_or_load_model()
except Exception as e:
    print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")


def set_seed(seed: int):
    """Set the random seed for reproducibility across torch, numpy, and random."""
    torch.manual_seed(seed)
    if DEVICE == "cuda":
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)


def count_words(text):
    """Count the number of whitespace-separated words in a text string."""
    return len([word for word in text.split() if word.strip()])


def trim_audio(audio_data, start_time, end_time):
    """
    Trim audio data between start_time and end_time (in seconds).

    Args:
        audio_data: Tuple of (sample_rate, audio_array), or None.
        start_time: Start time in seconds.
        end_time: End time in seconds (None means end of audio).

    Returns:
        Trimmed audio data in the same (sample_rate, array) format, or None
        when there is no audio or the trim window is empty/invalid.
    """
    if audio_data is None:
        return None

    sr, audio_array = audio_data

    # Convert to numpy if needed (.cpu() in case the tensor lives on GPU).
    if isinstance(audio_array, torch.Tensor):
        audio_array = audio_array.cpu().numpy()

    # Calculate sample indices
    start_sample = int(start_time * sr)
    end_sample = int(end_time * sr) if end_time is not None else len(audio_array)

    # Clamp to valid bounds
    start_sample = max(0, start_sample)
    end_sample = min(len(audio_array), end_sample)

    if start_sample >= end_sample:
        return None

    trimmed_audio = audio_array[start_sample:end_sample]
    return (sr, trimmed_audio)


def detect_silence_boundaries(audio_data, silence_threshold=0.01, min_silence_duration=0.1):
    """
    Detect silence at the beginning and end of audio to suggest trim points.

    Args:
        audio_data: Tuple of (sample_rate, audio_array), or None.
        silence_threshold: Amplitude below which audio is considered silence.
        min_silence_duration: Minimum duration of silence to consider (seconds).

    Returns:
        (suggested_start, suggested_end) in seconds; (0, 0) when audio_data is None.
    """
    if audio_data is None:
        return 0, 0

    sr, audio_array = audio_data
    if isinstance(audio_array, torch.Tensor):
        audio_array = audio_array.cpu().numpy()

    audio_abs = np.abs(audio_array)
    min_silence_samples = int(min_silence_duration * sr)

    # Find start of audio content: first window containing a loud sample.
    start_idx = 0
    for i in range(len(audio_abs) - min_silence_samples):
        if np.max(audio_abs[i:i + min_silence_samples]) > silence_threshold:
            start_idx = max(0, i - int(0.05 * sr))  # Add 50ms buffer
            break

    # Find end of audio content: scan backwards for the last loud window.
    end_idx = len(audio_abs)
    for i in range(len(audio_abs) - min_silence_samples - 1, min_silence_samples, -1):
        if np.max(audio_abs[i - min_silence_samples:i]) > silence_threshold:
            end_idx = min(len(audio_abs), i + int(0.05 * sr))  # Add 50ms buffer
            break

    return start_idx / sr, end_idx / sr


def merge_audio_files(audio_data_list, crossfade_duration=0.1, pause_duration=0.0):
    """
    Merge multiple audio clips into one with optional crossfading and pauses.

    Crossfade takes precedence over pause when both are non-zero and the
    clips are long enough to overlap.

    Args:
        audio_data_list: List of (sample_rate, audio_array) tuples or None values.
        crossfade_duration: Duration of crossfade between clips in seconds.
        pause_duration: Duration of silence to insert between clips in seconds.

    Returns:
        Merged audio as a (sample_rate, audio_array) tuple, or None when no
        valid clips are supplied.
    """
    valid_audio = []
    sample_rate = None

    print(f"Processing {len(audio_data_list)} audio clips for merging...")
    for i, audio_data in enumerate(audio_data_list):
        if audio_data is not None:
            sr, audio_array = audio_data
            if sample_rate is None:
                sample_rate = sr
            elif sample_rate != sr:
                # Mismatched clips are skipped rather than resampled.
                print(f"Warning: Sample rate mismatch. Expected {sample_rate}, got {sr}")
                continue

            if isinstance(audio_array, torch.Tensor):
                audio_array = audio_array.cpu().numpy()

            # Convert to float32 for processing to avoid casting errors
            if audio_array.dtype != np.float32:
                audio_array = audio_array.astype(np.float32)

            duration = len(audio_array) / sr
            print(f"Audio clip {i}: {duration:.2f} seconds, {len(audio_array)} samples")
            valid_audio.append(audio_array)

    if not valid_audio:
        print("No valid audio found")
        return None

    if len(valid_audio) == 1:
        print("Only one audio clip, returning as-is")
        return (sample_rate, valid_audio[0])

    print(f"Merging {len(valid_audio)} audio clips with {crossfade_duration}s crossfade and {pause_duration}s pause")

    crossfade_samples = int(crossfade_duration * sample_rate)
    pause_samples = int(pause_duration * sample_rate)
    print(f"Crossfade samples: {crossfade_samples}, Pause samples: {pause_samples}")

    # Start with the first audio clip and append the rest one by one.
    merged_audio = valid_audio[0].copy()
    print(f"Starting with clip 0: {len(merged_audio)} samples")

    for i in range(1, len(valid_audio)):
        current_clip = valid_audio[i]
        print(f"Merging clip {i}: {len(current_clip)} samples")

        # If we have both crossfade and pause, crossfade takes precedence
        if crossfade_samples > 0 and len(merged_audio) >= crossfade_samples and len(current_clip) >= crossfade_samples:
            print(f"Applying crossfade between clips")
            # Fade out the end of merged_audio
            fade_out = np.linspace(1.0, 0.0, crossfade_samples, dtype=np.float32)
            merged_audio[-crossfade_samples:] *= fade_out
            # Fade in the beginning of current_clip
            fade_in = np.linspace(0.0, 1.0, crossfade_samples, dtype=np.float32)
            current_clip_faded = current_clip.copy()
            current_clip_faded[:crossfade_samples] *= fade_in
            # Overlap the crossfade region, then append the rest of the clip.
            merged_audio[-crossfade_samples:] += current_clip_faded[:crossfade_samples]
            merged_audio = np.concatenate([merged_audio, current_clip[crossfade_samples:]])
        elif pause_samples > 0:
            print(f"Adding {pause_duration}s pause between clips")
            silence = np.zeros(pause_samples, dtype=np.float32)
            merged_audio = np.concatenate([merged_audio, silence, current_clip])
        else:
            print(f"No crossfade or pause, concatenating directly")
            merged_audio = np.concatenate([merged_audio, current_clip])

        print(f"Merged audio now: {len(merged_audio)} samples ({len(merged_audio)/sample_rate:.2f} seconds)")

    final_duration = len(merged_audio) / sample_rate
    print(f"Final merged audio: {len(merged_audio)} samples ({final_duration:.2f} seconds)")
    return (sample_rate, merged_audio)


def save_merged_audio(merged_audio_data, section_filter=None):
    """
    Save merged audio with appropriate naming.

    Args:
        merged_audio_data: (sample_rate, audio_array) tuple, or None.
        section_filter: Optional section number the merge was limited to,
            or None when all sections were merged.

    Returns:
        Filepath of the saved merged audio, or None when there is no audio.
    """
    global OUTPUT_DIR, SECTION_INFO

    if merged_audio_data is None:
        return None

    # Create filename based on what was merged
    if section_filter is not None:
        filename = f"merged_section_{section_filter}.wav"
    else:
        filename = "merged_all_sections.wav"
    filepath = os.path.join(OUTPUT_DIR, filename)

    sr, audio_array = merged_audio_data
    if isinstance(audio_array, np.ndarray):
        audio_tensor = torch.tensor(audio_array)
    else:
        audio_tensor = audio_array
    # torchaudio.save expects (channels, samples).
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)
    torchaudio.save(filepath, audio_tensor, sr)
    print(f"Saved merged audio to {filepath}")
    return filepath


def merge_all_generated_audio(crossfade_duration, pause_duration, *audio_inputs):
    """
    Merge all generated audio clips into one file.

    When SECTION_INFO is available the clips are ordered by section number,
    then paragraph number; otherwise input order is used.

    Returns:
        (merged_audio, status_message) — merged_audio is None on failure.
    """
    global SECTION_INFO

    # Collect all non-None audio data regardless of section info
    audio_data_list = []
    merged_count = 0

    print(f"Checking {len(audio_inputs)} audio inputs for merging...")
    for idx, audio_data in enumerate(audio_inputs):
        if audio_data is not None:
            print(f"Found audio at index {idx}")
            audio_data_list.append(audio_data)
            merged_count += 1
        else:
            print(f"No audio at index {idx}")

    print(f"Total audio files found: {merged_count}")
    if not audio_data_list:
        return None, "No audio files to merge."

    # If we have section info, try to organize by section order
    if SECTION_INFO:
        organized_audio = []
        section_audio_map = {}

        # First, map audio to their corresponding sections
        for idx, (section_num, para_num, text) in enumerate(SECTION_INFO):
            if idx < len(audio_inputs) and audio_inputs[idx] is not None:
                if section_num not in section_audio_map:
                    section_audio_map[section_num] = []
                section_audio_map[section_num].append((para_num, audio_inputs[idx]))

        # Now organize by section number, then paragraph number
        for section_num in sorted(section_audio_map.keys()):
            section_paragraphs = sorted(section_audio_map[section_num], key=lambda x: x[0])
            for para_num, audio_data in section_paragraphs:
                organized_audio.append(audio_data)
                print(f"Added Section {section_num}, Paragraph {para_num} to merge list")

        if organized_audio:
            audio_data_list = organized_audio
            print(f"Organized {len(audio_data_list)} audio files by section order")

    merged_audio = merge_audio_files(audio_data_list,
                                     crossfade_duration=crossfade_duration,
                                     pause_duration=pause_duration)
    if merged_audio is None:
        return None, "Failed to merge audio files."

    # Calculate total duration for verification
    sr, merged_array = merged_audio
    duration_seconds = len(merged_array) / sr

    filepath = save_merged_audio(merged_audio)
    return merged_audio, f"Merged {merged_count} audio files. Total duration: {duration_seconds:.1f} seconds. Saved to {filepath}"


def merge_by_section(section_number, crossfade_duration, *audio_inputs):
    """
    Merge audio clips belonging to a specific section.

    Returns:
        (merged_audio, status_message) — merged_audio is None on failure.
    """
    global SECTION_INFO

    if not SECTION_INFO:
        return None, "No paragraphs processed."

    # Collect audio data for the specified section
    audio_data_list = []
    merged_count = 0
    for idx, (section_num, para_num, text) in enumerate(SECTION_INFO):
        if section_num == section_number and idx < len(audio_inputs) and audio_inputs[idx] is not None:
            audio_data_list.append(audio_inputs[idx])
            merged_count += 1

    if not audio_data_list:
        return None, f"No audio files found for section {section_number}."

    merged_audio = merge_audio_files(audio_data_list, crossfade_duration=crossfade_duration)
    if merged_audio is None:
        return None, f"Failed to merge audio files for section {section_number}."

    filepath = save_merged_audio(merged_audio, section_filter=section_number)
    return merged_audio, f"Merged {merged_count} audio files from section {section_number}. Saved to {filepath}"


def save_trimmed_audio(audio_data, index, is_trimmed=False):
    """
    Save audio data to a file with appropriate naming.

    If is_trimmed=True, adds "_trimmed" to the filename. Unlike
    save_audio_file, this does NOT increment the generation counter.
    """
    global GENERATION_COUNTS, OUTPUT_DIR, SECTION_INFO

    if audio_data is None:
        return None

    # Get section and paragraph information (fallback when index is unknown).
    if index < len(SECTION_INFO):
        section_num, para_num, _ = SECTION_INFO[index]
    else:
        section_num = 1
        para_num = index + 1

    gen_count = GENERATION_COUNTS.get(index, 1)

    if is_trimmed:
        filename = f"{section_num}_{para_num}_{gen_count}_trimmed.wav"
    else:
        filename = f"{section_num}_{para_num}_{gen_count}.wav"
    filepath = os.path.join(OUTPUT_DIR, filename)

    sr, audio_array = audio_data
    if isinstance(audio_array, np.ndarray):
        audio_tensor = torch.tensor(audio_array)
    else:
        audio_tensor = audio_array
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)
    torchaudio.save(filepath, audio_tensor, sr)
    print(f"Saved audio to {filepath}")
    return filepath


def split_text_into_sections_and_paragraphs(text):
    """
    Split input text into sections and paragraphs.

    A section is identified by a line that starts with a digit (the first
    character is used as the section number). The line can contain additional
    text after the digit (e.g., "1 Introduction", "2. Chapter Two",
    "3 - Main Content"). Each non-empty line after that becomes a separate
    paragraph in that section until a new section is found. If no section is
    specified at the beginning, section 1 is the default. Empty lines are
    skipped. Anything in square brackets [] is treated as comments and
    filtered out.

    NOTE(review): only the FIRST character is used, so section numbers above
    9 are not supported — "12 Chapter" becomes section 1.

    Returns:
        - List of (section_number, paragraph_number, text) tuples
        - Total word count
    """
    global SECTION_INFO

    # Reset section info
    SECTION_INFO = []

    if not text.strip():
        return SECTION_INFO, 0

    lines = text.strip().split('\n')

    current_section = None
    current_para_in_section = 1
    total_word_count = 0

    for line in lines:
        line = line.rstrip()

        # Skip empty lines
        if not line.strip():
            continue

        # Remove anything in square brackets (comments)
        cleaned_line = re.sub(r'\[.*?\]', '', line).strip()

        # Skip lines that become empty after removing comments
        if not cleaned_line:
            continue

        # A leading digit marks a section header line.
        if cleaned_line[0].isdigit():
            section_num = int(cleaned_line[0])
            current_section = section_num
            current_para_in_section = 1
            print(f"Found section marker: '{cleaned_line}' -> Section {section_num}")
        else:
            # Text line: one complete paragraph. Default to section 1 if no
            # section header has been seen yet.
            if current_section is None:
                current_section = 1
            paragraph_text = cleaned_line
            SECTION_INFO.append((current_section, current_para_in_section, paragraph_text))
            total_word_count += count_words(paragraph_text)
            current_para_in_section += 1

    return SECTION_INFO, total_word_count


def save_audio_file(audio_data, index):
    """
    Save audio data to a file named {Section}_{Paragraph}_{Generation}.wav,
    incrementing the generation counter for this paragraph.
    """
    global GENERATION_COUNTS, OUTPUT_DIR, SECTION_INFO

    if audio_data is None:
        return None

    # Get section and paragraph information
    if index < len(SECTION_INFO):
        section_num, para_num, _ = SECTION_INFO[index]
    else:
        # Fallback if index is out of bounds
        section_num = 1
        para_num = index + 1

    # Increment generation count for this paragraph
    GENERATION_COUNTS[index] = GENERATION_COUNTS.get(index, 0) + 1
    gen_count = GENERATION_COUNTS[index]

    filename = f"{section_num}_{para_num}_{gen_count}.wav"
    filepath = os.path.join(OUTPUT_DIR, filename)

    sr, audio_array = audio_data
    # Ensure audio_array is properly formatted for torchaudio.save
    if isinstance(audio_array, np.ndarray):
        audio_tensor = torch.tensor(audio_array)
    else:
        audio_tensor = audio_array
    # Add channel dimension if needed
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)
    torchaudio.save(filepath, audio_tensor, sr)
    print(f"Saved audio to {filepath}")
    return filepath


def generate_single_in_batch(text, audio_prompt_path, exaggeration, temperature, seed_num, cfgw, paragraph_index=None):
    """Generate audio for one paragraph during batch processing.

    Not decorated with @spaces.GPU: it runs inside generate_all_sequential's
    GPU allocation. No model parameter to avoid pickling issues.

    Returns:
        (sample_rate, waveform) tuple, or None for empty text.
    """
    model = get_or_load_model()  # Get model inside the function

    if not text.strip():
        return None  # single sentinel so callers' `if audio:` checks work

    # Seed 0 means "pick a random seed"; anything else is reproducible.
    if seed_num != 0:
        actual_seed = int(seed_num)
    else:
        actual_seed = random.randint(0, 2**32 - 1)
    print(f"Using seed: {actual_seed} for paragraph {paragraph_index}, exaggeration: {exaggeration}")
    set_seed(actual_seed)

    generate_kwargs = {
        "exaggeration": exaggeration,
        "temperature": temperature,
        "cfg_weight": cfgw,
    }
    if audio_prompt_path:
        generate_kwargs["audio_prompt_path"] = audio_prompt_path

    wav = model.generate(
        text[:1000],  # Truncate text to max chars
        **generate_kwargs
    )
    # .detach().cpu() so .numpy() also works when the model runs on CUDA.
    audio_data = (model.sr, wav.squeeze(0).detach().cpu().numpy())

    # Save the audio file if paragraph_index is provided
    if paragraph_index is not None:
        save_audio_file(audio_data, paragraph_index)

    return audio_data


@spaces.GPU
def generate_single_internal(text, audio_prompt_path, exaggeration, temperature, seed_num, cfgw, paragraph_index=None):
    """Generate audio for a single paragraph (per-paragraph button handler).

    No model parameter to avoid pickling issues with the @spaces.GPU wrapper.

    Returns:
        (sample_rate, waveform) tuple, or None for empty text.
    """
    model = get_or_load_model()  # Get model inside the function

    if not text.strip():
        return None  # single sentinel so callers' `if audio:` checks work

    if seed_num != 0:
        actual_seed = int(seed_num)
    else:
        actual_seed = random.randint(0, 2**32 - 1)
    print(f"Using seed: {actual_seed} for paragraph {paragraph_index}, exaggeration: {exaggeration}")
    set_seed(actual_seed)

    generate_kwargs = {
        "exaggeration": exaggeration,
        "temperature": temperature,
        "cfg_weight": cfgw,
    }
    if audio_prompt_path:
        generate_kwargs["audio_prompt_path"] = audio_prompt_path

    wav = model.generate(
        text[:500],  # Truncate text to max chars
        **generate_kwargs
    )
    audio_data = (model.sr, wav.squeeze(0).detach().cpu().numpy())

    # Save the audio file if paragraph_index is provided
    if paragraph_index is not None:
        save_audio_file(audio_data, paragraph_index)

    return audio_data


@spaces.GPU
def generate_tts_audio(
    text_input: str,
    audio_prompt_path_input: str = None,
    exaggeration_input: float = 0.5,
    temperature_input: float = 0.8,
    seed_num_input: int = 0,
    cfgw_input: float = 0.6
) -> tuple[int, np.ndarray]:
    """
    Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.

    This tool synthesizes natural-sounding speech from input text. When a reference audio file
    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
    maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no
    reference is provided.

    Args:
        text_input (str): The text to synthesize into speech (maximum 300 characters)
        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.6.

    Returns:
        tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
    """
    current_model = get_or_load_model()
    if current_model is None:
        raise RuntimeError("TTS model is not loaded.")

    if seed_num_input != 0:
        set_seed(int(seed_num_input))

    print(f"Generating audio for text: '{text_input[:50]}...'")

    # Handle optional audio prompt
    generate_kwargs = {
        "exaggeration": exaggeration_input,
        "temperature": temperature_input,
        "cfg_weight": cfgw_input,
    }
    if audio_prompt_path_input:
        generate_kwargs["audio_prompt_path"] = audio_prompt_path_input

    wav = current_model.generate(
        text_input[:300],  # Truncate text to max chars
        **generate_kwargs
    )
    print("Audio generation complete.")
    return (current_model.sr, wav.squeeze(0).detach().cpu().numpy())


@spaces.GPU(duration=600)
def generate_all_sequential(ref_wav_path, temperature, seed_num, cfgw, *exaggeration_values):
    """Generate audio for all paragraphs sequentially using individual exaggeration values.

    Returns:
        [status_text] + one audio result (or None) per paragraph slot, so the
        list length always matches the 1 + MAX_PARAGRAPHS wired outputs.
    """
    global SECTION_INFO

    MAX_PARAGRAPHS = 50
    if not SECTION_INFO:
        # Must match the wired output count: status + MAX_PARAGRAPHS audios.
        return ["No paragraphs to process. Please process a script first."] + [None] * MAX_PARAGRAPHS

    status_messages = []
    # Pre-fill every slot with None so failed/skipped paragraphs stay empty.
    audio_results = [None] * MAX_PARAGRAPHS

    for idx, (section_num, para_num, text) in enumerate(SECTION_INFO):
        if text.strip():
            # Get the exaggeration value for this paragraph (default 0.35).
            exaggeration = exaggeration_values[idx] if idx < len(exaggeration_values) else 0.35
            print(f"Generating Section {section_num}, Paragraph {para_num} [{idx+1}/{len(SECTION_INFO)}] with exaggeration {exaggeration}...")
            status_messages.append(f"Processing Section {section_num}, Paragraph {para_num} (exaggeration: {exaggeration})...")

            audio = generate_single_in_batch(text, ref_wav_path, exaggeration, temperature,
                                             seed_num, cfgw, paragraph_index=idx)
            if audio:
                status_messages.append(f"✓ Generated audio for Section {section_num}, Paragraph {para_num}")
                audio_results[idx] = audio
            else:
                status_messages.append(f"✗ Failed to generate audio for Section {section_num}, Paragraph {para_num}")

    final_status = "\n".join(status_messages) + f"\n\nCompleted processing {len(SECTION_INFO)} paragraphs!"
    # Return status plus all audio results
    return [final_status] + audio_results


def apply_trim_and_save(audio_data, start_time, end_time, paragraph_index):
    """Apply trimming and save the trimmed audio.

    Returns:
        (trimmed_audio_or_None, status_message)
    """
    if audio_data is None:
        return None, "No audio to trim"
    try:
        trimmed_audio = trim_audio(audio_data, start_time, end_time)
        if trimmed_audio is None:
            return None, "Invalid trim parameters"
        # Save trimmed version
        filepath = save_trimmed_audio(trimmed_audio, paragraph_index, is_trimmed=True)
        return trimmed_audio, f"Trimmed and saved to {filepath}"
    except Exception as e:
        return None, f"Error trimming audio: {str(e)}"


def update_paragraph_ui(script_text):
    """
    Process the input script and produce gr.update() values for every
    paragraph text field, row, button, audio slot, label, and slider,
    followed by the paragraph-count and section-summary markdown updates.
    """
    global SECTION_INFO, GENERATION_COUNTS

    # Reset generation counts
    GENERATION_COUNTS = {}

    section_info, word_count = split_text_into_sections_and_paragraphs(script_text)

    # Initialize generation count for each paragraph
    for i in range(len(section_info)):
        GENERATION_COUNTS[i] = 0

    MAX_PARAGRAPHS = 50
    text_updates = []
    row_updates = []
    button_updates = []
    audio_updates = []
    label_updates = []
    exaggeration_updates = []

    for i in range(MAX_PARAGRAPHS):
        if i < len(section_info):
            section_num, para_num, text = section_info[i]
            text_updates.append(gr.update(value=text, visible=True))
            row_updates.append(gr.update(visible=True))
            button_updates.append(gr.update(visible=True))
            audio_updates.append(gr.update(visible=True))
            label_updates.append(gr.update(value=f"Section {section_num}, Paragraph {para_num} ({count_words(text)} words)", visible=True))
            exaggeration_updates.append(gr.update(value=0.35, visible=True))
        else:
            # Hide unused slots.
            text_updates.append(gr.update(value="", visible=False))
            row_updates.append(gr.update(visible=False))
            button_updates.append(gr.update(visible=False))
            audio_updates.append(gr.update(visible=False))
            label_updates.append(gr.update(value="", visible=False))
            exaggeration_updates.append(gr.update(value=0.35, visible=False))

    # Update paragraph count and word count
    count_update = gr.update(value=f"Total Paragraphs: {len(section_info)} | Total Words: {word_count}")

    # Generate a summary of sections and paragraph counts
    section_counts = {}
    section_word_counts = {}
    for section_num, para_num, text in section_info:
        section_counts[section_num] = max(section_counts.get(section_num, 0), para_num)
        section_word_counts[section_num] = section_word_counts.get(section_num, 0) + count_words(text)
    section_summary = "Sections: " + ", ".join([f"Section {s}: {c} paragraphs ({section_word_counts[s]} words)" for s, c in sorted(section_counts.items())])
    summary_update = gr.update(value=section_summary)

    # Order must match the outputs wired to process_btn.click.
    return (text_updates + row_updates + button_updates + audio_updates +
            label_updates + exaggeration_updates + [count_update, summary_update])


def apply_global_exaggeration(global_value):
    """Apply the same exaggeration value to all visible paragraph sliders."""
    global SECTION_INFO
    updates = []
    for i in range(50):  # MAX_PARAGRAPHS
        if i < len(SECTION_INFO):
            # Update visible sliders with the global value
            updates.append(gr.update(value=global_value))
        else:
            # Keep invisible sliders unchanged
            updates.append(gr.update())
    return updates


with gr.Blocks() as demo:
    # Storage for dynamic components
    paragraph_texts = []
    paragraph_rows = []
    paragraph_labels = []
    generate_buttons = []
    audio_outputs = []
    exaggeration_sliders = []

    gr.Markdown(
        """
        # Chatterbox TTS Demo with Script Processing
        Generate high-quality speech from text with reference audio styling and batch processing capabilities.
        """
    )

    with gr.Tab("Single Generation"):
        with gr.Row():
            with gr.Column():
                text = gr.Textbox(
                    value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
                    label="Text to synthesize (max chars 300)",
                    max_lines=5
                )
                ref_wav = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Reference Audio File (Optional)",
                    value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
                )
                exaggeration = gr.Slider(
                    0.25, 2, step=.05,
                    label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
                    value=.5
                )
                cfg_weight = gr.Slider(
                    0.2, 1, step=.05, label="CFG/Pace", value=0.6
                )

                with gr.Accordion("More options", open=False):
                    seed_num = gr.Number(value=131789919, label="Random seed (0 for random)")
                    temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)

                run_btn = gr.Button("Generate", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(label="Output Audio")

        run_btn.click(
            fn=generate_tts_audio,
            inputs=[
                text,
                ref_wav,
                exaggeration,
                temp,
                seed_num,
                cfg_weight,
            ],
            outputs=[audio_output],
        )

    with gr.Tab("Script Processing"):
        with gr.Row():
            with gr.Column():
                script_input = gr.Textbox(
                    label="Script Input (section headers start with a digit, e.g., '1 Introduction', '2. Chapter Two')",
                    placeholder="Enter your script here. Use lines starting with digits to mark sections (e.g., '1 Introduction', '2. Chapter Two', '3 - Main Content'). The first character determines the section number. Separate paragraphs with blank lines.",
                    lines=10,
                    value="1 Introduction\nThis is the first paragraph of section 1. It contains some interesting content.\n\nThis is the second paragraph of section 1. It's separated by a blank line.\n\n2. Chapter Two\nThis is the first paragraph of section 2. Notice the '2.' above marks the section.\n\nThis is the second paragraph of section 2.\n\n3 - Final Section\nThis is the first paragraph of section 3."
                )
                process_btn = gr.Button("Process Script", variant="primary")
                paragraph_count = gr.Markdown("Total Paragraphs: 0 | Total Words: 0")
                section_summary = gr.Markdown("Sections: None")
                output_dir_info = gr.Markdown(f"Audio files will be saved to: {os.path.abspath(OUTPUT_DIR)}")

        # Generation parameters
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Global Generation Settings")
                script_ref_wav = gr.Audio(sources=["upload", "microphone"],
                                          type="filepath",
                                          label="Reference Audio File",
                                          value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac")
                script_cfg_weight = gr.Slider(0.0, 1, step=.05, label="CFG/Pace", value=0.6)
                with gr.Accordion("More options", open=False):
                    script_seed_num = gr.Number(value=131789919, label="Random seed (0 for random)")
                    script_temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)

                # Global Exaggeration Control
                gr.Markdown("### Global Exaggeration Control")
                global_exaggeration = gr.Slider(
                    0.25, 2, step=0.05,
                    label="Set All Exaggeration Values",
                    value=0.35
                )
                apply_global_exaggeration_btn = gr.Button("Apply to All Paragraphs", variant="secondary")

                generate_all_btn = gr.Button("Generate All Paragraphs", variant="primary", size="lg")
                generation_status = gr.Textbox(label="Generation Status", lines=5, interactive=False)

        # Paragraphs section
        gr.Markdown("## Paragraphs")

        # Create placeholders for paragraph entries
        MAX_PARAGRAPHS = 50
        for i in range(MAX_PARAGRAPHS):
            with gr.Row(visible=False) as row:
                paragraph_rows.append(row)
                with gr.Column(scale=4):
                    paragraph_label = gr.Markdown("Section 1, Paragraph 1", visible=False)
                    paragraph_labels.append(paragraph_label)
                    text_input = gr.Textbox(
                        lines=3,
                        max_lines=5,
                        visible=False
                    )
                    paragraph_texts.append(text_input)
                with gr.Column(scale=2):
                    exaggeration_slider = gr.Slider(
                        0.25, 2, step=0.05,
                        label="Exaggeration",
                        value=0.35,
                        visible=False
                    )
                    exaggeration_sliders.append(exaggeration_slider)
                    generate_btn = gr.Button(f"Generate", visible=False)
                    generate_buttons.append(generate_btn)
                with gr.Column(scale=3):
                    audio_output = gr.Audio(label=f"Generated Audio",
                                            type="numpy",
                                            autoplay=False,
                                            visible=False,
                                            show_download_button=True,
                                            interactive=True)
                    audio_outputs.append(audio_output)

                # Setup individual paragraph generation. The factory binds the
                # paragraph index early so every button targets its own slot.
                def make_generate_handler(idx):
                    def generate_handler(text, ref_wav, exag, temp, seed, cfg):
                        # Call the spaces.GPU function directly without passing model
                        audio = generate_single_internal(text, ref_wav, exag, temp, seed, cfg, idx)
                        return audio
                    return generate_handler

                generate_btn.click(
                    fn=make_generate_handler(i),
                    inputs=[
                        text_input,
                        script_ref_wav,
                        exaggeration_slider,
                        script_temp,
                        script_seed_num,
                        script_cfg_weight,
                    ],
                    outputs=[audio_output],
                )

        # Audio Merging section
        gr.Markdown("## Audio Merging")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Merge Options")
                crossfade_duration = gr.Slider(
                    0.0, 1.0, step=0.05, value=0.0,
                    label="Crossfade Duration (seconds)"
                )
                pause_duration = gr.Slider(0, 2, step=0.1,
                                           label="Pause Between Segments (seconds). Crossfade must be 0",
                                           value=0.3)
                with gr.Row():
                    merge_all_btn = gr.Button("Merge All Audio", variant="primary")
                    section_number_input = gr.Number(
                        value=1, label="Section Number", precision=0
                    )
                    merge_section_btn = gr.Button("Merge Section", variant="secondary")
                merge_status = gr.Textbox(
                    label="Merge Status", lines=2, interactive=False
                )
            with gr.Column():
                merged_audio_output = gr.Audio(
                    label="Merged Audio",
                    type="numpy",
                    show_download_button=True
                )

        # Setup process button to update paragraphs
        process_btn.click(
            fn=update_paragraph_ui,
            inputs=[script_input],
            outputs=(paragraph_texts + paragraph_rows + generate_buttons +
                     audio_outputs + paragraph_labels + exaggeration_sliders +
                     [paragraph_count, section_summary]),
        )

        # Setup generate all button - now with all audio outputs
        generate_all_btn.click(
            fn=generate_all_sequential,
            inputs=[
                script_ref_wav,
                script_temp,
                script_seed_num,
                script_cfg_weight,
            ] + exaggeration_sliders,
            outputs=[generation_status] + audio_outputs,
        )

        # Setup merge all button
        merge_all_btn.click(
            fn=merge_all_generated_audio,
            inputs=[crossfade_duration, pause_duration] + audio_outputs,
            outputs=[merged_audio_output, merge_status]
        )

        # Setup merge section button
        def merge_section_handler(section_num, crossfade_duration, *audio_inputs):
            return merge_by_section(int(section_num), crossfade_duration, *audio_inputs)

        merge_section_btn.click(
            fn=merge_section_handler,
            inputs=[section_number_input, crossfade_duration] + audio_outputs,
            outputs=[merged_audio_output, merge_status]
        )

        # Setup global exaggeration apply button
        apply_global_exaggeration_btn.click(
            fn=apply_global_exaggeration,
            inputs=[global_exaggeration],
            outputs=exaggeration_sliders
        )

demo.launch(mcp_server=True)