Spaces:

bobsackett
/

ChatterboxTTS-DNXS-Spokenwordv1

Sleeping

ChatterboxTTS-DNXS-Spokenwordv1 / modules /tts_engine.py

danneauxs

tts engine.py

7a8491d 8 months ago

48.2 kB

	"""
	TTS Engine Module
	Handles ChatterboxTTS interface, model loading, and chunk processing coordination
	"""

	import torch
	import gc
	import time
	import logging
	import shutil
	import sys
	import numpy as np
	from datetime import timedelta
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from pathlib import Path
	import torchaudio as ta

	from config.config import *
	from modules.text_processor import smart_punctuate, sentence_chunk_text, detect_content_boundaries

	def find_chunks_json_file(book_name):
	    """Locate the chunks JSON file that belongs to ``book_name``.

	    The TTS processing directory under ``AUDIOBOOK_ROOT`` is checked first.
	    If nothing is there, the legacy ``Text_Input`` folder is searched using
	    three spellings of the book name (as given, lower-cased, and with
	    spaces replaced by underscores).

	    Returns:
	        The first existing path, or None when no candidate exists.
	    """
	    from config.config import AUDIOBOOK_ROOT

	    # Current location: <root>/<book>/TTS/text_chunks/chunks_info.json
	    primary = AUDIOBOOK_ROOT / book_name / "TTS" / "text_chunks" / "chunks_info.json"
	    if primary.exists():
	        return primary

	    # Backwards compatibility: older runs stored the JSON next to the input text
	    legacy_dir = Path("Text_Input")
	    name_variants = (book_name, book_name.lower(), book_name.replace(' ', '_'))
	    for variant in name_variants:
	        candidate = legacy_dir / f"{variant}_chunks.json"
	        if candidate.exists():
	            return candidate

	    return None
	from modules.audio_processor import (
	smart_audio_validation, apply_smart_fade, add_chunk_end_silence,
	add_contextual_silence, pause_for_chunk_review, get_chunk_audio_duration,
	has_mid_energy_drop, apply_smart_fade_memory, smart_audio_validation_memory
	)
	from modules.file_manager import (
	setup_book_directories, find_book_files, ensure_voice_sample_compatibility,
	combine_audio_chunks, get_audio_files_in_directory, convert_to_m4b, add_metadata_to_m4b
	)
	from modules.progress_tracker import setup_logging, log_chunk_progress, log_run

	# Global shutdown flag
	# Initialised to False at import time; presumably flipped by a signal
	# handler or the GUI elsewhere in the project - TODO confirm against callers.
	shutdown_requested = False

	# Console colors
	# ANSI escape sequences interpolated into print() messages in this module;
	# RESET must follow a coloured span to restore the terminal default.
	RED = '\033[91m'
	GREEN = '\033[92m'
	YELLOW = '\033[93m'
	CYAN = '\033[96m'
	RESET = '\033[0m'

	import random
	import numpy as np
	import torch

	def set_seed(seed_value: int):
	    """
	    Sets the seed for torch, random, and numpy for reproducibility.
	    This is called if a non-zero seed is provided for generation.

	    Args:
	        seed_value: Integer seed applied to every random number generator.

	    The CUDA generators are seeded only when CUDA is available. The MPS
	    generator is seeded only when the MPS backend reports itself available
	    and the installed PyTorch exposes ``torch.mps.manual_seed``.
	    """
	    torch.manual_seed(seed_value)

	    if torch.cuda.is_available():
	        torch.cuda.manual_seed(seed_value)
	        torch.cuda.manual_seed_all(seed_value)  # if using multi-GPU

	    # Probe for the MPS API instead of assuming it: older PyTorch builds
	    # lack torch.backends.mps, and some ship torch.mps without
	    # is_available(), which would raise AttributeError here.
	    mps_backend = getattr(torch.backends, "mps", None)
	    if mps_backend is not None and mps_backend.is_available():
	        mps_module = getattr(torch, "mps", None)
	        if mps_module is not None and hasattr(mps_module, "manual_seed"):
	            mps_module.manual_seed(seed_value)

	    random.seed(seed_value)
	    np.random.seed(seed_value)
	    logging.info(f"Global seed set to: {seed_value}")

	# ============================================================================
	# MEMORY AND MODEL MANAGEMENT
	# ============================================================================

	def monitor_gpu_activity(operation_name):
	    """Cheap GPU probe intended for high-throughput runs.

	    Args:
	        operation_name: Label for the operation being measured (not used
	            by the current implementation).

	    Returns:
	        A ``(allocated_gib, utilization)`` pair. Utilization is always 0
	        because the pynvml query was disabled to save GPU cycles; both
	        values are 0 when CUDA is unavailable.
	    """
	    if not torch.cuda.is_available():
	        return 0, 0

	    # Only the allocator counter is read; utilization queries stay disabled
	    allocated_gib = torch.cuda.memory_allocated() / 1024**3
	    return allocated_gib, 0

	def optimize_memory_usage():
	    """Release cached CUDA memory and run the garbage collector.

	    Aggressive memory management aimed at 8GB VRAM cards: the CUDA cache
	    is emptied, Python garbage is collected, and CUDA IPC handles are
	    collected when CUDA is available.
	    """
	    cuda = torch.cuda
	    cuda.empty_cache()
	    gc.collect()
	    if not cuda.is_available():
	        return
	    cuda.ipc_collect()

	def monitor_vram_usage(operation_name=""):
	    """Report current VRAM usage and clean up when it runs high.

	    Args:
	        operation_name: Label included in the warning message.

	    Returns:
	        ``(allocated_gib, reserved_gib)`` when CUDA is available, else
	        ``(0, 0)``. When the allocated amount exceeds
	        ``VRAM_SAFETY_THRESHOLD`` a warning is logged and
	        ``optimize_memory_usage()`` is invoked before returning.
	    """
	    if not torch.cuda.is_available():
	        return 0, 0

	    bytes_per_gib = 1024**3
	    allocated = torch.cuda.memory_allocated() / bytes_per_gib
	    reserved = torch.cuda.memory_reserved() / bytes_per_gib

	    over_budget = allocated > VRAM_SAFETY_THRESHOLD
	    if over_budget:
	        logging.warning(f"⚠️ High VRAM usage during {operation_name}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved")
	        optimize_memory_usage()

	    return allocated, reserved

	def get_optimal_workers():
	    """Choose a worker count from the amount of VRAM currently allocated.

	    Returns:
	        ``MAX_WORKERS`` when dynamic allocation is switched off. Otherwise:
	        ``min(TEST_MAX_WORKERS, MAX_WORKERS)`` below 5 GiB allocated,
	        ``min(2, MAX_WORKERS)`` below ``VRAM_SAFETY_THRESHOLD``, and 1
	        once the threshold is reached.
	    """
	    if not USE_DYNAMIC_WORKERS:
	        return MAX_WORKERS

	    vram_gib = torch.cuda.memory_allocated() / 1024**3

	    # Tiers are checked from the roomiest to the tightest
	    if vram_gib < 5.0:
	        return min(TEST_MAX_WORKERS, MAX_WORKERS)
	    if vram_gib < VRAM_SAFETY_THRESHOLD:
	        return min(2, MAX_WORKERS)
	    return 1

	def prewarm_model_with_voice(model, voice_path, tts_params=None):
	    """
	    Pre-warm the TTS model with a voice sample to eliminate cold start quality issues.

	    Args:
	        model: Loaded TTS model
	        voice_path: Path to voice sample file
	        tts_params: Optional TTS parameters for pre-warming. Any of
	            'exaggeration', 'cfg_weight' and 'temperature' that are missing
	            fall back to the built-in defaults; other keys are ignored.

	    Returns:
	        model: The pre-warmed model (same object, but with cached conditioning).
	        The model is returned unchanged if pre-warming fails; the failure is
	        printed rather than raised.
	    """
	    from modules.file_manager import ensure_voice_sample_compatibility

	    # Defaults used for every key the caller did not supply. Callers may
	    # pass partial dicts; indexing them directly used to raise KeyError,
	    # which the handler below swallowed, silently skipping the warm-up.
	    warmup_params = {
	        'exaggeration': 0.5,
	        'cfg_weight': 0.5,
	        'temperature': 0.9
	    }
	    if tts_params:
	        for key in warmup_params:
	            if tts_params.get(key) is not None:
	                warmup_params[key] = tts_params[key]

	    try:
	        print("🔥 Pre-warming model with voice sample...")

	        # Prepare voice for TTS
	        compatible_voice = ensure_voice_sample_compatibility(voice_path)

	        # Prepare voice conditionals
	        model.prepare_conditionals(compatible_voice)

	        # Generate a short dummy audio to fully warm up the model
	        dummy_text = "The quick brown fox jumps over the lazy dog."
	        print(f"🎤 Generating warm-up audio: '{dummy_text}'")

	        # Inference only: skip autograd bookkeeping, as the chunk
	        # processing functions in this module do around generate().
	        with torch.no_grad():
	            wav_np = model.generate(
	                dummy_text,
	                exaggeration=warmup_params['exaggeration'],
	                cfg_weight=warmup_params['cfg_weight'],
	                temperature=warmup_params['temperature']
	            )

	        print("✅ Model pre-warming completed - first chunk quality optimized")

	        # The dummy audio is never saved; drop the reference right away
	        del wav_np

	        return model

	    except Exception as e:
	        # Best effort by design: a failed warm-up must not abort processing
	        print(f"⚠️ Pre-warming failed: {e}")
	        print("📝 Model will still work but first chunk may have quality variations")
	        return model

	def get_best_available_device():
	    """Pick the best usable torch device, preferring CUDA, then MPS, then CPU.

	    A backend is selected only after a tiny tensor has been moved onto it
	    successfully. A failing probe is logged as a warning and the next
	    backend is tried.

	    Returns:
	        "cuda", "mps" or "cpu".
	    """
	    def _probe(device_name):
	        # Smallest possible round trip onto the device
	        scratch = torch.tensor([1.0]).to(device_name)
	        del scratch

	    try:
	        if torch.cuda.is_available():
	            _probe("cuda")
	            torch.cuda.empty_cache()
	            return "cuda"
	    except Exception as cuda_error:
	        logging.warning(f"CUDA test failed: {cuda_error}")

	    try:
	        if torch.backends.mps.is_available():
	            _probe("mps")
	            return "mps"
	    except Exception as mps_error:
	        logging.warning(f"MPS test failed: {mps_error}")

	    return "cpu"

	def load_optimized_model(device):
	    """Load TTS model with memory optimizations.

	    Loading is first attempted with ``torch_dtype=torch.float16``. If that
	    raises, the model is loaded with the library defaults instead.

	    Args:
	        device: Device identifier forwarded to ``ChatterboxTTS.from_pretrained``.

	    Returns:
	        The loaded model, switched to eval mode when it exposes ``eval()``.

	    Side effects:
	        Enables ``torch.backends.cudnn.benchmark`` when cuDNN is available.
	    """
	    from src.chatterbox.tts import ChatterboxTTS

	    try:
	        # Try to load with FP16 if supported
	        model = ChatterboxTTS.from_pretrained(device=device, torch_dtype=torch.float16)
	        logging.info("✅ Loaded model in FP16 mode (halved VRAM usage)")
	    except Exception as fp16_error:
	        # 'except Exception' rather than a bare except, so Ctrl+C and
	        # SystemExit are not turned into a silent second load attempt.
	        logging.debug(f"FP16 load failed, falling back to default precision: {fp16_error}")
	        model = ChatterboxTTS.from_pretrained(device=device)
	        logging.info("⚠️ Using FP32 mode (FP16 not supported)")

	    # Only apply eval() if the model has this attribute
	    if hasattr(model, 'eval'):
	        model.eval()

	    # Set CUDNN benchmark for performance (if available)
	    if torch.backends.cudnn.is_available():
	        torch.backends.cudnn.benchmark = True

	    return model

	# ============================================================================
	# CHUNK PROCESSING
	# ============================================================================

	def patch_alignment_layer(tfmr, alignment_layer_idx=12):
	    """Patch alignment layer to avoid recursion.

	    Replaces ``forward`` on ``tfmr.layers[alignment_layer_idx].self_attn``
	    with a wrapper that forces ``output_attentions=True`` and then
	    delegates to the original bound ``forward``.

	    Args:
	        tfmr: Object exposing an indexable ``layers`` collection whose
	            items have a ``self_attn`` attribute.
	        alignment_layer_idx: Index of the layer to patch.
	    """
	    from types import MethodType

	    target_layer = tfmr.layers[alignment_layer_idx].self_attn
	    # Captured before patching, so the wrapper calls the real
	    # implementation and never itself.
	    original_forward = target_layer.forward

	    def patched_forward(self, *args, **kwargs):
	        # kwargs must be the keyword dict: with the previous
	        # '(self, args, *kwargs)' signature it was a tuple, so this
	        # assignment raised TypeError.
	        kwargs['output_attentions'] = True
	        return original_forward(*args, **kwargs)

	    target_layer.forward = MethodType(patched_forward, target_layer)

	def process_batch(
	    batch, text_chunks_dir, audio_chunks_dir,
	    voice_path, tts_params, start_time, total_chunks,
	    punc_norm, basename, log_run_func, log_path, device,
	    model, asr_model, seed=0,
	    enable_asr=None
	):
	    """
	    Process a batch of chunks using the batch-enabled TTS model.

	    Args:
	        batch: List of chunk dicts. Each needs 'index' and 'text';
	            'boundary_type' and 'tts_params' are optional.
	        audio_chunks_dir: Directory (path object) receiving chunk_NNNNN.wav.
	        tts_params: Fallback TTS parameters when a chunk carries none.
	        model: TTS model exposing ``generate_batch`` and ``sr``.
	        seed: Non-zero values are passed to ``set_seed`` before generation.
	        text_chunks_dir, voice_path, start_time, total_chunks, punc_norm,
	        basename, log_run_func, log_path, device, asr_model, enable_asr:
	            Forwarded to ``process_one_chunk`` only when batch generation
	            fails and chunks are processed one by one.

	    Returns:
	        A list with one entry per chunk. On the batch path each entry is
	        ``(chunk_index, final_path)``; on the fallback path each entry is
	        whatever ``process_one_chunk`` returned. An empty batch gives ``[]``.
	    """
	    if seed != 0:
	        set_seed(seed)

	    # Nothing to synthesize; also avoids IndexError on batch[0] below
	    if not batch:
	        return []

	    from pydub import AudioSegment
	    import io
	    import soundfile as sf

	    # 1. Prepare batch for TTS
	    texts = [chunk_data['text'] for chunk_data in batch]

	    # All params are the same, so we take them from the first chunk
	    shared_tts_params = batch[0].get("tts_params", tts_params)
	    supported_params = {"exaggeration", "cfg_weight", "temperature", "min_p", "top_p", "repetition_penalty"}
	    tts_args = {k: v for k, v in shared_tts_params.items() if k in supported_params}

	    # 2. Generate audio in a batch
	    try:
	        with torch.no_grad():
	            wavs = model.generate_batch(texts, **tts_args)
	    except Exception as e:
	        logging.error(f"❌ Batch TTS generation failed: {e}")
	        # Fallback to individual processing for this batch
	        results = []
	        for chunk_data in batch:
	            result = process_one_chunk(
	                chunk_data['index'], chunk_data['text'], text_chunks_dir, audio_chunks_dir,
	                voice_path, chunk_data.get("tts_params", tts_params), start_time, total_chunks,
	                punc_norm, basename, log_run_func, log_path, device, model, asr_model,
	                boundary_type=chunk_data.get("boundary_type", "none"), enable_asr=enable_asr
	            )
	            results.append(result)
	        return results

	    # Imported once here instead of on every loop iteration
	    from modules.audio_processor import process_audio_with_trimming_and_silence, trim_audio_endpoint

	    # 3. Process and save each audio file from the batch
	    batch_results = []
	    for chunk_data, wav_tensor in zip(batch, wavs):
	        chunk_index = chunk_data['index']
	        boundary_type = chunk_data.get("boundary_type", "none")
	        chunk_id_str = f"{chunk_index+1:05}"

	        # squeeze() drops size-1 dimensions, so no prior unsqueeze is needed
	        wav_np = wav_tensor.squeeze().cpu().numpy()
	        with io.BytesIO() as wav_buffer:
	            sf.write(wav_buffer, wav_np, model.sr, format='wav')
	            wav_buffer.seek(0)
	            audio_segment = AudioSegment.from_wav(wav_buffer)

	        # Apply trimming and contextual silence
	        if boundary_type and boundary_type != "none":
	            final_audio = process_audio_with_trimming_and_silence(audio_segment, boundary_type)
	        elif ENABLE_AUDIO_TRIMMING:
	            final_audio = trim_audio_endpoint(audio_segment)
	        else:
	            final_audio = audio_segment

	        # Final save
	        final_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav"
	        final_audio.export(final_path, format="wav")
	        logging.info(f"✅ Saved final chunk from batch: {final_path.name}")

	        batch_results.append((chunk_index, final_path))

	    return batch_results

	def _format_tts_param(params, key):
	    """Render params[key] with three decimals, or 'N/A' when missing or non-numeric."""
	    value = params.get(key)
	    if value is None:
	        return "N/A"
	    try:
	        return f"{value:.3f}"
	    except (TypeError, ValueError):
	        return str(value)

	def process_one_chunk(
	    i, chunk, text_chunks_dir, audio_chunks_dir,
	    voice_path, tts_params, start_time, total_chunks,
	    punc_norm, basename, log_run_func, log_path, device,
	    model, asr_model, seed=0, boundary_type="none",
	    enable_asr=None
	):
	    """Enhanced chunk processing with quality control, contextual silence, and deep cleanup.

	    The chunk text is written to ``text_chunks_dir``. Audio is then
	    generated up to ``max_attempts`` times until the combined quality
	    score reaches ``QUALITY_THRESHOLD``; the last attempt is accepted
	    regardless. The accepted audio is trimmed/padded in memory and
	    exported once to ``audio_chunks_dir``.

	    Args:
	        i: Zero-based chunk index (file names use ``i + 1``, five digits).
	        chunk: Text to synthesize.
	        text_chunks_dir, audio_chunks_dir: Path objects for output.
	        tts_params: Dict of TTS parameters; copied, never mutated.
	        total_chunks: Total chunk count, used in cleanup messages only.
	        punc_norm: Callable normalizing text before the ASR comparison.
	        log_run_func, log_path: Logger callable and its target path.
	        model: TTS model exposing ``generate`` and ``sr``.
	        asr_model: Optional ASR model exposing ``transcribe``.
	        seed: Non-zero values are passed to ``set_seed``.
	        boundary_type: Boundary label controlling the appended silence.
	        enable_asr: Overrides ``ENABLE_ASR`` when not None.
	        voice_path, start_time, basename, device: Accepted for interface
	            compatibility; not used in this function.

	    Returns:
	        ``(i, final_path)`` on success, ``(None, None)`` if every attempt
	        raised an exception.
	    """
	    import io
	    import traceback
	    import soundfile as sf
	    from pydub import AudioSegment

	    if seed != 0:
	        set_seed(seed)

	    chunk_id_str = f"{i+1:05}"
	    chunk_path = text_chunks_dir / f"chunk_{chunk_id_str}.txt"
	    with open(chunk_path, 'w', encoding='utf-8') as cf:
	        cf.write(chunk)

	    cuda_available = torch.cuda.is_available()

	    # ============================================================================
	    # ENHANCED PERIODIC DEEP CLEANUP
	    # ============================================================================
	    cleanup_interval = CLEANUP_INTERVAL

	    # Skip cleanup on model reinitialization chunks to avoid conflicts
	    if (i + 1) % cleanup_interval == 0 and (i + 1) % BATCH_SIZE != 0:
	        print(f"\n🧹 {YELLOW}DEEP CLEANUP at chunk {i+1}/{total_chunks}...{RESET}")

	        # Enhanced VRAM monitoring before cleanup
	        allocated_before = torch.cuda.memory_allocated() / 1024**3 if cuda_available else 0
	        reserved_before = torch.cuda.memory_reserved() / 1024**3 if cuda_available else 0

	        print(f" Before: VRAM Allocated: {allocated_before:.1f}GB | Reserved: {reserved_before:.1f}GB")

	        # Bulk temp file cleanup. The leading wildcard is required: without
	        # it each pattern only matches a file literally named e.g. "_try.wav".
	        print(" 🗑️ Cleaning bulk temporary files...")
	        temp_patterns = ["*_try.wav", "*_pre.wav", "*_fade.wav", "*_debug.wav", "*_temp.wav", "*_backup*.wav"]
	        total_temp_files = 0
	        for pattern in temp_patterns:
	            temp_files = list(audio_chunks_dir.glob(pattern))
	            for temp_file in temp_files:
	                temp_file.unlink(missing_ok=True)
	            total_temp_files += len(temp_files)

	        if total_temp_files > 0:
	            print(f" 🗑️ Removed {total_temp_files} temporary audio files")

	        # Aggressive CUDA context reset. Guarded because synchronize() and
	        # ipc_collect() raise on hosts without CUDA (CPU / MPS runs).
	        if cuda_available:
	            print(" 🔄 Performing aggressive CUDA context reset...")
	            torch.cuda.synchronize()
	            torch.cuda.empty_cache()
	            torch.cuda.ipc_collect()

	            # Force CUDA context reset
	            if hasattr(torch.cuda, 'reset_peak_memory_stats'):
	                torch.cuda.reset_peak_memory_stats()
	            if hasattr(torch._C, '_cuda_clearCublasWorkspaces'):
	                torch._C._cuda_clearCublasWorkspaces()

	        # Force garbage collection multiple times
	        for _ in range(3):
	            gc.collect()

	        # Clear model cache if it has one
	        if hasattr(model, 'clear_cache'):
	            model.clear_cache()
	        elif hasattr(model, 'reset_states'):
	            model.reset_states()

	        # Brief pause to let GPU settle
	        time.sleep(1.0)

	        # Monitor after cleanup
	        allocated_after = torch.cuda.memory_allocated() / 1024**3 if cuda_available else 0
	        reserved_after = torch.cuda.memory_reserved() / 1024**3 if cuda_available else 0

	        print(f" After: VRAM Allocated: {allocated_after:.1f}GB | Reserved: {reserved_after:.1f}GB")
	        print(f" Freed: {allocated_before - allocated_after:.1f}GB allocated, {reserved_before - reserved_after:.1f}GB reserved")
	        print(f"🧹 {GREEN}Deep cleanup complete!{RESET}\n")

	    best_sim, best_asr_text = -1, ""
	    final_audio = None  # Stays None if every attempt raises

	    # Use parameter if provided, otherwise fall back to config
	    asr_enabled = enable_asr if enable_asr is not None else ENABLE_ASR

	    # Enhanced regeneration loop with quality validation
	    max_attempts = MAX_REGENERATION_ATTEMPTS if ENABLE_REGENERATION_LOOP else 2
	    current_tts_params = tts_params.copy()

	    # Filter to only supported ChatterboxTTS parameters
	    supported_params = {"exaggeration", "cfg_weight", "temperature", "min_p", "top_p", "repetition_penalty"}

	    # Debug: Log the initial parameters for this chunk. Values go through
	    # _format_tts_param because applying ':.3f' to the 'N/A' placeholder
	    # raised ValueError whenever a key was missing.
	    logging.info(
	        f"🎛️ Chunk {chunk_id_str} initial TTS params: "
	        f"exag={_format_tts_param(current_tts_params, 'exaggeration')}, "
	        f"cfg={_format_tts_param(current_tts_params, 'cfg_weight')}, "
	        f"temp={_format_tts_param(current_tts_params, 'temperature')}, "
	        f"min_p={_format_tts_param(current_tts_params, 'min_p')}"
	    )

	    for attempt_num in range(max_attempts):
	        logging.info(f"🔁 Starting TTS for chunk {chunk_id_str}, attempt {attempt_num + 1}/{max_attempts}")
	        if attempt_num > 0:
	            logging.info(
	                f"🔧 Adjusted params: "
	                f"exag={_format_tts_param(current_tts_params, 'exaggeration')}, "
	                f"cfg={_format_tts_param(current_tts_params, 'cfg_weight')}, "
	                f"temp={_format_tts_param(current_tts_params, 'temperature')}"
	            )
	        try:
	            tts_args = {k: v for k, v in current_tts_params.items() if k in supported_params}

	            with torch.no_grad():
	                wav = model.generate(chunk, **tts_args).detach().cpu()

	            if wav.dim() == 1:
	                wav = wav.unsqueeze(0)

	            # Convert wav tensor to AudioSegment (in memory)
	            wav_np = wav.squeeze().numpy()
	            with io.BytesIO() as wav_buffer:
	                sf.write(wav_buffer, wav_np, model.sr, format='wav')
	                wav_buffer.seek(0)
	                audio_segment = AudioSegment.from_wav(wav_buffer)

	            # Enhanced quality validation
	            quality_score = 1.0  # Start with perfect score

	            # Legacy mid-energy drop check (converted to score)
	            if ENABLE_MID_DROP_CHECK and has_mid_energy_drop(wav, model.sr):
	                quality_score *= 0.3  # Significant penalty for mid-drop
	                logging.info(f"⚠️ Mid-chunk energy drop detected in {chunk_id_str}")

	            # Enhanced quality validation (if enabled)
	            if ENABLE_REGENERATION_LOOP:
	                from modules.audio_processor import evaluate_chunk_quality
	                # Pass existing ASR model to avoid loading duplicate
	                composite_score = evaluate_chunk_quality(audio_segment, chunk, include_spectral=True, asr_model=asr_model)
	                quality_score *= composite_score
	                logging.info(f"📊 Quality score for {chunk_id_str}: {quality_score:.3f} (composite: {composite_score:.3f})")

	            # ASR validation (memory-based processing)
	            asr_score = 1.0  # Default to passed if ASR disabled
	            asr_text = ""  # Reset per attempt so a stale transcript is never reported
	            if asr_enabled and asr_model is not None:
	                from modules.audio_processor import calculate_text_similarity
	                try:
	                    # Process ASR completely in memory - no disk writes
	                    samples = np.array(audio_segment.get_array_of_samples())
	                    if audio_segment.channels == 2:
	                        samples = samples.reshape((-1, 2)).mean(axis=1)

	                    # Normalize to float32 for ASR model
	                    audio_np = samples.astype(np.float32) / audio_segment.max_possible_amplitude
	                    result = asr_model.transcribe(audio_np)

	                    if not isinstance(result, dict) or "text" not in result:
	                        raise ValueError(f"Invalid ASR result type: {type(result)}")

	                    asr_text = result.get("text", "").strip()
	                    expected_text = punc_norm(chunk)
	                    asr_score = calculate_text_similarity(expected_text, asr_text)
	                    logging.info(f"🎤 ASR similarity for chunk {chunk_id_str}: {asr_score:.3f} - Expected: '{expected_text}' Got: '{asr_text}'")

	                except Exception as e:
	                    logging.error(f"❌ ASR failed for {chunk_id_str}: {e}")
	                    asr_score = 0.8  # Use neutral score instead of 0 to avoid regeneration

	            # Include ASR score in overall quality
	            quality_score *= asr_score

	            # Final quality check with all validations
	            if quality_score >= QUALITY_THRESHOLD or attempt_num == max_attempts - 1:
	                if quality_score >= QUALITY_THRESHOLD:
	                    logging.info(f"✅ Quality acceptable for {chunk_id_str} on attempt {attempt_num + 1} (final score: {quality_score:.3f})")
	                else:
	                    logging.info(f"⚠️ Max attempts reached for {chunk_id_str}, accepting best effort (final score: {quality_score:.3f})")

	                # Quality acceptable or max attempts reached, continue with processing
	                final_audio = audio_segment
	                best_sim = asr_score if asr_enabled else 1.0
	                best_asr_text = asr_text if asr_enabled else ""
	                break

	            # Quality too low, adjust parameters for retry
	            logging.info(f"🔄 Quality below threshold ({quality_score:.3f} < {QUALITY_THRESHOLD}), adjusting parameters for retry {attempt_num + 2}")
	            from modules.audio_processor import adjust_parameters_for_retry
	            current_tts_params = adjust_parameters_for_retry(current_tts_params, quality_score, attempt_num)

	        except Exception as e:
	            # A failed attempt is logged and the next attempt is tried
	            logging.error(f"Exception during TTS attempt {attempt_num + 1} for chunk {chunk_id_str}: {e}")
	            traceback.print_exc()

	    if final_audio is None:
	        logging.info(f"❌ Chunk {chunk_id_str} failed all attempts.")
	        return None, None

	    # Apply trimming and contextual silence in memory before final save
	    from modules.audio_processor import process_audio_with_trimming_and_silence

	    if boundary_type and boundary_type != "none":
	        final_audio = process_audio_with_trimming_and_silence(final_audio, boundary_type)
	        print(f"🔇 Added {boundary_type} silence to chunk {i+1:05}")
	    elif ENABLE_AUDIO_TRIMMING:
	        # Apply trimming even without boundary type if enabled
	        from modules.audio_processor import trim_audio_endpoint
	        final_audio = trim_audio_endpoint(final_audio)

	    # Note: ENABLE_CHUNK_END_SILENCE is now handled by punctuation-specific silence
	    # The new system provides more precise silence based on actual punctuation

	    # Final save - only disk write of audio in the entire process
	    final_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav"
	    final_audio.export(final_path, format="wav")
	    logging.info(f"✅ Saved final chunk: {final_path.name}")

	    # Log details - only log ASR failures
	    if asr_enabled and best_sim < 0.8:
	        log_run_func(f"ASR VALIDATION FAILED - Chunk {chunk_id_str}:\nExpected:\n{chunk}\nActual:\n{best_asr_text}\nSimilarity: {best_sim:.3f}\n" + "="*50, log_path)
	    elif not asr_enabled:
	        log_run_func(f"Chunk {chunk_id_str}: Original text: {chunk}", log_path)

	    # Enhanced regular cleanup (every chunk)
	    del wav
	    optimize_memory_usage()

	    # Additional per-chunk cleanup for long runs
	    if (i + 1) % 50 == 0:
	        torch.cuda.empty_cache()
	        gc.collect()

	    return i, final_path

	# ============================================================================
	# MAIN BOOK PROCESSING FUNCTION
	# ============================================================================

	from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
	from wrapper.chunk_loader import save_chunks

	def smooth_sentiment_scores(scores, index, method="rolling", window=3):
	    """
	    Smooth the sentiment score at ``index`` to prevent harsh emotional transitions.

	    Args:
	        scores: List of compound sentiment scores
	        index: Current chunk index
	        method: "rolling" for moving average, "exp_decay" for a weighted
	            average using SENTIMENT_EXP_DECAY_WEIGHTS (newest score first);
	            any other value disables smoothing
	        window: Number of chunks, ending at ``index``, to consider

	    Returns:
	        float: Smoothed sentiment score
	    """
	    # The first chunk has no history to blend with
	    if index == 0:
	        return scores[0]

	    first = max(0, index - window + 1)
	    recent = scores[first:index + 1]

	    if method == "rolling":
	        return sum(recent) / len(recent)

	    if method != "exp_decay":
	        return scores[index]  # No smoothing

	    weights = SENTIMENT_EXP_DECAY_WEIGHTS[:len(recent)]
	    # Weights pair with the scores newest-first
	    weighted_total = sum(w * s for w, s in zip(weights, reversed(recent)))
	    weight_total = sum(weights[:len(recent)])
	    if weight_total > 0:
	        return weighted_total / weight_total
	    return recent[-1]

	def generate_enriched_chunks(text_file, output_dir, user_tts_params=None, quality_params=None, config_params=None, voice_name=None):
	    """Chunk a text file, score each chunk with VADER, and save the enriched chunks.

	    Args:
	        text_file: Path object of the input text (read as UTF-8).
	        output_dir: Path object; results go to ``chunks_info.json`` inside it.
	        user_tts_params: Optional base TTS parameters; may also carry
	            'use_vader' (default True). Config defaults fill the gaps.
	        quality_params: Optional overrides for the smoothing settings.
	        config_params: Optional overrides for the VADER sensitivities.
	        voice_name: When given, a metadata entry is saved ahead of the chunks.

	    Returns:
	        The list of enriched chunk dicts (without the metadata entry).
	    """
	    analyzer = SentimentIntensityAnalyzer()

	    # Smoothing settings for JSON generation (GUI overrides config)
	    gui_quality = quality_params if quality_params else {}
	    enable_smoothing = gui_quality.get('sentiment_smoothing', ENABLE_SENTIMENT_SMOOTHING)
	    smoothing_window = gui_quality.get('smoothing_window', SENTIMENT_SMOOTHING_WINDOW)
	    smoothing_method = gui_quality.get('smoothing_method', SENTIMENT_SMOOTHING_METHOD)
	    if quality_params:
	        print(f"🔧 JSON Generation: Using GUI smoothing settings - Enabled: {enable_smoothing}, Window: {smoothing_window}, Method: {smoothing_method}")
	    else:
	        print(f"🔧 JSON Generation: Using config smoothing settings - Enabled: {enable_smoothing}")

	    # VADER sensitivity settings (GUI overrides config)
	    gui_config = config_params if config_params else {}
	    vader_exag_sensitivity = gui_config.get('vader_exag_sensitivity', VADER_EXAGGERATION_SENSITIVITY)
	    vader_cfg_sensitivity = gui_config.get('vader_cfg_sensitivity', VADER_CFG_WEIGHT_SENSITIVITY)
	    vader_temp_sensitivity = gui_config.get('vader_temp_sensitivity', VADER_TEMPERATURE_SENSITIVITY)
	    if config_params:
	        print(f"🔧 JSON Generation: Using GUI VADER sensitivity - Exag: {vader_exag_sensitivity}, CFG: {vader_cfg_sensitivity}, Temp: {vader_temp_sensitivity}")
	    else:
	        print(f"🔧 JSON Generation: Using config VADER sensitivity - Exag: {vader_exag_sensitivity}, CFG: {vader_cfg_sensitivity}, Temp: {vader_temp_sensitivity}")

	    chunks = sentence_chunk_text(smart_punctuate(text_file.read_text(encoding='utf-8')))

	    # Base TTS values: user-supplied where present, config defaults otherwise
	    user_values = user_tts_params if user_tts_params else {}
	    base_exaggeration = user_values.get('exaggeration', BASE_EXAGGERATION)
	    base_cfg_weight = user_values.get('cfg_weight', BASE_CFG_WEIGHT)
	    base_temperature = user_values.get('temperature', BASE_TEMPERATURE)
	    base_min_p = user_values.get('min_p', DEFAULT_MIN_P)
	    base_top_p = user_values.get('top_p', DEFAULT_TOP_P)
	    base_repetition_penalty = user_values.get('repetition_penalty', DEFAULT_REPETITION_PENALTY)
	    use_vader = user_values.get('use_vader', True)  # True keeps backward compatibility

	    def _clamp(value, low, high, digits):
	        # Keep a value within [low, high], then round it for the JSON output
	        return round(max(low, min(value, high)), digits)

	    chunk_texts = [text for text, _ in chunks]

	    # First pass: raw compound score of every chunk
	    raw_sentiment_scores = [analyzer.polarity_scores(text)['compound'] for text, _ in chunks]

	    # Second pass: smooth the scores and derive per-chunk parameters
	    enriched = []
	    for i, (chunk_text, is_para_end) in enumerate(chunks):
	        raw_compound_score = raw_sentiment_scores[i]
	        compound_score = raw_compound_score

	        # Smoothing applies only when VADER is in use (GUI settings, not config)
	        if use_vader and enable_smoothing:
	            compound_score = smooth_sentiment_scores(
	                raw_sentiment_scores,
	                i,
	                method=smoothing_method,
	                window=smoothing_window
	            )
	            if abs(compound_score - raw_compound_score) > 0.1:
	                logging.info(f"📊 Chunk {i+1:05}: sentiment smoothed {raw_compound_score:.3f} → {compound_score:.3f}")

	        # Fixed base values unless VADER adjusts them below
	        exaggeration = base_exaggeration
	        cfg_weight = base_cfg_weight
	        temperature = base_temperature
	        min_p = base_min_p
	        repetition_penalty = base_repetition_penalty

	        if use_vader:
	            # Shift each base value by the (smoothed) sentiment, then clamp to bounds
	            exaggeration = _clamp(
	                base_exaggeration + (compound_score * vader_exag_sensitivity),
	                TTS_PARAM_MIN_EXAGGERATION, TTS_PARAM_MAX_EXAGGERATION, 2)
	            cfg_weight = _clamp(
	                base_cfg_weight + (compound_score * vader_cfg_sensitivity),
	                TTS_PARAM_MIN_CFG_WEIGHT, TTS_PARAM_MAX_CFG_WEIGHT, 2)
	            temperature = _clamp(
	                base_temperature + (compound_score * vader_temp_sensitivity),
	                TTS_PARAM_MIN_TEMPERATURE, TTS_PARAM_MAX_TEMPERATURE, 2)
	            min_p = _clamp(
	                base_min_p + (compound_score * VADER_MIN_P_SENSITIVITY),
	                TTS_PARAM_MIN_MIN_P, TTS_PARAM_MAX_MIN_P, 3)
	            repetition_penalty = _clamp(
	                base_repetition_penalty + (compound_score * VADER_REPETITION_PENALTY_SENSITIVITY),
	                TTS_PARAM_MIN_REPETITION_PENALTY, TTS_PARAM_MAX_REPETITION_PENALTY, 1)

	            # Debug: report only noticeable shifts
	            if abs(exaggeration - base_exaggeration) > 0.05 or abs(cfg_weight - base_cfg_weight) > 0.05:
	                logging.info(f"🎭 Chunk {i+1:05}: VADER adjusted params - exag: {base_exaggeration:.2f}→{exaggeration:.2f}, cfg: {base_cfg_weight:.2f}→{cfg_weight:.2f}, sentiment: {compound_score:.3f}")

	        boundary_type = detect_content_boundaries(chunk_text, i, chunk_texts, is_para_end)

	        chunk_tts_params = {
	            "exaggeration": exaggeration,
	            "cfg_weight": cfg_weight,
	            "temperature": temperature,
	            "min_p": min_p,
	            "top_p": base_top_p,  # Top-P stays constant (never adjusted by VADER)
	            "repetition_penalty": repetition_penalty
	        }
	        enriched.append({
	            "index": i,
	            "text": chunk_text,
	            "word_count": len(chunk_text.split()),
	            "boundary_type": boundary_type if boundary_type else "none",
	            "sentiment_compound": compound_score,  # Smoothed score
	            "sentiment_raw": raw_compound_score,  # Original score, for reference
	            "tts_params": chunk_tts_params
	        })

	    output_json_path = output_dir / "chunks_info.json"

	    if not voice_name:
	        save_chunks(output_json_path, enriched)
	        return enriched

	    # Voice metadata: try the metadata-entry method first
	    try:
	        metadata = {
	            "_metadata": True,
	            "voice_used": voice_name,
	            "generation_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
	            "total_chunks": len(enriched)
	        }
	        # The metadata entry becomes the first element of the saved list
	        save_chunks(output_json_path, [metadata] + enriched)
	        print(f"✅ Saved voice metadata: {voice_name}")
	    except Exception as e:
	        # Fall back to the comment method when the metadata method fails
	        print(f"⚠️ Metadata method failed, using comment fallback: {e}")
	        save_chunks(output_json_path, enriched)

	        from modules.voice_detector import add_voice_to_json
	        add_voice_to_json(output_json_path, voice_name, method="comment")

	    return enriched

	def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=False, enable_asr=None, quality_params=None, config_params=None, specific_text_file=None):
	"""Enhanced book processing with batch processing to prevent hangs"""
	print(f"🔍 DEBUG: Entering process_book_folder with book_dir='{book_dir}', voice_path='{voice_path}'")

	# Apply GUI quality parameters to override config defaults
	if quality_params:
	print(f"🔧 Applying GUI quality parameters: {quality_params}")

	# Override config values with GUI settings
	global ENABLE_REGENERATION_LOOP, ENABLE_SENTIMENT_SMOOTHING, ENABLE_MFCC_VALIDATION
	global ENABLE_OUTPUT_VALIDATION, QUALITY_THRESHOLD, OUTPUT_VALIDATION_THRESHOLD
	global SENTIMENT_SMOOTHING_WINDOW, SENTIMENT_SMOOTHING_METHOD, SPECTRAL_ANOMALY_THRESHOLD

	ENABLE_REGENERATION_LOOP = quality_params.get('regeneration_enabled', ENABLE_REGENERATION_LOOP)
	ENABLE_SENTIMENT_SMOOTHING = quality_params.get('sentiment_smoothing', ENABLE_SENTIMENT_SMOOTHING)
	ENABLE_MFCC_VALIDATION = quality_params.get('mfcc_validation', ENABLE_MFCC_VALIDATION)
	ENABLE_OUTPUT_VALIDATION = quality_params.get('output_validation', ENABLE_OUTPUT_VALIDATION)
	QUALITY_THRESHOLD = quality_params.get('quality_threshold', QUALITY_THRESHOLD)
	OUTPUT_VALIDATION_THRESHOLD = quality_params.get('output_threshold', OUTPUT_VALIDATION_THRESHOLD)
	SENTIMENT_SMOOTHING_WINDOW = quality_params.get('smoothing_window', SENTIMENT_SMOOTHING_WINDOW)
	SENTIMENT_SMOOTHING_METHOD = quality_params.get('smoothing_method', SENTIMENT_SMOOTHING_METHOD)
	SPECTRAL_ANOMALY_THRESHOLD = quality_params.get('spectral_threshold', SPECTRAL_ANOMALY_THRESHOLD)

	print(f"✅ Quality settings applied - Regeneration: {ENABLE_REGENERATION_LOOP}, MFCC: {ENABLE_MFCC_VALIDATION}, Output Validation: {ENABLE_OUTPUT_VALIDATION}")

	from src.chatterbox.tts import punc_norm
	print(f"🔍 DEBUG: Successfully imported punc_norm")

	# Setup directories
	print(f"🔍 DEBUG: Calling setup_book_directories...")
	output_root, tts_dir, text_chunks_dir, audio_chunks_dir = setup_book_directories(book_dir)
	print(f"🔍 DEBUG: Directory setup complete")

	# Clean previous processing files (but skip for resume operations)
	if skip_cleanup:
	print(f"🔄 RESUME MODE: Skipping cleanup to preserve existing chunks")
	print(f"📁 Preserving: {text_chunks_dir}, {audio_chunks_dir}")
	else:
	print(f"🧹 FRESH PROCESSING: Cleaning previous processing files...")
	import glob

	# Clear text chunks
	for txt_file in text_chunks_dir.glob("*.txt"):
	txt_file.unlink(missing_ok=True)
	for json_file in text_chunks_dir.glob("*.json"):
	json_file.unlink(missing_ok=True)

	# Clear audio chunks
	for wav_file in audio_chunks_dir.glob("*.wav"):
	wav_file.unlink(missing_ok=True)

	# Clear logs
	for log_file in output_root.glob("*.log"):
	log_file.unlink(missing_ok=True)

	print(f"✅ Cleanup complete")

	# Find book files
	print(f"🔍 DEBUG: Calling find_book_files...")
	book_files = find_book_files(book_dir)

	# Use specific text file if provided (GUI selection), otherwise use auto-detected file
	if specific_text_file:
	text_file_to_use = Path(specific_text_file)
	print(f"🎯 DEBUG: Using GUI-selected text file: {text_file_to_use}")
	if not text_file_to_use.exists():
	logging.error(f"[{book_dir.name}] ERROR: Selected text file not found: {text_file_to_use}")
	return None, None, []
	else:
	text_file_to_use = book_files['text']
	print(f"🔍 DEBUG: Using auto-detected text file: {text_file_to_use}")
	if not text_file_to_use:
	logging.info(f"[{book_dir.name}] ERROR: No .txt files found in the book folder.")
	return None, None, []

	cover_file = book_files['cover']
	nfo_file = book_files['nfo']

	setup_logging(output_root)

	# Extract voice name for logging and JSON metadata
	voice_name_for_log = voice_path.stem if hasattr(voice_path, 'stem') else Path(voice_path).stem

	# Generate enriched chunks with VADER analysis using user parameters and GUI quality settings
	print(f"🔍 DEBUG: About to call generate_enriched_chunks with quality_params: {quality_params}")
	print(f"🔍 DEBUG: About to call generate_enriched_chunks with config_params: {config_params}")
	print(f"🔍 DEBUG: Using voice: {voice_name_for_log}")
	all_chunks = generate_enriched_chunks(text_file_to_use, text_chunks_dir, tts_params, quality_params, config_params, voice_name_for_log)

	# Create run_log_lines
	print(f"🔍 DEBUG: Creating run_log_lines...")
	print(f"🔍 DEBUG: voice_path type: {type(voice_path)}, value: {voice_path}")

	run_log_lines = [
	f"\n===== Processing: {book_dir.name} =====",
	f"Voice: {voice_name_for_log}",
	f"Started: {time.strftime('%Y-%m-%d %H:%M:%S')}",
	f"Text file processed: {text_file_to_use.name}",
	f"Total chunks generated: {len(all_chunks)}"
	]

	start_time = time.time()
	total_chunks = len(all_chunks)
	log_path = output_root / "chunk_validation.log"
	total_audio_duration = 0.0

	# Batch processing
	print(f"📊 Processing {total_chunks} chunks in batches of {BATCH_SIZE}")

	all_results = []

	for batch_start in range(0, total_chunks, BATCH_SIZE):
	batch_end = min(batch_start + BATCH_SIZE, total_chunks)
	batch_chunks = all_chunks[batch_start:batch_end]

	print(f"\n🔄 Processing batch: chunks {batch_start+1}-{batch_end}")

	# Fresh model for each batch
	model = load_optimized_model(device)
	compatible_voice = ensure_voice_sample_compatibility(voice_path, output_dir=tts_dir)

	# Pre-warm model to eliminate first chunk quality variations
	model = prewarm_model_with_voice(model, compatible_voice, tts_params)

	# Load ASR model once per batch if needed using adaptive manager
	asr_model = None
	asr_device_used = None
	# Use parameter if provided, otherwise fall back to config
	asr_enabled = enable_asr if enable_asr is not None else ENABLE_ASR
	if asr_enabled:
	from modules.asr_manager import load_asr_model_adaptive

	# Get ASR config from parameters
	asr_config = config_params.get('asr_config', {}) if config_params else {}

	# Use adaptive ASR manager for intelligent loading
	asr_model, asr_device_used = load_asr_model_adaptive(asr_config)

	if asr_model is None:
	print(f"❌ ASR model loading failed completely - disabling ASR for this batch")
	asr_enabled = False

	futures = []
	batch_results = []

	# Dynamic worker allocation
	optimal_workers = get_optimal_workers()
	print(f"🔧 Using {optimal_workers} workers for batch {batch_start+1}-{batch_end}")

	use_vader = tts_params.get('use_vader', True)

	if not use_vader:
	# --- BATCH MODE ---
	print(f"🚀 VADER disabled. Running in high-performance batch mode.")
	tts_batch_size = config_params.get('tts_batch_size', 16)
	chunk_batches = [batch_chunks[i:i + tts_batch_size] for i in range(0, len(batch_chunks), tts_batch_size)]

	print(f"📊 Processing {len(batch_chunks)} chunks in {len(chunk_batches)} batches of size {tts_batch_size}.")

	with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
	for batch in chunk_batches:
	if shutdown_requested:
	break
	futures.append(executor.submit(
	process_batch,
	batch, text_chunks_dir, audio_chunks_dir,
	voice_path, tts_params, start_time, total_chunks,
	punc_norm, book_dir.name, log_run, log_path, device,
	model, asr_model, all_chunks, asr_enabled
	))

	# Wait for batches to complete
	for fut in as_completed(futures):
	try:
	# process_batch returns a list of (idx, wav_path) tuples
	results_list = fut.result()
	for idx, wav_path in results_list:
	if wav_path and wav_path.exists():
	chunk_duration = get_chunk_audio_duration(wav_path)
	total_audio_duration += chunk_duration
	batch_results.append((idx, wav_path))
	log_chunk_progress(len(batch_results), total_chunks, start_time, total_audio_duration)
	except Exception as e:
	logging.error(f"Future failed in batch: {e}")
	else:
	# --- SINGLE/NUANCED MODE ---
	print(f"🎨 VADER enabled. Running in nuanced, single-chunk mode.")
	with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
	for i, chunk_data in enumerate(batch_chunks):
	global_chunk_index = batch_start + i

	# Check for shutdown request
	if shutdown_requested:
	print(f"\n⏹️ {YELLOW}Stopping submission of new chunks...{RESET}")
	break

	# Handle both dictionary and tuple formats for chunk data
	if isinstance(chunk_data, dict):
	chunk = chunk_data["text"]
	boundary_type = chunk_data.get("boundary_type", "none")
	# Use chunk-specific TTS params if available, otherwise fall back to global
	chunk_tts_params = chunk_data.get("tts_params", tts_params)
	else:
	# Handle old tuple format (text, is_para_end) - convert to boundary_type
	chunk = chunk_data[0] if len(chunk_data) > 0 else str(chunk_data)
	# Convert old is_paragraph_end to boundary_type
	is_old_para_end = chunk_data[1] if len(chunk_data) > 1 else False
	boundary_type = "paragraph_end" if is_old_para_end else "none"
	chunk_tts_params = tts_params # Fallback for old format



	futures.append(executor.submit(
	process_one_chunk,
	global_chunk_index, chunk, text_chunks_dir, audio_chunks_dir,
	voice_path, chunk_tts_params, start_time, total_chunks,
	punc_norm, book_dir.name, log_run, log_path, device,
	model, asr_model, boundary_type=boundary_type,
	enable_asr=asr_enabled
	))

	# Wait for batch to complete
	print(f"🔄 {CYAN}Waiting for batch {batch_start+1}-{batch_end} to complete...{RESET}")
	completed_count = 0

	for fut in as_completed(futures):
	try:
	idx, wav_path = fut.result()
	if wav_path and wav_path.exists():
	# Measure actual audio duration for this chunk
	chunk_duration = get_chunk_audio_duration(wav_path)
	total_audio_duration += chunk_duration
	batch_results.append((idx, wav_path))

	# Update progress every 2 completed chunks within batch
	completed_count += 1
	if completed_count % 2 == 0:
	log_chunk_progress(batch_start + completed_count - 1, total_chunks, start_time, total_audio_duration)

	except Exception as e:
	logging.error(f"Future failed in batch: {e}")

	# Clean up model after batch
	print(f"🧹 Cleaning up after batch {batch_start+1}-{batch_end}")
	del model
	if asr_model:
	from modules.asr_manager import cleanup_asr_model
	cleanup_asr_model(asr_model)
	torch.cuda.empty_cache()
	gc.collect()
	time.sleep(2)

	all_results.extend(batch_results)
	print(f"✅ Batch {batch_start+1}-{batch_end} completed ({len(batch_results)} chunks)")

	# Final processing
	quarantine_dir = audio_chunks_dir / "quarantine"
	pause_for_chunk_review(quarantine_dir)

	# Collect final chunk paths
	chunk_paths = get_audio_files_in_directory(audio_chunks_dir)

	if not chunk_paths:
	logging.info(f"{RED}❌ No valid audio chunks found. Skipping concatenation and conversion.{RESET}")
	return None, None, []

	# Calculate timing
	elapsed_total = time.time() - start_time
	elapsed_td = timedelta(seconds=int(elapsed_total))

	total_audio_duration_final = sum(get_chunk_audio_duration(chunk_path) for chunk_path in chunk_paths)
	audio_duration_td = timedelta(seconds=int(total_audio_duration_final))
	realtime_factor = total_audio_duration_final / elapsed_total if elapsed_total > 0 else 0.0

	print(f"\n⏱️ TTS Processing Complete:")
	print(f" Elapsed Time: {CYAN}{str(elapsed_td)}{RESET}")
	print(f" Audio Duration: {GREEN}{str(audio_duration_td)}{RESET}")
	print(f" Realtime Factor: {YELLOW}{realtime_factor:.2f}x{RESET}")

	# Combine audio
	voice_name = voice_path.stem if hasattr(voice_path, 'stem') else Path(voice_path).stem
	combined_wav_path = output_root / f"{book_dir.name} [{voice_name}].wav"
	print("\n💾 Saving WAV file...")
	combine_audio_chunks(chunk_paths, combined_wav_path)

	# M4B conversion with normalization
	temp_m4b_path = output_root / "output.m4b"
	final_m4b_path = output_root / f"{book_dir.name}[{voice_name}].m4b"
	convert_to_m4b(combined_wav_path, temp_m4b_path)
	add_metadata_to_m4b(temp_m4b_path, final_m4b_path, cover_file, nfo_file)

	logging.info(f"Audiobook created: {final_m4b_path}")

	# Add final info to run log
	run_log_lines.extend([
	f"Combined WAV: {combined_wav_path}",
	"--- Generation Settings ---",
	f"Batch Processing: Enabled ({BATCH_SIZE} chunks per batch)",
	f"ASR Enabled: {ENABLE_ASR}",
	f"Hum Detection: {ENABLE_HUM_DETECTION}",
	f"Dynamic Workers: {USE_DYNAMIC_WORKERS}",
	f"Voice used: {voice_name}",
	f"Exaggeration: {tts_params['exaggeration']}",
	f"CFG weight: {tts_params['cfg_weight']}",
	f"Temperature: {tts_params['temperature']}",
	f"Processing Time: {str(elapsed_td)}",
	f"Audio Duration: {str(audio_duration_td)}",
	f"Realtime Factor: {realtime_factor:.2f}x",
	f"Total Chunks: {len(chunk_paths)}"
	])

	# Write the run log
	log_run("\n".join(run_log_lines), output_root / "run.log")
	print(f"📝 Run log written to: {output_root / 'run.log'}")

	return final_m4b_path, combined_wav_path, run_log_lines