| """ |
| TTS Engine Module |
| Handles ChatterboxTTS interface, model loading, and chunk processing coordination |
| """ |
|
|
| import torch |
| import gc |
| import time |
| import logging |
| import shutil |
| import sys |
| import numpy as np |
| from datetime import timedelta |
| from concurrent.futures import ThreadPoolExecutor, as_completed |
| from pathlib import Path |
| import torchaudio as ta |
|
|
| from config.config import * |
| from modules.text_processor import smart_punctuate, sentence_chunk_text, detect_content_boundaries |
|
|
def find_chunks_json_file(book_name):
    """Locate the chunks JSON file for *book_name*.

    Checks the current per-book layout first, then falls back to the legacy
    ``Text_Input`` naming schemes. Returns a Path, or None when nothing exists.
    """
    from config.config import AUDIOBOOK_ROOT

    # Preferred location: <root>/<book>/TTS/text_chunks/chunks_info.json
    primary = AUDIOBOOK_ROOT / book_name / "TTS" / "text_chunks" / "chunks_info.json"
    if primary.exists():
        return primary

    # Legacy fallback: older runs stored per-book JSON files under Text_Input
    # with several possible name variants.
    legacy_dir = Path("Text_Input")
    candidates = (
        f"{book_name}_chunks.json",
        f"{book_name.lower()}_chunks.json",
        f"{book_name.replace(' ', '_')}_chunks.json",
    )
    for candidate in candidates:
        candidate_path = legacy_dir / candidate
        if candidate_path.exists():
            return candidate_path

    return None
| from modules.audio_processor import ( |
| smart_audio_validation, apply_smart_fade, add_chunk_end_silence, |
| add_contextual_silence, pause_for_chunk_review, get_chunk_audio_duration, |
| has_mid_energy_drop, apply_smart_fade_memory, smart_audio_validation_memory |
| ) |
| from modules.file_manager import ( |
| setup_book_directories, find_book_files, ensure_voice_sample_compatibility, |
| combine_audio_chunks, get_audio_files_in_directory, convert_to_m4b, add_metadata_to_m4b |
| ) |
| from modules.progress_tracker import setup_logging, log_chunk_progress, log_run |
|
|
| |
| shutdown_requested = False |
|
|
| |
| RED = '\033[91m' |
| GREEN = '\033[92m' |
| YELLOW = '\033[93m' |
| CYAN = '\033[96m' |
| RESET = '\033[0m' |
|
|
| import random |
| import numpy as np |
| import torch |
|
|
def set_seed(seed_value: int) -> None:
    """
    Sets the seed for torch, random, and numpy for reproducibility.
    This is called if a non-zero seed is provided for generation.

    Args:
        seed_value: Seed applied to every random number generator in use.
    """
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
    # Older torch builds expose neither torch.backends.mps nor torch.mps, so
    # guard both attribute lookups; the previous code called
    # torch.backends.mps.is_available() unconditionally, which raises
    # AttributeError on such builds, and then repeated the availability check.
    if (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
            and hasattr(torch, 'mps')):
        torch.mps.manual_seed(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    logging.info(f"Global seed set to: {seed_value}")
|
|
| |
| |
| |
|
|
def monitor_gpu_activity(operation_name):
    """Lightweight GPU monitoring for high-speed processing.

    Returns a (allocated_gb, 0) pair; the second slot is a placeholder kept
    for call-site compatibility. Both values are 0 when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return 0, 0
    allocated_gb = torch.cuda.memory_allocated() / 1024**3
    return allocated_gb, 0
|
|
def optimize_memory_usage():
    """Aggressive memory management for 8GB VRAM.

    Releases cached CUDA allocations and IPC handles (when CUDA is present)
    and forces a Python garbage-collection pass.
    """
    # Guard all CUDA calls consistently: the original guarded ipc_collect()
    # but called empty_cache() unconditionally.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    gc.collect()
|
|
def monitor_vram_usage(operation_name=""):
    """Real-time VRAM monitoring.

    Returns (allocated_gb, reserved_gb), or (0, 0) when CUDA is unavailable.
    Triggers an aggressive cleanup whenever allocation exceeds
    VRAM_SAFETY_THRESHOLD.
    """
    if not torch.cuda.is_available():
        return 0, 0

    gib = 1024**3
    allocated = torch.cuda.memory_allocated() / gib
    reserved = torch.cuda.memory_reserved() / gib

    if allocated > VRAM_SAFETY_THRESHOLD:
        logging.warning(f"⚠️ High VRAM usage during {operation_name}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved")
        optimize_memory_usage()

    return allocated, reserved
|
|
def get_optimal_workers():
    """Dynamic worker allocation based on VRAM usage.

    Returns MAX_WORKERS when dynamic allocation is disabled; otherwise scales
    the worker count down as allocated VRAM approaches the safety threshold.
    """
    if not USE_DYNAMIC_WORKERS:
        return MAX_WORKERS

    # Without CUDA there is no VRAM pressure to react to; the original called
    # torch.cuda.memory_allocated() unconditionally, crashing on CPU-only hosts.
    if not torch.cuda.is_available():
        return MAX_WORKERS

    allocated_vram = torch.cuda.memory_allocated() / 1024**3

    if allocated_vram < 5.0:
        return min(TEST_MAX_WORKERS, MAX_WORKERS)
    elif allocated_vram < VRAM_SAFETY_THRESHOLD:
        return min(2, MAX_WORKERS)
    else:
        return 1
|
|
def prewarm_model_with_voice(model, voice_path, tts_params=None):
    """
    Pre-warm the TTS model with a voice sample to eliminate cold start quality issues.

    Args:
        model: Loaded TTS model
        voice_path: Path to voice sample file
        tts_params: Optional TTS parameters for pre-warming (uses defaults if None)

    Returns:
        model: The pre-warmed model (same object, but with cached conditioning)
    """
    # Removed unused `import tempfile` / `import os` from the original.
    from modules.file_manager import ensure_voice_sample_compatibility

    try:
        print("🔥 Pre-warming model with voice sample...")

        # Re-encode/convert the voice sample if needed so the model accepts it.
        compatible_voice = ensure_voice_sample_compatibility(voice_path)

        if tts_params is None:
            tts_params = {
                'exaggeration': 0.5,
                'cfg_weight': 0.5,
                'temperature': 0.9
            }

        # Cache the voice conditioning on the model before the dummy pass.
        model.prepare_conditionals(compatible_voice)

        dummy_text = "The quick brown fox jumps over the lazy dog."
        print(f"🎤 Generating warm-up audio: '{dummy_text}'")

        wav_np = model.generate(
            dummy_text,
            exaggeration=tts_params['exaggeration'],
            cfg_weight=tts_params['cfg_weight'],
            temperature=tts_params['temperature']
        )

        print("✅ Model pre-warming completed - first chunk quality optimized")

        # The warm-up audio itself is never used; free it immediately.
        del wav_np

        return model

    except Exception as e:
        # Best-effort: a failed warm-up only risks quality on the first chunk.
        print(f"⚠️ Pre-warming failed: {e}")
        print("📝 Model will still work but first chunk may have quality variations")
        return model
|
|
def get_best_available_device():
    """Detect and return the best available device with proper fallback.

    Each candidate backend is verified by actually moving a tiny tensor onto
    it, so a misconfigured driver falls through to the next option instead of
    failing later during generation. Order of preference: cuda, mps, cpu.
    """
    def _probe(device_name):
        # Raises if the backend is present but unusable.
        tmp = torch.tensor([1.0]).to(device_name)
        del tmp

    try:
        if torch.cuda.is_available():
            _probe("cuda")
            torch.cuda.empty_cache()
            return "cuda"
    except Exception as e:
        logging.warning(f"CUDA test failed: {e}")

    try:
        if torch.backends.mps.is_available():
            _probe("mps")
            return "mps"
    except Exception as e:
        logging.warning(f"MPS test failed: {e}")

    return "cpu"
|
|
def load_optimized_model(device):
    """Load TTS model with memory optimizations.

    Attempts an FP16 load first (roughly halving VRAM usage) and falls back
    to FP32 when the backend rejects half precision.

    Args:
        device: Target device string ("cuda", "mps", or "cpu").

    Returns:
        The loaded ChatterboxTTS model, switched to eval mode when supported.
    """
    from src.chatterbox.tts import ChatterboxTTS

    try:
        model = ChatterboxTTS.from_pretrained(device=device, torch_dtype=torch.float16)
        logging.info("✅ Loaded model in FP16 mode (halved VRAM usage)")
    except Exception as e:
        # The original bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; narrow the clause and record why FP16 was rejected.
        logging.info(f"FP16 load failed ({e}); falling back to FP32")
        model = ChatterboxTTS.from_pretrained(device=device)
        logging.info("⚠️ Using FP32 mode (FP16 not supported)")

    # Inference-only usage: disable dropout/training behavior where available.
    if hasattr(model, 'eval'):
        model.eval()

    # Enable the cuDNN autotuner for repeated fixed-size workloads.
    if torch.backends.cudnn.is_available():
        torch.backends.cudnn.benchmark = True

    return model
|
|
| |
| |
| |
|
|
def patch_alignment_layer(tfmr, alignment_layer_idx=12):
    """Patch alignment layer to avoid recursion.

    Rebinds the self-attention forward of the chosen transformer layer so
    that every call requests attention weights (output_attentions=True),
    delegating to the original bound forward for the actual computation.
    """
    from types import MethodType

    attn_module = tfmr.layers[alignment_layer_idx].self_attn
    wrapped_forward = attn_module.forward

    def forward_with_attentions(self, *args, **kwargs):
        # Force attention outputs regardless of what the caller asked for.
        kwargs['output_attentions'] = True
        return wrapped_forward(*args, **kwargs)

    attn_module.forward = MethodType(forward_with_attentions, attn_module)
|
|
def process_batch(
    batch, text_chunks_dir, audio_chunks_dir,
    voice_path, tts_params, start_time, total_chunks,
    punc_norm, basename, log_run_func, log_path, device,
    model, asr_model, seed=0,
    enable_asr=None
):
    """
    Process a batch of chunks using the batch-enabled TTS model.

    Falls back to per-chunk processing via process_one_chunk() when the
    batched generate call fails.

    Returns:
        List of (chunk_index, final_wav_path) tuples.
    """
    # FIX: the docstring previously appeared *after* the seed check, so it
    # compiled as a discarded expression rather than the function docstring.
    if seed != 0:
        set_seed(seed)

    from pydub import AudioSegment
    import io
    import soundfile as sf

    texts = [chunk_data['text'] for chunk_data in batch]

    # All chunks in a batch share a single parameter set: take it from the
    # first chunk (falling back to the caller-supplied defaults) and keep
    # only the arguments generate_batch() accepts.
    shared_tts_params = batch[0].get("tts_params", tts_params)
    supported_params = {"exaggeration", "cfg_weight", "temperature", "min_p", "top_p", "repetition_penalty"}
    tts_args = {k: v for k, v in shared_tts_params.items() if k in supported_params}

    try:
        with torch.no_grad():
            wavs = model.generate_batch(texts, **tts_args)
    except Exception as e:
        logging.error(f"❌ Batch TTS generation failed: {e}")
        # Fallback: process the chunks one at a time so a single batch
        # failure does not lose the whole batch.
        results = []
        for chunk_data in batch:
            i = chunk_data['index']
            chunk = chunk_data['text']
            boundary_type = chunk_data.get("boundary_type", "none")
            chunk_tts_params = chunk_data.get("tts_params", tts_params)
            result = process_one_chunk(i, chunk, text_chunks_dir, audio_chunks_dir, voice_path, chunk_tts_params, start_time, total_chunks, punc_norm, basename, log_run_func, log_path, device, model, asr_model, boundary_type=boundary_type, enable_asr=enable_asr)
            results.append(result)
        return results

    batch_results = []
    for i, wav_tensor in enumerate(wavs):
        chunk_data = batch[i]
        chunk_index = chunk_data['index']
        boundary_type = chunk_data.get("boundary_type", "none")
        chunk_id_str = f"{chunk_index+1:05}"

        # Ensure a channel dimension before conversion.
        if wav_tensor.dim() == 1:
            wav_tensor = wav_tensor.unsqueeze(0)

        # Tensor -> in-memory WAV -> pydub segment (no temp files on disk).
        wav_np = wav_tensor.squeeze().cpu().numpy()
        with io.BytesIO() as wav_buffer:
            sf.write(wav_buffer, wav_np, model.sr, format='wav')
            wav_buffer.seek(0)
            audio_segment = AudioSegment.from_wav(wav_buffer)

        # Boundary-aware post-processing: contextual silence for chapter or
        # paragraph boundaries, otherwise plain endpoint trimming if enabled.
        from modules.audio_processor import process_audio_with_trimming_and_silence, trim_audio_endpoint
        if boundary_type and boundary_type != "none":
            final_audio = process_audio_with_trimming_and_silence(audio_segment, boundary_type)
        elif ENABLE_AUDIO_TRIMMING:
            final_audio = trim_audio_endpoint(audio_segment)
        else:
            final_audio = audio_segment

        final_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav"
        final_audio.export(final_path, format="wav")
        logging.info(f"✅ Saved final chunk from batch: {final_path.name}")

        batch_results.append((chunk_index, final_path))

    return batch_results
|
|
def process_one_chunk(
    i, chunk, text_chunks_dir, audio_chunks_dir,
    voice_path, tts_params, start_time, total_chunks,
    punc_norm, basename, log_run_func, log_path, device,
    model, asr_model, seed=0, boundary_type="none",
    enable_asr=None
):
    """Enhanced chunk processing with quality control, contextual silence, and deep cleanup.

    Generates audio for one text chunk, optionally validating it with ASR and
    a composite quality score, retrying with adjusted TTS parameters up to a
    configured number of attempts.

    Returns:
        (chunk_index, final_wav_path) on success, or (None, None) when every
        attempt fails.
    """
    # FIX: the docstring previously sat after the seed check, so it compiled
    # as a discarded expression rather than the function docstring.
    # FIX: removed unused `import difflib`.
    if seed != 0:
        set_seed(seed)

    from pydub import AudioSegment

    # Persist the chunk's text next to its audio for review/resume tooling.
    chunk_id_str = f"{i+1:05}"
    chunk_path = text_chunks_dir / f"chunk_{chunk_id_str}.txt"
    with open(chunk_path, 'w', encoding='utf-8') as cf:
        cf.write(chunk)

    chunk_audio_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav"

    cleanup_interval = CLEANUP_INTERVAL

    # Periodic deep cleanup; skipped when the index also lands on a batch
    # boundary (the batch loop performs its own heavyweight reset there).
    if (i + 1) % cleanup_interval == 0 and (i + 1) % BATCH_SIZE != 0:
        print(f"\n🧹 {YELLOW}DEEP CLEANUP at chunk {i+1}/{total_chunks}...{RESET}")

        allocated_before = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
        reserved_before = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0

        print(f" Before: VRAM Allocated: {allocated_before:.1f}GB | Reserved: {reserved_before:.1f}GB")

        # Remove bulk temporary/debug audio files left by earlier attempts.
        print(" 🗑️ Cleaning bulk temporary files...")
        temp_patterns = ["*_try*.wav", "*_pre.wav", "*_fade*.wav", "*_debug*.wav", "*_temp*.wav", "*_backup*.wav"]
        total_temp_files = 0
        for pattern in temp_patterns:
            temp_files = list(audio_chunks_dir.glob(pattern))
            for temp_file in temp_files:
                temp_file.unlink(missing_ok=True)
            total_temp_files += len(temp_files)

        if total_temp_files > 0:
            print(f" 🗑️ Removed {total_temp_files} temporary audio files")

        # Aggressive CUDA teardown: sync, drop cached blocks, reclaim IPC
        # handles, and clear peak stats / cuBLAS workspaces where available.
        print(" 🔄 Performing aggressive CUDA context reset...")
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

        if hasattr(torch.cuda, 'reset_peak_memory_stats'):
            torch.cuda.reset_peak_memory_stats()
        if hasattr(torch._C, '_cuda_clearCublasWorkspaces'):
            torch._C._cuda_clearCublasWorkspaces()

        # Multiple GC passes to break reference cycles promptly.
        for _ in range(3):
            gc.collect()

        # Let the model drop any internal caches it exposes.
        if hasattr(model, 'clear_cache'):
            model.clear_cache()
        elif hasattr(model, 'reset_states'):
            model.reset_states()

        # Brief pause so the allocator settles before remeasuring.
        time.sleep(1.0)

        allocated_after = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
        reserved_after = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0

        print(f" After: VRAM Allocated: {allocated_after:.1f}GB | Reserved: {reserved_after:.1f}GB")
        print(f" Freed: {allocated_before - allocated_after:.1f}GB allocated, {reserved_before - reserved_after:.1f}GB reserved")
        print(f"🧹 {GREEN}Deep cleanup complete!{RESET}\n")

    best_sim, best_asr_text = -1, ""
    wav_path_active = None
    attempt_paths = []
    mid_drop_retries = 0
    max_mid_drop_retries = 2
    # FIX: pre-initialize so the accept path needn't probe locals() for it.
    asr_text = ""

    # Retry budget: full regeneration loop when enabled, otherwise 2 tries.
    max_attempts = MAX_REGENERATION_ATTEMPTS if ENABLE_REGENERATION_LOOP else 2
    current_tts_params = tts_params.copy()

    # FIX: hoisted out of the attempt loop — this toggle never changes per attempt.
    asr_enabled = enable_asr if enable_asr is not None else ENABLE_ASR

    logging.info(f"🎛️ Chunk {chunk_id_str} initial TTS params: exag={current_tts_params.get('exaggeration', 'N/A'):.3f}, cfg={current_tts_params.get('cfg_weight', 'N/A'):.3f}, temp={current_tts_params.get('temperature', 'N/A'):.3f}, min_p={current_tts_params.get('min_p', 'N/A'):.3f}")

    for attempt_num in range(max_attempts):
        logging.info(f"🔁 Starting TTS for chunk {chunk_id_str}, attempt {attempt_num + 1}/{max_attempts}")
        if attempt_num > 0:
            logging.info(f"🔧 Adjusted params: exag={current_tts_params.get('exaggeration', 'N/A'):.3f}, cfg={current_tts_params.get('cfg_weight', 'N/A'):.3f}, temp={current_tts_params.get('temperature', 'N/A'):.3f}")
        try:
            # Only forward parameters the model's generate() actually accepts.
            supported_params = {"exaggeration", "cfg_weight", "temperature", "min_p", "top_p", "repetition_penalty"}
            tts_args = {k: v for k, v in current_tts_params.items() if k in supported_params}

            with torch.no_grad():
                wav = model.generate(chunk, **tts_args).detach().cpu()

            # Ensure a channel dimension before conversion.
            if wav.dim() == 1:
                wav = wav.unsqueeze(0)

            import io
            import soundfile as sf

            # Tensor -> in-memory WAV -> pydub segment (no temp files).
            wav_np = wav.squeeze().numpy()
            with io.BytesIO() as wav_buffer:
                sf.write(wav_buffer, wav_np, model.sr, format='wav')
                wav_buffer.seek(0)
                audio_segment = AudioSegment.from_wav(wav_buffer)

            quality_score = 1.0

            # Penalize takes with a suspicious mid-chunk energy dropout.
            if ENABLE_MID_DROP_CHECK and has_mid_energy_drop(wav, model.sr):
                quality_score *= 0.3
                logging.info(f"⚠️ Mid-chunk energy drop detected in {chunk_id_str}")

            # Composite quality scoring (spectral + optional ASR component).
            if ENABLE_REGENERATION_LOOP:
                from modules.audio_processor import evaluate_chunk_quality

                composite_score = evaluate_chunk_quality(audio_segment, chunk, include_spectral=True, asr_model=asr_model)
                quality_score *= composite_score
                logging.info(f"📊 Quality score for {chunk_id_str}: {quality_score:.3f} (composite: {composite_score:.3f})")

            asr_score = 1.0

            if asr_enabled and asr_model is not None:
                from modules.audio_processor import calculate_text_similarity
                try:
                    # Downmix stereo and normalize to float32 for the ASR model.
                    samples = np.array(audio_segment.get_array_of_samples())
                    if audio_segment.channels == 2:
                        samples = samples.reshape((-1, 2)).mean(axis=1)

                    audio_np = samples.astype(np.float32) / audio_segment.max_possible_amplitude
                    result = asr_model.transcribe(audio_np)

                    if not isinstance(result, dict) or "text" not in result:
                        raise ValueError(f"Invalid ASR result type: {type(result)}")

                    asr_text = result.get("text", "").strip()
                    asr_score = calculate_text_similarity(punc_norm(chunk), asr_text)
                    logging.info(f"🎤 ASR similarity for chunk {chunk_id_str}: {asr_score:.3f} - Expected: '{punc_norm(chunk)}' Got: '{asr_text}'")

                except Exception as e:
                    logging.error(f"❌ ASR failed for {chunk_id_str}: {e}")
                    asr_score = 0.8  # best-effort score when transcription breaks

            quality_score *= asr_score

            # Accept when above threshold, or keep the last take on the final
            # attempt rather than failing outright.
            if quality_score >= QUALITY_THRESHOLD or attempt_num == max_attempts - 1:
                if quality_score >= QUALITY_THRESHOLD:
                    logging.info(f"✅ Quality acceptable for {chunk_id_str} on attempt {attempt_num + 1} (final score: {quality_score:.3f})")
                else:
                    logging.info(f"⚠️ Max attempts reached for {chunk_id_str}, accepting best effort (final score: {quality_score:.3f})")

                final_audio = audio_segment
                best_sim = asr_score if asr_enabled else 1.0
                best_asr_text = asr_text if asr_enabled else ""
                break
            else:
                # Below threshold: tweak the TTS parameters and try again.
                logging.info(f"🔄 Quality below threshold ({quality_score:.3f} < {QUALITY_THRESHOLD}), adjusting parameters for retry {attempt_num + 2}")
                from modules.audio_processor import adjust_parameters_for_retry
                current_tts_params = adjust_parameters_for_retry(current_tts_params, quality_score, attempt_num)
                continue

        except Exception as e:
            import traceback
            logging.error(f"Exception during TTS attempt {attempt_num + 1} for chunk {chunk_id_str}: {e}")
            traceback.print_exc()
            continue

    if 'final_audio' not in locals():
        logging.info(f"❌ Chunk {chunk_id_str} failed all attempts.")
        return None, None

    from modules.audio_processor import process_audio_with_trimming_and_silence

    # Boundary-aware post-processing: contextual silence for chapter or
    # paragraph boundaries, otherwise plain endpoint trimming when enabled.
    if boundary_type and boundary_type != "none":
        final_audio = process_audio_with_trimming_and_silence(final_audio, boundary_type)
        print(f"🔇 Added {boundary_type} silence to chunk {i+1:05}")
    else:
        if ENABLE_AUDIO_TRIMMING:
            from modules.audio_processor import trim_audio_endpoint
            final_audio = trim_audio_endpoint(final_audio)

    final_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav"
    final_audio.export(final_path, format="wav")
    logging.info(f"✅ Saved final chunk: {final_path.name}")

    # Record the validation outcome for this chunk.
    if asr_enabled and best_sim < 0.8:
        log_run_func(f"ASR VALIDATION FAILED - Chunk {chunk_id_str}:\nExpected:\n{chunk}\nActual:\n{best_asr_text}\nSimilarity: {best_sim:.3f}\n" + "="*50, log_path)
    elif not asr_enabled:
        log_run_func(f"Chunk {chunk_id_str}: Original text: {chunk}", log_path)

    # Free the raw waveform and tidy allocator state.
    del wav
    optimize_memory_usage()

    # Extra periodic cache clear on top of the regular cleanup.
    if (i + 1) % 50 == 0:
        torch.cuda.empty_cache()
        gc.collect()

    return i, final_path
|
|
| |
| |
| |
|
|
| from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer |
| from wrapper.chunk_loader import save_chunks |
|
|
def smooth_sentiment_scores(scores, index, method="rolling", window=3):
    """
    Apply sentiment smoothing to prevent harsh emotional transitions.

    Args:
        scores: List of compound sentiment scores
        index: Current chunk index
        method: "rolling" for moving average, "exp_decay" for exponential decay
        window: Number of previous chunks to consider

    Returns:
        float: Smoothed sentiment score
    """
    if index == 0:
        return scores[0]

    # Slice the trailing window ending at the current chunk (inclusive).
    lo = max(0, index - window + 1)
    recent = scores[lo:index + 1]

    if method == "rolling":
        return sum(recent) / len(recent)

    if method == "exp_decay":
        # Most recent score gets the largest decay weight.
        decay = SENTIMENT_EXP_DECAY_WEIGHTS[:len(recent)]
        total_weight = sum(decay[:len(recent)])
        if total_weight <= 0:
            return recent[-1]
        return sum(w * s for w, s in zip(decay, reversed(recent))) / total_weight

    # Unknown smoothing method: return the raw score unchanged.
    return scores[index]
|
|
def generate_enriched_chunks(text_file, output_dir, user_tts_params=None, quality_params=None, config_params=None, voice_name=None):
    """Reads a text file, performs VADER sentiment analysis, and returns enriched chunks.

    Each chunk is annotated with word count, boundary type, raw/smoothed
    sentiment, and per-chunk TTS parameters derived from the sentiment score.
    The enriched list is also written to ``<output_dir>/chunks_info.json``.

    Args:
        text_file: Path to the source text file (read as UTF-8).
        output_dir: Directory that receives ``chunks_info.json``.
        user_tts_params: Optional dict of base TTS parameters (falls back to
            config defaults when absent).
        quality_params: Optional GUI overrides for sentiment smoothing.
        config_params: Optional GUI overrides for VADER sensitivity values.
        voice_name: Voice identifier stored as metadata when provided.

    Returns:
        List of enriched chunk dicts (metadata entry excluded).
    """
    analyzer = SentimentIntensityAnalyzer()

    # Smoothing configuration: GUI-provided quality_params take precedence
    # over the module-level config defaults.
    if quality_params:
        enable_smoothing = quality_params.get('sentiment_smoothing', ENABLE_SENTIMENT_SMOOTHING)
        smoothing_window = quality_params.get('smoothing_window', SENTIMENT_SMOOTHING_WINDOW)
        smoothing_method = quality_params.get('smoothing_method', SENTIMENT_SMOOTHING_METHOD)
        print(f"🔧 JSON Generation: Using GUI smoothing settings - Enabled: {enable_smoothing}, Window: {smoothing_window}, Method: {smoothing_method}")
    else:
        enable_smoothing = ENABLE_SENTIMENT_SMOOTHING
        smoothing_window = SENTIMENT_SMOOTHING_WINDOW
        smoothing_method = SENTIMENT_SMOOTHING_METHOD
        print(f"🔧 JSON Generation: Using config smoothing settings - Enabled: {enable_smoothing}")

    # VADER sensitivity knobs: again GUI overrides win over config defaults.
    if config_params:
        vader_exag_sensitivity = config_params.get('vader_exag_sensitivity', VADER_EXAGGERATION_SENSITIVITY)
        vader_cfg_sensitivity = config_params.get('vader_cfg_sensitivity', VADER_CFG_WEIGHT_SENSITIVITY)
        vader_temp_sensitivity = config_params.get('vader_temp_sensitivity', VADER_TEMPERATURE_SENSITIVITY)
        print(f"🔧 JSON Generation: Using GUI VADER sensitivity - Exag: {vader_exag_sensitivity}, CFG: {vader_cfg_sensitivity}, Temp: {vader_temp_sensitivity}")
    else:
        vader_exag_sensitivity = VADER_EXAGGERATION_SENSITIVITY
        vader_cfg_sensitivity = VADER_CFG_WEIGHT_SENSITIVITY
        vader_temp_sensitivity = VADER_TEMPERATURE_SENSITIVITY
        print(f"🔧 JSON Generation: Using config VADER sensitivity - Exag: {vader_exag_sensitivity}, CFG: {vader_cfg_sensitivity}, Temp: {vader_temp_sensitivity}")

    # Normalize punctuation, then split into (text, is_paragraph_end) pairs.
    raw_text = text_file.read_text(encoding='utf-8')
    cleaned = smart_punctuate(raw_text)
    chunks = sentence_chunk_text(cleaned)

    # Base TTS parameters, user-supplied or config defaults.
    if user_tts_params:
        base_exaggeration = user_tts_params.get('exaggeration', BASE_EXAGGERATION)
        base_cfg_weight = user_tts_params.get('cfg_weight', BASE_CFG_WEIGHT)
        base_temperature = user_tts_params.get('temperature', BASE_TEMPERATURE)
        base_min_p = user_tts_params.get('min_p', DEFAULT_MIN_P)
        base_top_p = user_tts_params.get('top_p', DEFAULT_TOP_P)
        base_repetition_penalty = user_tts_params.get('repetition_penalty', DEFAULT_REPETITION_PENALTY)
        use_vader = user_tts_params.get('use_vader', True)

    else:
        base_exaggeration = BASE_EXAGGERATION
        base_cfg_weight = BASE_CFG_WEIGHT
        base_temperature = BASE_TEMPERATURE
        base_min_p = DEFAULT_MIN_P
        base_top_p = DEFAULT_TOP_P
        base_repetition_penalty = DEFAULT_REPETITION_PENALTY
        use_vader = True

    enriched = []
    chunk_texts = [chunk_text for chunk_text, _ in chunks]

    # First pass: raw VADER compound score per chunk (needed up front so
    # smoothing can look back across the whole sequence).
    raw_sentiment_scores = []
    for chunk_text, _ in chunks:
        sentiment_scores = analyzer.polarity_scores(chunk_text)
        raw_sentiment_scores.append(sentiment_scores['compound'])

    # Second pass: smooth scores, derive per-chunk TTS params, tag boundaries.
    for i, (chunk_text, is_para_end) in enumerate(chunks):

        raw_compound_score = raw_sentiment_scores[i]

        # Optionally smooth against preceding chunks to avoid abrupt
        # emotional swings between consecutive chunks.
        if use_vader and enable_smoothing:
            compound_score = smooth_sentiment_scores(
                raw_sentiment_scores,
                i,
                method=smoothing_method,
                window=smoothing_window
            )

            if abs(compound_score - raw_compound_score) > 0.1:
                logging.info(f"📊 Chunk {i+1:05}: sentiment smoothed {raw_compound_score:.3f} → {compound_score:.3f}")
        else:
            compound_score = raw_compound_score

        if use_vader:
            # Scale each base parameter by sentiment, then clamp to the
            # configured min/max range and round to a stable precision.
            exaggeration = base_exaggeration + (compound_score * vader_exag_sensitivity)
            cfg_weight = base_cfg_weight + (compound_score * vader_cfg_sensitivity)
            temperature = base_temperature + (compound_score * vader_temp_sensitivity)
            min_p = base_min_p + (compound_score * VADER_MIN_P_SENSITIVITY)
            repetition_penalty = base_repetition_penalty + (compound_score * VADER_REPETITION_PENALTY_SENSITIVITY)

            exaggeration = round(max(TTS_PARAM_MIN_EXAGGERATION, min(exaggeration, TTS_PARAM_MAX_EXAGGERATION)), 2)
            cfg_weight = round(max(TTS_PARAM_MIN_CFG_WEIGHT, min(cfg_weight, TTS_PARAM_MAX_CFG_WEIGHT)), 2)
            temperature = round(max(TTS_PARAM_MIN_TEMPERATURE, min(temperature, TTS_PARAM_MAX_TEMPERATURE)), 2)
            min_p = round(max(TTS_PARAM_MIN_MIN_P, min(min_p, TTS_PARAM_MAX_MIN_P)), 3)
            repetition_penalty = round(max(TTS_PARAM_MIN_REPETITION_PENALTY, min(repetition_penalty, TTS_PARAM_MAX_REPETITION_PENALTY)), 1)

            # Only log when the adjustment is noticeable.
            if abs(exaggeration - base_exaggeration) > 0.05 or abs(cfg_weight - base_cfg_weight) > 0.05:
                logging.info(f"🎭 Chunk {i+1:05}: VADER adjusted params - exag: {base_exaggeration:.2f}→{exaggeration:.2f}, cfg: {base_cfg_weight:.2f}→{cfg_weight:.2f}, sentiment: {compound_score:.3f}")
        else:
            # VADER disabled: every chunk uses the base parameters verbatim.
            exaggeration = base_exaggeration
            cfg_weight = base_cfg_weight
            temperature = base_temperature
            min_p = base_min_p
            repetition_penalty = base_repetition_penalty

        boundary_type = detect_content_boundaries(chunk_text, i, chunk_texts, is_para_end)

        enriched.append({
            "index": i,
            "text": chunk_text,
            "word_count": len(chunk_text.split()),
            "boundary_type": boundary_type if boundary_type else "none",
            "sentiment_compound": compound_score,
            "sentiment_raw": raw_compound_score,
            "tts_params": {
                "exaggeration": exaggeration,
                "cfg_weight": cfg_weight,
                "temperature": temperature,
                "min_p": min_p,
                "top_p": base_top_p,
                "repetition_penalty": repetition_penalty
            }
        })

    output_json_path = output_dir / "chunks_info.json"

    # Persist to JSON, optionally prepending a metadata record that captures
    # the voice used; fall back to a comment-style annotation on failure.
    if voice_name:

        try:

            metadata = {
                "_metadata": True,
                "voice_used": voice_name,
                "generation_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "total_chunks": len(enriched)
            }
            enriched_with_metadata = [metadata] + enriched
            save_chunks(output_json_path, enriched_with_metadata)
            print(f"✅ Saved voice metadata: {voice_name}")
        except Exception as e:

            print(f"⚠️ Metadata method failed, using comment fallback: {e}")
            save_chunks(output_json_path, enriched)

            # Fallback path annotates the JSON via the voice detector helper.
            from modules.voice_detector import add_voice_to_json
            add_voice_to_json(output_json_path, voice_name, method="comment")
    else:
        save_chunks(output_json_path, enriched)

    # Note: returns the list WITHOUT the metadata entry.
    return enriched
|
|
| def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=False, enable_asr=None, quality_params=None, config_params=None, specific_text_file=None): |
| """Enhanced book processing with batch processing to prevent hangs""" |
| print(f"🔍 DEBUG: Entering process_book_folder with book_dir='{book_dir}', voice_path='{voice_path}'") |
|
|
| |
| if quality_params: |
| print(f"🔧 Applying GUI quality parameters: {quality_params}") |
|
|
| |
| global ENABLE_REGENERATION_LOOP, ENABLE_SENTIMENT_SMOOTHING, ENABLE_MFCC_VALIDATION |
| global ENABLE_OUTPUT_VALIDATION, QUALITY_THRESHOLD, OUTPUT_VALIDATION_THRESHOLD |
| global SENTIMENT_SMOOTHING_WINDOW, SENTIMENT_SMOOTHING_METHOD, SPECTRAL_ANOMALY_THRESHOLD |
|
|
| ENABLE_REGENERATION_LOOP = quality_params.get('regeneration_enabled', ENABLE_REGENERATION_LOOP) |
| ENABLE_SENTIMENT_SMOOTHING = quality_params.get('sentiment_smoothing', ENABLE_SENTIMENT_SMOOTHING) |
| ENABLE_MFCC_VALIDATION = quality_params.get('mfcc_validation', ENABLE_MFCC_VALIDATION) |
| ENABLE_OUTPUT_VALIDATION = quality_params.get('output_validation', ENABLE_OUTPUT_VALIDATION) |
| QUALITY_THRESHOLD = quality_params.get('quality_threshold', QUALITY_THRESHOLD) |
| OUTPUT_VALIDATION_THRESHOLD = quality_params.get('output_threshold', OUTPUT_VALIDATION_THRESHOLD) |
| SENTIMENT_SMOOTHING_WINDOW = quality_params.get('smoothing_window', SENTIMENT_SMOOTHING_WINDOW) |
| SENTIMENT_SMOOTHING_METHOD = quality_params.get('smoothing_method', SENTIMENT_SMOOTHING_METHOD) |
| SPECTRAL_ANOMALY_THRESHOLD = quality_params.get('spectral_threshold', SPECTRAL_ANOMALY_THRESHOLD) |
|
|
| print(f"✅ Quality settings applied - Regeneration: {ENABLE_REGENERATION_LOOP}, MFCC: {ENABLE_MFCC_VALIDATION}, Output Validation: {ENABLE_OUTPUT_VALIDATION}") |
|
|
| from src.chatterbox.tts import punc_norm |
| print(f"🔍 DEBUG: Successfully imported punc_norm") |
|
|
| |
| print(f"🔍 DEBUG: Calling setup_book_directories...") |
| output_root, tts_dir, text_chunks_dir, audio_chunks_dir = setup_book_directories(book_dir) |
| print(f"🔍 DEBUG: Directory setup complete") |
|
|
| |
| if skip_cleanup: |
| print(f"🔄 RESUME MODE: Skipping cleanup to preserve existing chunks") |
| print(f"📁 Preserving: {text_chunks_dir}, {audio_chunks_dir}") |
| else: |
| print(f"🧹 FRESH PROCESSING: Cleaning previous processing files...") |
| import glob |
|
|
| |
| for txt_file in text_chunks_dir.glob("*.txt"): |
| txt_file.unlink(missing_ok=True) |
| for json_file in text_chunks_dir.glob("*.json"): |
| json_file.unlink(missing_ok=True) |
|
|
| |
| for wav_file in audio_chunks_dir.glob("*.wav"): |
| wav_file.unlink(missing_ok=True) |
|
|
| |
| for log_file in output_root.glob("*.log"): |
| log_file.unlink(missing_ok=True) |
|
|
| print(f"✅ Cleanup complete") |
|
|
| |
| print(f"🔍 DEBUG: Calling find_book_files...") |
| book_files = find_book_files(book_dir) |
| |
| |
| if specific_text_file: |
| text_file_to_use = Path(specific_text_file) |
| print(f"🎯 DEBUG: Using GUI-selected text file: {text_file_to_use}") |
| if not text_file_to_use.exists(): |
| logging.error(f"[{book_dir.name}] ERROR: Selected text file not found: {text_file_to_use}") |
| return None, None, [] |
| else: |
| text_file_to_use = book_files['text'] |
| print(f"🔍 DEBUG: Using auto-detected text file: {text_file_to_use}") |
| if not text_file_to_use: |
| logging.info(f"[{book_dir.name}] ERROR: No .txt files found in the book folder.") |
| return None, None, [] |
| |
| cover_file = book_files['cover'] |
| nfo_file = book_files['nfo'] |
|
|
| setup_logging(output_root) |
|
|
| |
| voice_name_for_log = voice_path.stem if hasattr(voice_path, 'stem') else Path(voice_path).stem |
|
|
| |
| print(f"🔍 DEBUG: About to call generate_enriched_chunks with quality_params: {quality_params}") |
| print(f"🔍 DEBUG: About to call generate_enriched_chunks with config_params: {config_params}") |
| print(f"🔍 DEBUG: Using voice: {voice_name_for_log}") |
| all_chunks = generate_enriched_chunks(text_file_to_use, text_chunks_dir, tts_params, quality_params, config_params, voice_name_for_log) |
|
|
| |
| print(f"🔍 DEBUG: Creating run_log_lines...") |
| print(f"🔍 DEBUG: voice_path type: {type(voice_path)}, value: {voice_path}") |
|
|
| run_log_lines = [ |
| f"\n===== Processing: {book_dir.name} =====", |
| f"Voice: {voice_name_for_log}", |
        # Final header lines of the run log started earlier in this function.
        f"Started: {time.strftime('%Y-%m-%d %H:%M:%S')}",
        f"Text file processed: {text_file_to_use.name}",
        f"Total chunks generated: {len(all_chunks)}"
    ]

    # --- Timing / bookkeeping for the whole TTS run ---
    start_time = time.time()
    total_chunks = len(all_chunks)
    log_path = output_root / "chunk_validation.log"
    total_audio_duration = 0.0  # running sum (seconds) of generated chunk audio

    print(f"📊 Processing {total_chunks} chunks in batches of {BATCH_SIZE}")

    all_results = []  # (chunk_index, wav_path) tuples accumulated across batches

    # --- Outer batch loop: model is (re)loaded and torn down per batch to
    # bound GPU memory growth over long books ---
    for batch_start in range(0, total_chunks, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, total_chunks)
        batch_chunks = all_chunks[batch_start:batch_end]

        print(f"\n🔄 Processing batch: chunks {batch_start+1}-{batch_end}")

        # Fresh TTS model per batch; voice sample is converted/validated once
        # per batch into tts_dir before prewarming.
        model = load_optimized_model(device)
        compatible_voice = ensure_voice_sample_compatibility(voice_path, output_dir=tts_dir)

        model = prewarm_model_with_voice(model, compatible_voice, tts_params)

        # --- Optional ASR (validation) model, loaded per batch ---
        asr_model = None
        asr_device_used = None  # NOTE(review): assigned but never read in this section — confirm it is used elsewhere or drop it

        # Explicit enable_asr argument wins; otherwise fall back to config default.
        asr_enabled = enable_asr if enable_asr is not None else ENABLE_ASR
        if asr_enabled:
            from modules.asr_manager import load_asr_model_adaptive

            asr_config = config_params.get('asr_config', {}) if config_params else {}

            asr_model, asr_device_used = load_asr_model_adaptive(asr_config)

            # Degrade gracefully: run the batch without ASR validation rather than abort.
            if asr_model is None:
                print(f"❌ ASR model loading failed completely - disabling ASR for this batch")
                asr_enabled = False

        futures = []
        batch_results = []  # (chunk_index, wav_path) for this batch only

        optimal_workers = get_optimal_workers()
        print(f"🔧 Using {optimal_workers} workers for batch {batch_start+1}-{batch_end}")

        use_vader = tts_params.get('use_vader', True)

        if not use_vader:
            # --- Fast path: VADER (per-chunk sentiment tuning) disabled, so chunks
            # can be grouped into sub-batches and synthesized with shared params ---
            print(f"🚀 VADER disabled. Running in high-performance batch mode.")
            tts_batch_size = config_params.get('tts_batch_size', 16)
            chunk_batches = [batch_chunks[i:i + tts_batch_size] for i in range(0, len(batch_chunks), tts_batch_size)]

            print(f"📊 Processing {len(batch_chunks)} chunks in {len(chunk_batches)} batches of size {tts_batch_size}.")

            with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
                for batch in chunk_batches:
                    # Honor Ctrl-C style shutdown: stop submitting new work.
                    if shutdown_requested:
                        break
                    futures.append(executor.submit(
                        process_batch,
                        batch, text_chunks_dir, audio_chunks_dir,
                        voice_path, tts_params, start_time, total_chunks,
                        punc_norm, book_dir.name, log_run, log_path, device,
                        model, asr_model, all_chunks, asr_enabled
                    ))

                # Each future returns a list of (chunk_index, wav_path) results.
                for fut in as_completed(futures):
                    try:
                        results_list = fut.result()
                        for idx, wav_path in results_list:
                            if wav_path and wav_path.exists():
                                chunk_duration = get_chunk_audio_duration(wav_path)
                                total_audio_duration += chunk_duration
                                batch_results.append((idx, wav_path))
                                # NOTE(review): progress count resets each outer batch
                                # (len(batch_results) is per-batch, not global) and
                                # disagrees with the VADER path below — confirm intent.
                                log_chunk_progress(len(batch_results), total_chunks, start_time, total_audio_duration)
                    except Exception as e:
                        logging.error(f"Future failed in batch: {e}")
        else:
            # --- Nuanced path: one task per chunk so each chunk can carry its own
            # VADER-derived TTS parameters and boundary metadata ---
            print(f"🎨 VADER enabled. Running in nuanced, single-chunk mode.")
            with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
                for i, chunk_data in enumerate(batch_chunks):
                    global_chunk_index = batch_start + i

                    if shutdown_requested:
                        print(f"\n⏹️ {YELLOW}Stopping submission of new chunks...{RESET}")
                        break

                    # New-style chunks are dicts with text/boundary/params;
                    # old-style chunks are (text, is_paragraph_end) sequences.
                    if isinstance(chunk_data, dict):
                        chunk = chunk_data["text"]
                        boundary_type = chunk_data.get("boundary_type", "none")
                        # Per-chunk params (e.g. VADER-adjusted) override the run defaults.
                        chunk_tts_params = chunk_data.get("tts_params", tts_params)
                    else:
                        chunk = chunk_data[0] if len(chunk_data) > 0 else str(chunk_data)
                        is_old_para_end = chunk_data[1] if len(chunk_data) > 1 else False
                        boundary_type = "paragraph_end" if is_old_para_end else "none"
                        chunk_tts_params = tts_params

                    futures.append(executor.submit(
                        process_one_chunk,
                        global_chunk_index, chunk, text_chunks_dir, audio_chunks_dir,
                        voice_path, chunk_tts_params, start_time, total_chunks,
                        punc_norm, book_dir.name, log_run, log_path, device,
                        model, asr_model, boundary_type=boundary_type,
                        enable_asr=asr_enabled
                    ))

                print(f"🔄 {CYAN}Waiting for batch {batch_start+1}-{batch_end} to complete...{RESET}")
                completed_count = 0

                # Each future returns a single (chunk_index, wav_path) pair.
                for fut in as_completed(futures):
                    try:
                        idx, wav_path = fut.result()
                        if wav_path and wav_path.exists():
                            chunk_duration = get_chunk_audio_duration(wav_path)
                            total_audio_duration += chunk_duration
                            batch_results.append((idx, wav_path))

                        # Log every other completion to limit console noise.
                        # NOTE(review): completion order is not submission order, so
                        # batch_start + completed_count - 1 is only an approximate
                        # chunk number; also inconsistent with the batch-mode path.
                        completed_count += 1
                        if completed_count % 2 == 0:
                            log_chunk_progress(batch_start + completed_count - 1, total_chunks, start_time, total_audio_duration)

                    except Exception as e:
                        logging.error(f"Future failed in batch: {e}")

        # --- Per-batch teardown: release TTS + ASR models and GPU memory ---
        print(f"🧹 Cleaning up after batch {batch_start+1}-{batch_end}")
        del model
        if asr_model:
            from modules.asr_manager import cleanup_asr_model
            cleanup_asr_model(asr_model)
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(2)  # presumably gives the driver time to settle before reloading — TODO confirm necessity

        all_results.extend(batch_results)
        print(f"✅ Batch {batch_start+1}-{batch_end} completed ({len(batch_results)} chunks)")

    # --- Post-processing: let the user inspect quarantined (suspect) chunks ---
    quarantine_dir = audio_chunks_dir / "quarantine"
    pause_for_chunk_review(quarantine_dir)

    # Final chunk list comes from disk, not all_results, so manual fixes made
    # during the review pause are picked up.
    chunk_paths = get_audio_files_in_directory(audio_chunks_dir)

    if not chunk_paths:
        logging.info(f"{RED}❌ No valid audio chunks found. Skipping concatenation and conversion.{RESET}")
        return None, None, []

    # --- Summary statistics ---
    elapsed_total = time.time() - start_time
    elapsed_td = timedelta(seconds=int(elapsed_total))

    # Re-measure duration from the files on disk (authoritative after review).
    total_audio_duration_final = sum(get_chunk_audio_duration(chunk_path) for chunk_path in chunk_paths)
    audio_duration_td = timedelta(seconds=int(total_audio_duration_final))
    realtime_factor = total_audio_duration_final / elapsed_total if elapsed_total > 0 else 0.0

    print(f"\n⏱️ TTS Processing Complete:")
    print(f" Elapsed Time: {CYAN}{str(elapsed_td)}{RESET}")
    print(f" Audio Duration: {GREEN}{str(audio_duration_td)}{RESET}")
    print(f" Realtime Factor: {YELLOW}{realtime_factor:.2f}x{RESET}")

    # --- Combine chunks into one WAV, then convert to M4B with metadata ---
    voice_name = voice_path.stem if hasattr(voice_path, 'stem') else Path(voice_path).stem
    combined_wav_path = output_root / f"{book_dir.name} [{voice_name}].wav"
    print("\n💾 Saving WAV file...")
    combine_audio_chunks(chunk_paths, combined_wav_path)

    temp_m4b_path = output_root / "output.m4b"
    # NOTE(review): no space before '[' here, unlike the WAV name above — confirm intended.
    final_m4b_path = output_root / f"{book_dir.name}[{voice_name}].m4b"
    convert_to_m4b(combined_wav_path, temp_m4b_path)
    add_metadata_to_m4b(temp_m4b_path, final_m4b_path, cover_file, nfo_file)

    logging.info(f"Audiobook created: {final_m4b_path}")

    # --- Append generation settings and stats to the run log ---
    run_log_lines.extend([
        f"Combined WAV: {combined_wav_path}",
        "--- Generation Settings ---",
        f"Batch Processing: Enabled ({BATCH_SIZE} chunks per batch)",
        f"ASR Enabled: {ENABLE_ASR}",
        f"Hum Detection: {ENABLE_HUM_DETECTION}",
        f"Dynamic Workers: {USE_DYNAMIC_WORKERS}",
        f"Voice used: {voice_name}",
        f"Exaggeration: {tts_params['exaggeration']}",
        f"CFG weight: {tts_params['cfg_weight']}",
        f"Temperature: {tts_params['temperature']}",
        f"Processing Time: {str(elapsed_td)}",
        f"Audio Duration: {str(audio_duration_td)}",
        f"Realtime Factor: {realtime_factor:.2f}x",
        f"Total Chunks: {len(chunk_paths)}"
    ])

    log_run("\n".join(run_log_lines), output_root / "run.log")
    print(f"📝 Run log written to: {output_root / 'run.log'}")

    # Returns: (final .m4b path, combined .wav path, run-log lines).
    return final_m4b_path, combined_wav_path, run_log_lines