danneauxs committed · f3cff30
Parent(s): 67b64d0

vader check and batch processing

Changed files:
- BATCH_IMPLEMENTATION_PLAN.md +58 -0
- app.py +2 -2
- app.py.20250811-120000.bak +523 -0
- config/config.py +252 -0
- config/config.py.20250811-120000.bak +251 -0
- config/config.py~ +251 -0
- gradio_main_interface.py +27 -10
- gradio_tabs/gradio_imports.py +85 -0
- gradio_tabs/tab1_convert_book.py +15 -22
- gradio_tabs/tab1_convert_book.py.20250811-120000.bak +1173 -0
- gradio_tabs/tab2_configuration.py +68 -45
- gradio_tabs/tab_diagnostics.py +558 -0
- hold/chatterbox (copy).tar.gz +3 -0
- modules/tts_engine.py +143 -575
- modules/tts_engine.py.20250811-120000.bak +710 -0
- src/chatterbox/models/t3/t3.py +1 -0
- src/chatterbox/tts.py +135 -1
- src/chatterbox/tts.py.20250811-120000.bak +281 -0
- test_parallel_performance.py +235 -0
- utils/generate_from_json (copy).py +146 -0
BATCH_IMPLEMENTATION_PLAN.md
ADDED
@@ -0,0 +1,58 @@
+# Plan for Implementing High-Performance Batch Processing
+
+This document outlines the necessary code modifications to implement a high-performance batch processing mode that can be toggled by the "Use VADER" checkbox in the GUI.
+
+The goal is to create two distinct modes:
+- **VADER On (Nuanced Mode):** Slower; processes chunks one by one with unique TTS parameters for nuanced delivery.
+- **VADER Off (Batch Mode):** Significantly faster; processes chunks in batches with a single set of TTS parameters.
+
+---
+
+## 1. File to Modify: `src/chatterbox/tts.py`
+
+* **Purpose:** To enable the core TTS model to handle batches of text.
+* **Changes Needed:**
+    * A new method, `generate_batch(self, texts: list, **tts_params)`, needs to be created within the `ChatterboxTTS` class.
+    * This method must perform the following steps (see the sketch after this list):
+        1. Accept a list of text strings (`texts`).
+        2. Tokenize each text string in the list.
+        3. Pad the tokenized sequences so they all have the same length, creating a single batch tensor. `torch.nn.utils.rnn.pad_sequence` is suitable for this.
+        4. Feed the complete batch tensor to the underlying model (`self.t3.inference` and `self.s3gen.inference`).
+        5. Return a list of the resulting audio waveforms.
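The padding-based batching in steps 2-4 can be sketched as follows. This is a minimal, hypothetical implementation: the `self.tokenizer.encode` call and the batched `self.t3.inference` / `self.s3gen.inference` signatures are assumptions about the ChatterboxTTS internals, not confirmed APIs.

```python
import torch
from torch.nn.utils.rnn import pad_sequence

def generate_batch(self, texts: list, **tts_params):
    # Steps 1-2: tokenize each text into a 1-D tensor of token ids
    # (self.tokenizer.encode is an assumed interface).
    token_seqs = [torch.tensor(self.tokenizer.encode(t)) for t in texts]
    # Step 3: pad to a common length, giving one (batch, max_len) tensor.
    batch = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    # Step 4: run the whole batch through the underlying models,
    # assumed here to accept batched input.
    speech_tokens = self.t3.inference(batch, **tts_params)
    wavs = self.s3gen.inference(speech_tokens)
    # Step 5: one waveform per input text.
    return list(wavs)
```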
+
+---
+
+## 2. File to Modify: `modules/tts_engine.py`
+
+* **Purpose:** To orchestrate the new batching workflow and choose the processing mode.
+* **Changes Needed:**
+
+### a. Create a New Worker Function
+* Add a new function: `process_batch(batch_of_chunks, model, ...)`
+* This function will (see the sketch below):
+    1. Accept a list of chunk objects (e.g., a batch of 16).
+    2. Extract the text from each chunk into a simple list.
+    3. Call the new `model.generate_batch()` with the list of texts and the shared TTS parameters.
+    4. Receive a list of audio waveforms back.
+    5. Loop through the audio waveforms, apply the existing silence trimming and padding logic to each one, and save them to their respective `chunk_...wav` files.
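A hypothetical sketch of that worker; the chunk attributes (`.text`, `.index`), the `trim_and_pad_silence` helper, and `model.sr` stand in for the existing `tts_engine` logic and are not confirmed names:

```python
from pathlib import Path
import torchaudio

def process_batch(batch_of_chunks, model, output_dir: Path, **tts_params):
    texts = [chunk.text for chunk in batch_of_chunks]     # step 2
    wavs = model.generate_batch(texts, **tts_params)      # steps 3-4
    for chunk, wav in zip(batch_of_chunks, wavs):         # step 5
        # Reuse the existing per-chunk post-processing (assumed helper name).
        wav = trim_and_pad_silence(wav)
        # torchaudio.save expects a (channels, samples) tensor.
        out_path = output_dir / f"chunk_{chunk.index:05d}.wav"
        torchaudio.save(str(out_path), wav, model.sr)
```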
+
+### b. Modify the Main `process_book_folder` Function
+* Locate the `use_vader` flag, which is determined from the GUI options.
+* Wrap the core processing loop in an `if/else` block based on this flag.
+* **`if use_vader:` (Nuanced Mode):**
+    * Keep the existing code that iterates through chunks one by one and submits them to the `process_one_chunk` function.
+* **`else:` (Batch Mode):**
+    * Add the new logic here (see the sketch below).
+    * Group the `all_chunks` list into fixed-size batches based on `TTS_BATCH_SIZE` from the config.
+    * Use the existing `ThreadPoolExecutor` to submit these new **batches** to the new `process_batch` worker function.
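The else branch could look like the following sketch, assuming the `executor` (the existing `ThreadPoolExecutor`), `model`, `output_dir`, and `tts_params` names are already in scope in `process_book_folder`:

```python
from config.config import TTS_BATCH_SIZE

# Group all_chunks into fixed-size batches.
batches = [all_chunks[i:i + TTS_BATCH_SIZE]
           for i in range(0, len(all_chunks), TTS_BATCH_SIZE)]

# Submit whole batches (not single chunks) to the existing pool.
futures = [executor.submit(process_batch, batch, model, output_dir, **tts_params)
           for batch in batches]
for future in futures:
    future.result()  # re-raise any worker exception
```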
+
+---
+
+## 3. Files to Modify: `config/config.py` and `chatterbox_gui.py`
+
+* **Purpose:** To provide user control over the batch size for performance tuning.
+* **Changes Needed:**
+    * **In `config/config.py`:**
+        * Add a new configuration variable: `TTS_BATCH_SIZE = 16` (or another sensible default).
+    * **In `chatterbox_gui.py`:**
+        * On the "Config" tab, add a new `QSpinBox` (numeric input field) linked to the `TTS_BATCH_SIZE` variable. This will allow the user to change the batch size without editing code.
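A minimal sketch of that control (PyQt-style; the widget placement, the 1-64 range, and writing the value back via `setattr` are assumptions):

```python
from PyQt5.QtWidgets import QSpinBox
import config.config as config

batch_size_spin = QSpinBox()
batch_size_spin.setRange(1, 64)                  # assumed sensible bounds
batch_size_spin.setValue(config.TTS_BATCH_SIZE)
# Push changes back into the config module so the engine sees them.
batch_size_spin.valueChanged.connect(
    lambda value: setattr(config, "TTS_BATCH_SIZE", value))
```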
app.py
CHANGED
@@ -495,10 +495,10 @@ def main():
     launcher.run()
 
 if __name__ == "__main__":
-    # Add current directory to Python path for HF Spaces
+    # Add src directory to Python path for HF Spaces
     import sys
     import os
-    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+    sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'src'))
 
     # Fix OpenMP environment variable for HuggingFace Spaces
     os.environ["OMP_NUM_THREADS"] = "1"
app.py.20250811-120000.bak
ADDED
@@ -0,0 +1,523 @@
+#!/usr/bin/env python3
+"""
+Comprehensive Gradio Launcher for ChatterboxTTS
+Automatically handles all requirements, installation, and setup
+"""
+
+import sys
+import os
+import subprocess
+import importlib
+import pkg_resources
+from pathlib import Path
+import time
+
+class GradioLauncher:
+    def __init__(self):
+        self.required_packages = {
+            # Core packages with fallbacks
+            'gradio': {'min_version': '4.0.0', 'install_name': 'gradio>=4.0.0'},
+            'torch': {'min_version': '2.0.0', 'install_name': 'torch>=2.0.0'},
+            'torchaudio': {'min_version': '2.0.0', 'install_name': 'torchaudio>=2.0.0'},
+            'transformers': {'min_version': '4.20.0', 'install_name': 'transformers>=4.20.0'},
+            'huggingface_hub': {'min_version': '0.15.0', 'install_name': 'huggingface_hub>=0.15.0'},
+            'safetensors': {'min_version': '0.3.0', 'install_name': 'safetensors>=0.3.0'},
+
+            # Audio processing
+            'soundfile': {'min_version': '0.12.0', 'install_name': 'soundfile>=0.12.0'},
+            'librosa': {'min_version': '0.10.0', 'install_name': 'librosa>=0.10.0'},
+            'pydub': {'min_version': '0.25.0', 'install_name': 'pydub>=0.25.0'},
+
+            # Voice Analysis (optional but recommended)
+            'parselmouth': {'min_version': '0.4.3', 'install_name': 'praat-parselmouth>=0.4.3', 'optional': True},
+            'matplotlib': {'min_version': '3.5.0', 'install_name': 'matplotlib>=3.5.0'},
+            'scipy': {'min_version': '1.8.0', 'install_name': 'scipy>=1.8.0'},
+            'numpy': {'min_version': '1.21.0', 'install_name': 'numpy>=1.21.0'},
+
+            # System utilities
+            'psutil': {'min_version': '5.8.0', 'install_name': 'psutil>=5.8.0'},
+            'vaderSentiment': {'min_version': '3.3.0', 'install_name': 'vaderSentiment>=3.3.0'},
+        }
+
+        self.chatterbox_git_url = 'git+https://github.com/resemble-ai/chatterbox-tts.git'
+        self.optional_packages = ['parselmouth', 'pynvml']
+
+    def print_header(self):
+        """Print launcher header"""
+        print("=" * 70)
+        print("🚀 ChatterboxTTS Gradio Launcher")
+        print("=" * 70)
+        print("🔧 Comprehensive setup and dependency manager")
+        print("📦 Automatically installs missing requirements")
+        print("🌐 Launches web interface when ready")
+        print("-" * 70)
+
+    def check_python_version(self):
+        """Check if Python version is compatible"""
+        print("🐍 Checking Python version...")
+
+        version_info = sys.version_info
+        if version_info.major < 3 or (version_info.major == 3 and version_info.minor < 8):
+            print("❌ Error: Python 3.8+ required")
+            print(f"   Current version: {version_info.major}.{version_info.minor}.{version_info.micro}")
+            print("   Please upgrade Python and try again")
+            sys.exit(1)
+
+        print(f"✅ Python {version_info.major}.{version_info.minor}.{version_info.micro} - Compatible")
+
+    def check_working_directory(self):
+        """Verify we're in the correct directory"""
+        print("📁 Checking working directory...")
+
+        # NOTE: these two definitions are missing from the file as committed;
+        # they are reconstructed here from the error message below.
+        required_files = ['gradio_main_interface.py', 'gradio_tabs', 'config', 'src']
+        missing_files = [f for f in required_files if not Path(f).exists()]
+        if missing_files:
+            print(f"❌ Error: Missing required files/directories: {', '.join(missing_files)}")
+            print("   Please run this script from the ChatterboxTTS root directory")
+            print("   Expected structure:")
+            print("   ├── gradio_main_interface.py")
+            print("   ├── gradio_tabs/")
+            print("   ├── config/")
+            print("   ├── src/")
+            print("   └── ...")
+            return False
+
+        print("✅ Working directory structure verified")
+        return True
+
+    def create_directories(self):
+        """Create required directories if they don't exist"""
+        print("📁 Creating required directories...")
+
+        directories = ['Voice_Samples', 'Text_Input', 'Audiobook', 'Output', 'voice_analyzer']
+        created = []
+
+        for dir_name in directories:
+            dir_path = Path(dir_name)
+            if not dir_path.exists():
+                dir_path.mkdir(parents=True, exist_ok=True)
+                created.append(dir_name)
+
+        if created:
+            print(f"✅ Created directories: {', '.join(created)}")
+        else:
+            print("✅ All required directories exist")
+
+    def check_package_installed(self, package_name):
+        """Check if a package is installed and get its version"""
+        # If we have a virtual environment, check there first
+        if hasattr(self, 'venv_python') and Path(self.venv_python).exists():
+            try:
+                cmd = [self.venv_python, '-c', f'''
+try:
+    import {package_name}
+    print("INSTALLED", getattr({package_name}, "__version__", "0.0.0"))
+except ImportError:
+    print("NOT_INSTALLED")
+''']
+                result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+                if result.returncode == 0:
+                    output = result.stdout.strip()
+                    if output.startswith("INSTALLED"):
+                        version = output.split(" ", 1)[1] if " " in output else "0.0.0"
+                        return True, version
+                    else:
+                        return False, None
+            except Exception:
+                pass  # Fall back to local check
+
+        # Fallback to local Python environment check
+        try:
+            if package_name == 'parselmouth':
+                # Special case for praat-parselmouth
+                import parselmouth
+                return True, getattr(parselmouth, '__version__', '0.0.0')
+            else:
+                module = importlib.import_module(package_name)
+                version = getattr(module, '__version__', '0.0.0')
+                return True, version
+        except ImportError:
+            try:
+                # Try with pkg_resources as fallback
+                pkg = pkg_resources.get_distribution(package_name)
+                return True, pkg.version
+            except (pkg_resources.DistributionNotFound, ImportError):
+                return False, None
+
+    def compare_versions(self, current, required):
+        """Compare version strings"""
+        try:
+            current_parts = [int(x) for x in current.split('.')]
+            required_parts = [int(x) for x in required.split('.')]
+
+            # Pad shorter version with zeros
+            max_len = max(len(current_parts), len(required_parts))
+            current_parts.extend([0] * (max_len - len(current_parts)))
+            required_parts.extend([0] * (max_len - len(required_parts)))
+
+            return current_parts >= required_parts
+        except (ValueError, AttributeError):
+            # If we can't parse versions, assume it's okay
+            return True
+
+    def setup_virtual_environment(self):
+        """Set up virtual environment if in externally managed environment"""
+        venv_path = Path("venv")
+
+        if not venv_path.exists():
+            print("🔧 Creating virtual environment (externally managed Python detected)...")
+            try:
+                result = subprocess.run(
+                    [sys.executable, '-m', 'venv', 'venv'],
+                    capture_output=True,
+                    text=True,
+                    timeout=60
+                )
+                if result.returncode != 0:
+                    print(f"   ❌ Failed to create virtual environment: {result.stderr}")
+                    return False
+                print("   ✅ Virtual environment created")
+            except Exception as e:
+                print(f"   ❌ Error creating virtual environment: {e}")
+                return False
+        else:
+            print("🔧 Using existing virtual environment...")
+
+        # Update sys.executable to use venv python
+        if os.name == 'nt':  # Windows
+            self.venv_python = str(venv_path / "Scripts" / "python.exe")
+            self.venv_pip = str(venv_path / "Scripts" / "pip.exe")
+        else:  # Unix/Linux/Mac
+            self.venv_python = str(venv_path / "bin" / "python")
+            self.venv_pip = str(venv_path / "bin" / "pip")
+
+        # Verify venv python works
+        try:
+            result = subprocess.run([self.venv_python, '--version'], capture_output=True, text=True)
+            if result.returncode == 0:
+                print(f"   ✅ Virtual environment Python: {result.stdout.strip()}")
+                return True
+            else:
+                print("   ❌ Virtual environment Python not working")
+                return False
+        except Exception as e:
+            print(f"   ❌ Error testing virtual environment: {e}")
+            return False
+
+    def install_package(self, package_spec):
+        """Install a package using pip (with virtual environment support)"""
+        try:
+            print(f"   Installing {package_spec}...")
+
+            # Use venv pip if available, otherwise system pip
+            pip_executable = getattr(self, 'venv_pip', None)
+            if pip_executable and Path(pip_executable).exists():
+                cmd = [pip_executable, 'install', package_spec]
+            else:
+                cmd = [sys.executable, '-m', 'pip', 'install', package_spec]
+
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=300  # 5 minute timeout
+            )
+
+            if result.returncode == 0:
+                print(f"   ✅ Successfully installed {package_spec}")
+                return True
+            else:
+                print(f"   ❌ Failed to install {package_spec}")
+                print(f"   Error: {result.stderr}")
+
+                # If we get externally-managed error, try setting up venv
+                if "externally-managed-environment" in result.stderr and not hasattr(self, 'venv_python'):
+                    print("   🔄 Detected externally managed environment, setting up virtual environment...")
+                    if self.setup_virtual_environment():
+                        # Retry installation with venv
+                        return self.install_package(package_spec)
+
+                return False
+
+        except subprocess.TimeoutExpired:
+            print(f"   ⏰ Installation of {package_spec} timed out")
+            return False
+        except Exception as e:
+            print(f"   ❌ Error installing {package_spec}: {str(e)}")
+            return False
+
+    def check_and_install_requirements(self):
+        """Check and install all required packages"""
+        print("📦 Checking package requirements...")
+
+        missing_packages = []
+        outdated_packages = []
+        optional_missing = []
+
+        # Check each required package
+        for package_name, info in self.required_packages.items():
+            is_installed, current_version = self.check_package_installed(package_name)
+            min_version = info['min_version']
+            is_optional = info.get('optional', False)
+
+            if not is_installed:
+                if is_optional:
+                    optional_missing.append((package_name, info))
+                    print(f"   ⚠️ Optional package missing: {package_name}")
+                else:
+                    missing_packages.append((package_name, info))
+                    print(f"   ❌ Missing required package: {package_name}")
+            elif current_version and not self.compare_versions(current_version, min_version):
+                if is_optional:
+                    print(f"   ⚠️ Optional package outdated: {package_name} {current_version} < {min_version}")
+                else:
+                    outdated_packages.append((package_name, info))
+                    print(f"   ❌ Outdated package: {package_name} {current_version} < {min_version}")
+            else:
+                status = "✅" if not is_optional else "🔧"
+                print(f"   {status} {package_name}: {current_version}")
+
+        # Install missing/outdated packages
+        if missing_packages or outdated_packages:
+            print(f"\n🔧 Installing {len(missing_packages + outdated_packages)} required packages...")
+
+            for package_name, info in missing_packages + outdated_packages:
+                install_spec = info['install_name']
+                if not self.install_package(install_spec):
+                    print(f"❌ Critical error: Failed to install {package_name}")
+                    return False
+
+        # Install ChatterboxTTS if not available
+        print("🤖 Checking ChatterboxTTS installation...")
+        try:
+            import chatterbox
+            print("   ✅ ChatterboxTTS already installed")
+        except ImportError:
+            print("   📥 Installing ChatterboxTTS from GitHub...")
+            if not self.install_package(self.chatterbox_git_url):
+                print("   ⚠️ ChatterboxTTS installation failed - some features may not work")
+
+        # Try to install optional packages
+        if optional_missing:
+            print(f"\n🎯 Installing {len(optional_missing)} optional packages...")
+            for package_name, info in optional_missing:
+                install_spec = info['install_name']
+                if self.install_package(install_spec):
+                    print(f"   ✅ Optional package {package_name} installed successfully")
+                else:
+                    print(f"   ⚠️ Optional package {package_name} failed - voice analysis may be limited")
+
+        return True
+
+    def check_gpu_availability(self):
+        """Check for GPU availability"""
+        print("🖥️ Checking GPU availability...")
+
+        try:
+            import torch
+            if torch.cuda.is_available():
+                gpu_count = torch.cuda.device_count()
+                gpu_name = torch.cuda.get_device_name(0)
+                print(f"   ✅ CUDA GPU available: {gpu_name} ({gpu_count} device{'s' if gpu_count > 1 else ''})")
+                return True
+            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                print("   ✅ Apple Metal Performance Shaders (MPS) available")
+                return True
+            else:
+                print("   ⚠️ No GPU acceleration available - using CPU")
+                print("   💡 For better performance, consider using a GPU-enabled environment")
+                return False
+        except Exception as e:
+            print(f"   ❌ Error checking GPU: {str(e)}")
+            return False
+
+    def verify_installation(self):
+        """Verify that all components can be imported"""
+        print("🔍 Verifying installation...")
+
+        critical_imports = [
+            ('gradio', 'Gradio web interface'),
+            ('torch', 'PyTorch machine learning'),
+            ('transformers', 'Hugging Face transformers'),
+            ('librosa', 'Audio processing'),
+            ('soundfile', 'Audio file I/O'),
+            ('numpy', 'Numerical computing'),
+            ('matplotlib', 'Plotting and visualization')
+        ]
+
+        optional_imports = [
+            ('parselmouth', 'Praat voice analysis'),
+            ('scipy', 'Scientific computing'),
+            ('psutil', 'System monitoring')
+        ]
+
+        failed_critical = []
+        failed_optional = []
+
+        # Check critical imports
+        for module_name, description in critical_imports:
+            try:
+                importlib.import_module(module_name)
+                print(f"   ✅ {description}")
+            except ImportError as e:
+                print(f"   ❌ {description}: {str(e)}")
+                failed_critical.append(module_name)
+
+        # Check optional imports
+        for module_name, description in optional_imports:
+            try:
+                importlib.import_module(module_name)
+                print(f"   🔧 {description}")
+            except ImportError:
+                print(f"   ⚠️ {description}: Not available")
+                failed_optional.append(module_name)
+
+        if failed_critical:
+            print(f"\n❌ Critical imports failed: {', '.join(failed_critical)}")
+            print("   The interface may not work properly")
+            return False
+
+        if failed_optional:
+            print(f"\n⚠️ Optional features unavailable: {', '.join(failed_optional)}")
+            print("   Voice analysis features may be limited")
+
+        print("✅ Installation verification complete")
+        return True
+
+    def launch_interface(self):
+        """Launch the Gradio interface"""
+        print("\n🚀 Launching ChatterboxTTS Gradio Interface...")
+        print("-" * 50)
+
+        # If we're using a virtual environment, launch with venv python
+        if hasattr(self, 'venv_python') and Path(self.venv_python).exists():
+            print("🔧 Using virtual environment Python...")
+            try:
+                print("🌐 Starting web server...")
+                print("📱 Interface will be available in your browser")
+                print("🌐 Default URL: http://localhost:7860")
+
+                if os.getenv("RUNPOD_POD_ID"):
+                    print("☁️ RunPod deployment detected")
+                elif os.getenv("COLAB_GPU"):
+                    print("☁️ Google Colab detected - sharing link will be generated")
+
+                print("\n" + "=" * 50)
+                print("🚀 LAUNCHING CHATTERBOX TTS!")
+                print("=" * 50)
+
+                # Launch using virtual environment python
+                subprocess.run([self.venv_python, "gradio_main_interface.py"])
+
+            except KeyboardInterrupt:
+                print("\n\n👋 Shutdown requested by user")
+                print("   Thanks for using ChatterboxTTS!")
+                sys.exit(0)
+            except Exception as e:
+                print(f"\n❌ Error launching with virtual environment: {str(e)}")
+                print("   Falling back to direct import...")
+                self._launch_direct()
+        else:
+            self._launch_direct()
+
+    def _launch_direct(self):
+        """Launch interface by direct import"""
+        try:
+            # Import and launch
+            from gradio_main_interface import launch_interface
+
+            print("🌐 Starting web server...")
+            print("📱 Interface will be available in your browser")
+            print("🌐 Default URL: http://localhost:7860")
+
+            if os.getenv("RUNPOD_POD_ID"):
+                print("☁️ RunPod deployment detected")
+            elif os.getenv("COLAB_GPU"):
+                print("☁️ Google Colab detected - sharing link will be generated")
+
+            print("\n" + "=" * 50)
+            print("🚀 LAUNCHING CHATTERBOX TTS!")
+            print("=" * 50)
+
+            # Small delay for user to read messages
+            time.sleep(2)
+
+            # Launch the interface
+            launch_interface()
+
+        except KeyboardInterrupt:
+            print("\n\n👋 Shutdown requested by user")
+            print("   Thanks for using ChatterboxTTS!")
+            sys.exit(0)
+        except Exception as e:
+            print(f"\n❌ Error launching interface: {str(e)}")
+            print("\nTroubleshooting tips:")
+            print("1. Check that all dependencies are installed")
+            print("2. Verify you're in the correct directory")
+            if hasattr(self, 'venv_python'):
+                print(f"3. Try running: {self.venv_python} gradio_main_interface.py")
+            else:
+                print("3. Try running: python3 gradio_main_interface.py")
+            sys.exit(1)
+
+    def run(self):
+        """Run the complete launcher process"""
+        self.print_header()
+
+        # Step 1: Check Python version
+        self.check_python_version()
+
+        # Step 2: Check working directory
+        if not self.check_working_directory():
+            sys.exit(1)
+
+        # Step 3: Create required directories
+        self.create_directories()
+
+        # Step 4: Check and install requirements
+        if not self.check_and_install_requirements():
+            print("\n❌ Failed to install required packages")
+            sys.exit(1)
+
+        # Step 5: Check GPU availability
+        self.check_gpu_availability()
+
+        # Step 6: Verify installation
+        if not self.verify_installation():
+            print("\n⚠️ Installation verification failed")
+            print("   Proceeding anyway - some features may not work")
+
+        # Step 7: Launch interface
+        self.launch_interface()
+
+def main():
+    """Main entry point"""
+    launcher = GradioLauncher()
+    launcher.run()
+
+if __name__ == "__main__":
+    # Add current directory to Python path for HF Spaces
+    import sys
+    import os
+    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+    # Fix OpenMP environment variable for HuggingFace Spaces
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    # Skip launcher logic for HF Spaces, run interface directly
+    try:
+        # Import the actual Gradio interface
+        import gradio_main_interface
+
+        # Create and launch the interface
+        demo = gradio_main_interface.create_main_interface()
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=False,
+            show_error=True
+        )
+    except ImportError as e:
+        print(f"❌ Failed to import gradio_main_interface: {e}")
+        # Fallback to launcher if needed
+        launcher = GradioLauncher()
+        launcher.launch_interface()
config/config.py
ADDED
@@ -0,0 +1,252 @@
+"""
+GenTTS Configuration Module
+Central location for all settings, paths, and feature toggles
+"""
+
+import os
+from pathlib import Path
+
+# ============================================================================
+# CORE DIRECTORIES
+# ============================================================================
+TEXT_INPUT_ROOT = Path("Text_Input")
+AUDIOBOOK_ROOT = Path("Audiobook")
+VOICE_SAMPLES_DIR = Path("Voice_Samples")
+
+# ============================================================================
+# TEXT PROCESSING SETTINGS
+# ============================================================================
+MAX_CHUNK_WORDS = 28
+MIN_CHUNK_WORDS = 4
+
+# ============================================================================
+# WORKER AND PERFORMANCE SETTINGS
+# ============================================================================
+MAX_WORKERS = 2
+TEST_MAX_WORKERS = 6  # For experimentation
+USE_DYNAMIC_WORKERS = False  # Toggle for testing
+VRAM_SAFETY_THRESHOLD = 6.5  # GB
+
+# ============================================================================
+# AUDIO QUALITY SETTINGS
+# ============================================================================
+ENABLE_MID_DROP_CHECK = False
+ENABLE_ASR = False  # Disabled by default due to tensor dimension errors
+ASR_WORKERS = 4  # Parallel ASR on CPU threads
+DEFAULT_ASR_MODEL = "base"  # Default Whisper model for ASR validation
+
+# ASR Model Memory Requirements (approximate)
+ASR_MODEL_VRAM_MB = {
+    "tiny": 39,
+    "base": 74,
+    "small": 244,
+    "medium": 769,
+    "large": 1550,
+    "large-v2": 1550,
+    "large-v3": 1550
+}
+
+ASR_MODEL_RAM_MB = {
+    "tiny": 150,
+    "base": 300,
+    "small": 800,
+    "medium": 2000,
+    "large": 4000,
+    "large-v2": 4000,
+    "large-v3": 4000
+}
+
+# ============================================================================
+# TTS HUM DETECTION SETTINGS
+# ============================================================================
+ENABLE_HUM_DETECTION = False
+HUM_FREQ_MIN = 50  # Hz - Lower frequency bound for hum detection
+HUM_FREQ_MAX = 200  # Hz - Upper frequency bound for hum detection
+HUM_ENERGY_THRESHOLD = 0.3  # Ratio of hum energy to total energy (0.1-0.5 range)
+HUM_STEADY_THRESHOLD = 0.6  # Ratio of segments with steady amplitude (0.5-0.8 range)
+HUM_AMPLITUDE_MIN = 0.005  # Minimum RMS for steady hum detection
+HUM_AMPLITUDE_MAX = 0.1  # Maximum RMS for steady hum detection
+
+# ============================================================================
+# AUDIO TRIMMING SETTINGS
+# ============================================================================
+ENABLE_AUDIO_TRIMMING = True
+SPEECH_ENDPOINT_THRESHOLD = 0.006
+TRIMMING_BUFFER_MS = 50
+
+# ============================================================================
+# SILENCE DURATION SETTINGS (milliseconds)
+# ============================================================================
+SILENCE_CHAPTER_START = 1195
+SILENCE_CHAPTER_END = 1100
+SILENCE_SECTION_BREAK = 700
+SILENCE_PARAGRAPH_END = 1000
+
+# Punctuation-specific silence settings (milliseconds)
+SILENCE_COMMA = 150
+SILENCE_SEMICOLON = 150  # Medium pause after semicolons
+SILENCE_COLON = 150  # Pause after colons
+SILENCE_PERIOD = 500
+SILENCE_QUESTION_MARK = 500
+SILENCE_EXCLAMATION = 200
+SILENCE_DASH = 200  # Em dash pause
+SILENCE_ELLIPSIS = 80  # Ellipsis pause (suspense)
+SILENCE_QUOTE_END = 150  # End of quoted speech
+
+# Chunk-level silence settings
+ENABLE_CHUNK_END_SILENCE = False
+CHUNK_END_SILENCE_MS = 200
+
+# Content boundary silence settings (milliseconds)
+SILENCE_PARAGRAPH_FALLBACK = 500  # Original paragraph logic fallback
+
+# ============================================================================
+# AUDIO NORMALIZATION SETTINGS
+# ============================================================================
+ENABLE_NORMALIZATION = True
+NORMALIZATION_TYPE = "peak"
+TARGET_LUFS = -16
+TARGET_PEAK_DB = -1.5
+TARGET_LRA = 11  # Target loudness range for consistency
+
+# ============================================================================
+# AUDIO PLAYBACK SPEED SETTINGS
+# ============================================================================
+ATEMPO_SPEED = 1.0
+
+# ============================================================================
+# ENVIRONMENT SETUP
+# ============================================================================
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
+os.environ["TRANSFORMERS_NO_PROGRESS_BAR"] = "1"
+os.environ["HF_TRANSFORMERS_NO_TQDM"] = "1"
+# Cache handling is now done by launcher scripts:
+# - launch_gradio_local.sh: Sets shared cache for development
+# - launch_gradio.sh: Uses PyTorch defaults for containers/deployment
+
+# ============================================================================
+# COLOR CODES FOR TERMINAL OUTPUT
+# ============================================================================
+RESET = "\033[0m"
+BOLD = "\033[1m"
+RED = "\033[91m"
+GREEN = "\033[92m"
+YELLOW = "\033[93m"
+CYAN = "\033[96m"
+
+# ============================================================================
+# TTS MODEL PARAMETERS (DEFAULTS)
+# ============================================================================
+DEFAULT_EXAGGERATION = 0.5
+DEFAULT_CFG_WEIGHT = 0.5
+DEFAULT_TEMPERATURE = 0.85
+
+# Advanced Sampling Parameters (Min_P Sampler Support)
+DEFAULT_MIN_P = 0.05  # Min probability threshold (0.0 disables)
+DEFAULT_TOP_P = 1.0  # Top-p sampling (1.0 disables)
+DEFAULT_REPETITION_PENALTY = 1.2  # Repetition penalty (1.0 = no penalty)
+
+# ============================================================================
+# VADER SENTIMENT TO TTS PARAMETER MAPPING
+# ============================================================================
+# These settings control how VADER sentiment analysis dynamically adjusts TTS parameters.
+# The formula used is: new_param = base_param + (compound_score * sensitivity)
+# The result is then clamped within the defined MIN/MAX range.
+
+# --- Base TTS Parameters (used as the starting point) ---
+# These are the same as the main defaults, but listed here for clarity.
+BASE_EXAGGERATION = DEFAULT_EXAGGERATION  # Default: 0.5
+BASE_CFG_WEIGHT = DEFAULT_CFG_WEIGHT  # Default: 0.5
+BASE_TEMPERATURE = DEFAULT_TEMPERATURE  # Default: 0.85
+
+# --- Sensitivity ---
+# How much VADER's compound score affects each parameter.
+# Higher values mean more dramatic changes based on sentiment.
+VADER_EXAGGERATION_SENSITIVITY = 0.33
+VADER_CFG_WEIGHT_SENSITIVITY = 0.32
+VADER_TEMPERATURE_SENSITIVITY = 0.3
+VADER_MIN_P_SENSITIVITY = 0.01  # Reduced from 0.02 to prevent sampling issues
+VADER_REPETITION_PENALTY_SENSITIVITY = 0.05  # Reduced from 0.1 to be more conservative
+
+# --- Min/Max Clamps ---
+# Hard limits to prevent extreme, undesirable audio artifacts.
+TTS_PARAM_MIN_EXAGGERATION = 0.1
+TTS_PARAM_MAX_EXAGGERATION = 0.65
+TTS_PARAM_MIN_CFG_WEIGHT = 0.15
+TTS_PARAM_MAX_CFG_WEIGHT = 0.8
+
+TTS_PARAM_MIN_TEMPERATURE = 0.1
+TTS_PARAM_MAX_TEMPERATURE = 2.35
+
+TTS_PARAM_MIN_MIN_P = 0.02  # Increased from 0.0 to prevent sampling issues
+TTS_PARAM_MAX_MIN_P = 0.3  # Reduced from 0.5 to prevent over-restriction
+TTS_PARAM_MIN_TOP_P = 0.5  # Too low causes repetition
+TTS_PARAM_MAX_TOP_P = 1.0  # 1.0 disables top_p
+TTS_PARAM_MIN_REPETITION_PENALTY = 1.0  # 1.0 = no penalty
+TTS_PARAM_MAX_REPETITION_PENALTY = 2.0  # Higher values are too restrictive
+
+# ============================================================================
+# BATCH PROCESSING SETTINGS
+# ============================================================================
+BATCH_SIZE = 400
+TTS_BATCH_SIZE = 16  # Batch size for TTS inference when VADER is disabled
+CLEANUP_INTERVAL = 500  # Deep cleanup every N chunks (reduced frequency for speed)
+
+# ============================================================================
+# QUALITY ENHANCEMENT SETTINGS (Phase 1)
+# ============================================================================
+
+# --- Regeneration Loop Settings ---
+ENABLE_REGENERATION_LOOP = True  # Enable automatic chunk regeneration on quality failure
+MAX_REGENERATION_ATTEMPTS = 3  # Maximum retry attempts per chunk
+QUALITY_THRESHOLD = 0.30  # TEMPORARILY LOWERED - Composite quality score threshold (0.0-1.0)
+
+# --- Sentiment Smoothing Settings ---
+ENABLE_SENTIMENT_SMOOTHING = True  # Re-enabled - GUI controls now working properly
+SENTIMENT_SMOOTHING_WINDOW = 3  # Number of previous chunks to consider
+SENTIMENT_SMOOTHING_METHOD = "rolling"  # "rolling" or "exp_decay"
+
+# Exponential decay weights for smoothing (used if method is "exp_decay")
+SENTIMENT_EXP_DECAY_WEIGHTS = [0.5, 0.3, 0.2]  # Most recent to oldest
+
+# --- Enhanced Anomaly Detection ---
+SPECTRAL_ANOMALY_THRESHOLD = 0.6  # Spectral anomaly score threshold (0.0-1.0)
+ENABLE_MFCC_VALIDATION = True  # Enable MFCC-based spectral analysis
+SPECTRAL_VARIANCE_LIMIT = 100.0  # Maximum spectral variance before flagging as artifact
+
+# --- Output Validation Settings ---
+ENABLE_OUTPUT_VALIDATION = True  # Enable quality control clearinghouse (runs individual checks when enabled)
+OUTPUT_VALIDATION_THRESHOLD = 0.6  # Minimum F1 score for output validation (reduced for punctuation tolerance)
+
+# --- Parameter Adjustment for Regeneration ---
+REGEN_TEMPERATURE_ADJUSTMENT = 0.1  # How much to adjust temperature per retry (increased for visibility)
+REGEN_EXAGGERATION_ADJUSTMENT = 0.15  # How much to adjust exaggeration per retry (increased for visibility)
+REGEN_CFG_ADJUSTMENT = 0.1  # How much to adjust cfg_weight per retry (increased for visibility)
+
+# ============================================================================
+# PERFORMANCE OPTIMIZATION SETTINGS
+# ============================================================================
+# Voice Embedding Caching - Cache voice embeddings to avoid recomputation
+ENABLE_VOICE_EMBEDDING_CACHE = True  # Enable voice embedding caching
+VOICE_CACHE_MEMORY_LIMIT_MB = 500  # Maximum memory for voice cache (MB)
+ENABLE_ADAPTIVE_VOICE_CACHE = True  # Adapt cache based on system memory
+
+# GPU Persistence Mode - Keep GPU in compute-ready state
+ENABLE_GPU_PERSISTENCE_MODE = False  # Try to enable GPU persistence mode
+GPU_PERSISTENCE_RETRY_COUNT = 3  # Retry attempts for persistence mode
+
+# CUDA Memory Pool - Advanced GPU memory management
+ENABLE_CUDA_MEMORY_POOL = False  # Enable CUDA memory pooling
+CUDA_MEMORY_POOL_FRACTION = 0.9  # Fraction of GPU memory to pool
+ENABLE_ADAPTIVE_MEMORY_POOL = True  # Adapt pool size to system
+
+# Producer-Consumer Pipeline - Eliminate chunk loading overhead
+ENABLE_PRODUCER_CONSUMER_PIPELINE = False  # Re-enabled with proper ETA tracking
+PIPELINE_QUEUE_SIZE_MULTIPLIER = 3  # Queue size = workers * multiplier
+PIPELINE_MAX_QUEUE_SIZE = 20  # Maximum queue size limit
+ENABLE_PIPELINE_FALLBACK = True  # Fall back to sequential if pipeline fails
+
+# ============================================================================
+# FEATURE TOGGLES
+# ============================================================================
+shutdown_requested = False  # Global shutdown flag
config/config.py.20250811-120000.bak
ADDED
@@ -0,0 +1,251 @@
+"""
+GenTTS Configuration Module
+Central location for all settings, paths, and feature toggles
+"""
+
+import os
+from pathlib import Path
+
+# ============================================================================
+# CORE DIRECTORIES
+# ============================================================================
+TEXT_INPUT_ROOT = Path("Text_Input")
+AUDIOBOOK_ROOT = Path("Audiobook")
+VOICE_SAMPLES_DIR = Path("Voice_Samples")
+
+# ============================================================================
+# TEXT PROCESSING SETTINGS
+# ============================================================================
+MAX_CHUNK_WORDS = 32
+MIN_CHUNK_WORDS = 4
+
+# ============================================================================
+# WORKER AND PERFORMANCE SETTINGS
+# ============================================================================
+MAX_WORKERS = 2
+TEST_MAX_WORKERS = 6  # For experimentation
+USE_DYNAMIC_WORKERS = False  # Toggle for testing
+VRAM_SAFETY_THRESHOLD = 6.5  # GB
+
+# ============================================================================
+# AUDIO QUALITY SETTINGS
+# ============================================================================
+ENABLE_MID_DROP_CHECK = False
+ENABLE_ASR = False  # Disabled by default due to tensor dimension errors
+ASR_WORKERS = 4  # Parallel ASR on CPU threads
+DEFAULT_ASR_MODEL = "base"  # Default Whisper model for ASR validation
+
+# ASR Model Memory Requirements (approximate)
+ASR_MODEL_VRAM_MB = {
+    "tiny": 39,
+    "base": 74,
+    "small": 244,
+    "medium": 769,
+    "large": 1550,
+    "large-v2": 1550,
+    "large-v3": 1550
+}
+
+ASR_MODEL_RAM_MB = {
+    "tiny": 150,
+    "base": 300,
+    "small": 800,
+    "medium": 2000,
+    "large": 4000,
+    "large-v2": 4000,
+    "large-v3": 4000
+}
+
+# ============================================================================
+# TTS HUM DETECTION SETTINGS
+# ============================================================================
+ENABLE_HUM_DETECTION = False
+HUM_FREQ_MIN = 50  # Hz - Lower frequency bound for hum detection
+HUM_FREQ_MAX = 200  # Hz - Upper frequency bound for hum detection
+HUM_ENERGY_THRESHOLD = 0.3  # Ratio of hum energy to total energy (0.1-0.5 range)
+HUM_STEADY_THRESHOLD = 0.6  # Ratio of segments with steady amplitude (0.5-0.8 range)
+HUM_AMPLITUDE_MIN = 0.005  # Minimum RMS for steady hum detection
+HUM_AMPLITUDE_MAX = 0.1  # Maximum RMS for steady hum detection
+
+# ============================================================================
+# AUDIO TRIMMING SETTINGS
+# ============================================================================
+ENABLE_AUDIO_TRIMMING = True
+SPEECH_ENDPOINT_THRESHOLD = 0.006
+TRIMMING_BUFFER_MS = 50
+
+# ============================================================================
+# SILENCE DURATION SETTINGS (milliseconds)
+# ============================================================================
+SILENCE_CHAPTER_START = 1195
+SILENCE_CHAPTER_END = 1100
+SILENCE_SECTION_BREAK = 700
+SILENCE_PARAGRAPH_END = 1000
+
+# Punctuation-specific silence settings (milliseconds)
+SILENCE_COMMA = 150
+SILENCE_SEMICOLON = 150  # Medium pause after semicolons
+SILENCE_COLON = 150  # Pause after colons
+SILENCE_PERIOD = 500
+SILENCE_QUESTION_MARK = 500
+SILENCE_EXCLAMATION = 200
+SILENCE_DASH = 200  # Em dash pause
+SILENCE_ELLIPSIS = 80  # Ellipsis pause (suspense)
+SILENCE_QUOTE_END = 150  # End of quoted speech
+
+# Chunk-level silence settings
+ENABLE_CHUNK_END_SILENCE = False
+CHUNK_END_SILENCE_MS = 200
+
+# Content boundary silence settings (milliseconds)
+SILENCE_PARAGRAPH_FALLBACK = 500  # Original paragraph logic fallback
+
+# ============================================================================
+# AUDIO NORMALIZATION SETTINGS
+# ============================================================================
+ENABLE_NORMALIZATION = True
+NORMALIZATION_TYPE = "peak"
+TARGET_LUFS = -16
+TARGET_PEAK_DB = -1.5
+TARGET_LRA = 11  # Target loudness range for consistency
+
+# ============================================================================
+# AUDIO PLAYBACK SPEED SETTINGS
+# ============================================================================
+ATEMPO_SPEED = 1.0
+
+# ============================================================================
+# ENVIRONMENT SETUP
+# ============================================================================
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
+os.environ["TRANSFORMERS_NO_PROGRESS_BAR"] = "1"
+os.environ["HF_TRANSFORMERS_NO_TQDM"] = "1"
+# Cache handling is now done by launcher scripts:
+# - launch_gradio_local.sh: Sets shared cache for development
+# - launch_gradio.sh: Uses PyTorch defaults for containers/deployment
+
+# ============================================================================
+# COLOR CODES FOR TERMINAL OUTPUT
+# ============================================================================
+RESET = "\033[0m"
+BOLD = "\033[1m"
+RED = "\033[91m"
+GREEN = "\033[92m"
+YELLOW = "\033[93m"
+CYAN = "\033[96m"
+
+# ============================================================================
+# TTS MODEL PARAMETERS (DEFAULTS)
+# ============================================================================
+DEFAULT_EXAGGERATION = 0.5
+DEFAULT_CFG_WEIGHT = 0.5
+DEFAULT_TEMPERATURE = 0.85
+
+# Advanced Sampling Parameters (Min_P Sampler Support)
+DEFAULT_MIN_P = 0.05  # Min probability threshold (0.0 disables)
+DEFAULT_TOP_P = 1.0  # Top-p sampling (1.0 disables)
+DEFAULT_REPETITION_PENALTY = 1.2  # Repetition penalty (1.0 = no penalty)
+
+# ============================================================================
+# VADER SENTIMENT TO TTS PARAMETER MAPPING
+# ============================================================================
+# These settings control how VADER sentiment analysis dynamically adjusts TTS parameters.
+# The formula used is: new_param = base_param + (compound_score * sensitivity)
+# The result is then clamped within the defined MIN/MAX range.
+
+# --- Base TTS Parameters (used as the starting point) ---
+# These are the same as the main defaults, but listed here for clarity.
+BASE_EXAGGERATION = DEFAULT_EXAGGERATION  # Default: 1.0
+BASE_CFG_WEIGHT = DEFAULT_CFG_WEIGHT  # Default: 0.7
+BASE_TEMPERATURE = DEFAULT_TEMPERATURE  # Default: 0.7
+
+# --- Sensitivity ---
+# How much VADER's compound score affects each parameter.
+# Higher values mean more dramatic changes based on sentiment.
+VADER_EXAGGERATION_SENSITIVITY = 0.33
+VADER_CFG_WEIGHT_SENSITIVITY = 0.32
+VADER_TEMPERATURE_SENSITIVITY = 0.3
+VADER_MIN_P_SENSITIVITY = 0.01  # Reduced from 0.02 to prevent sampling issues
+VADER_REPETITION_PENALTY_SENSITIVITY = 0.05  # Reduced from 0.1 to be more conservative
+
+# --- Min/Max Clamps ---
+# Hard limits to prevent extreme, undesirable audio artifacts.
+TTS_PARAM_MIN_EXAGGERATION = 0.1
+TTS_PARAM_MAX_EXAGGERATION = 0.65
+TTS_PARAM_MIN_CFG_WEIGHT = 0.15
+TTS_PARAM_MAX_CFG_WEIGHT = 0.8
+
+TTS_PARAM_MIN_TEMPERATURE = 0.1
+TTS_PARAM_MAX_TEMPERATURE = 2.3499999999999988
+
+TTS_PARAM_MIN_MIN_P = 0.02  # Increased from 0.0 to prevent sampling issues
+TTS_PARAM_MAX_MIN_P = 0.3  # Reduced from MAX 0.5 to prevent over-restriction
+TTS_PARAM_MIN_TOP_P = 0.5  # Too low causes repetition
+TTS_PARAM_MAX_TOP_P = 1.0  # MAX 1.0 disables top_p
+TTS_PARAM_MIN_REPETITION_PENALTY = 1.0  # 1.0 = no penalty
+TTS_PARAM_MAX_REPETITION_PENALTY = 2.0  # Higher values too restrictive MAX 2
+
+# ============================================================================
+# BATCH PROCESSING SETTINGS
+# ============================================================================
+BATCH_SIZE = 400
+CLEANUP_INTERVAL = 500  # Deep cleanup every N chunks (reduced frequency for speed)
+
+# ============================================================================
+# QUALITY ENHANCEMENT SETTINGS (Phase 1)
+# ============================================================================
+
+# --- Regeneration Loop Settings ---
+ENABLE_REGENERATION_LOOP = True  # Enable automatic chunk regeneration on quality failure
+MAX_REGENERATION_ATTEMPTS = 3  # Maximum retry attempts per chunk
+QUALITY_THRESHOLD = 0.30  # TEMPORARILY LOWERED - Composite quality score threshold (0.0-1.0)
+
+# --- Sentiment Smoothing Settings ---
+ENABLE_SENTIMENT_SMOOTHING = True  # Re-enabled - GUI controls now working properly
+SENTIMENT_SMOOTHING_WINDOW = 3  # Number of previous chunks to consider
+SENTIMENT_SMOOTHING_METHOD = "rolling"  # "rolling" or "exp_decay"
+
+# Exponential decay weights for smoothing (used if method is "exp_decay")
+SENTIMENT_EXP_DECAY_WEIGHTS = [0.5, 0.3, 0.2]  # Most recent to oldest
+
+# --- Enhanced Anomaly Detection ---
+SPECTRAL_ANOMALY_THRESHOLD = 0.6  # Spectral anomaly score threshold (0.0-1.0)
+ENABLE_MFCC_VALIDATION = True  # Enable MFCC-based spectral analysis
+SPECTRAL_VARIANCE_LIMIT = 100.0  # Maximum spectral variance before flagging as artifact
+
+# --- Output Validation Settings ---
+ENABLE_OUTPUT_VALIDATION = True  # Enable quality control clearinghouse (runs individual checks when enabled)
+OUTPUT_VALIDATION_THRESHOLD = 0.6  # Minimum F1 score for output validation (reduced for punctuation tolerance)
+
+# --- Parameter Adjustment for Regeneration ---
+REGEN_TEMPERATURE_ADJUSTMENT = 0.1  # How much to adjust temperature per retry (increased for visibility)
+REGEN_EXAGGERATION_ADJUSTMENT = 0.15  # How much to adjust exaggeration per retry (increased for visibility)
+REGEN_CFG_ADJUSTMENT = 0.1  # How much to adjust cfg_weight per retry (increased for visibility)
+
+# ============================================================================
+# PERFORMANCE OPTIMIZATION SETTINGS
+# ============================================================================
+# Voice Embedding Caching - Cache voice embeddings to avoid recomputation
+ENABLE_VOICE_EMBEDDING_CACHE = True  # Enable voice embedding caching
+VOICE_CACHE_MEMORY_LIMIT_MB = 500  # Maximum memory for voice cache (MB)
+ENABLE_ADAPTIVE_VOICE_CACHE = True  # Adapt cache based on system memory
|
| 232 |
+
|
| 233 |
+
# GPU Persistence Mode - Keep GPU in compute-ready state
|
| 234 |
+
ENABLE_GPU_PERSISTENCE_MODE = False # Try to enable GPU persistence mode
|
| 235 |
+
GPU_PERSISTENCE_RETRY_COUNT = 3 # Retry attempts for persistence mode
|
| 236 |
+
|
| 237 |
+
# CUDA Memory Pool - Advanced GPU memory management
|
| 238 |
+
ENABLE_CUDA_MEMORY_POOL = True # Enable CUDA memory pooling
|
| 239 |
+
CUDA_MEMORY_POOL_FRACTION = 0.9 # Fraction of GPU memory to pool
|
| 240 |
+
ENABLE_ADAPTIVE_MEMORY_POOL = True # Adapt pool size to system
|
| 241 |
+
|
| 242 |
+
# Producer-Consumer Pipeline - Eliminate chunk loading overhead
|
| 243 |
+
ENABLE_PRODUCER_CONSUMER_PIPELINE = True # Re-enabled with proper ETA tracking
|
| 244 |
+
PIPELINE_QUEUE_SIZE_MULTIPLIER = 3 # Queue size = workers * multiplier
|
| 245 |
+
PIPELINE_MAX_QUEUE_SIZE = 20 # Maximum queue size limit
|
| 246 |
+
ENABLE_PIPELINE_FALLBACK = True # Fall back to sequential if pipeline fails
|
| 247 |
+
|
| 248 |
+
# ============================================================================
|
| 249 |
+
# FEATURE TOGGLES
|
| 250 |
+
# ============================================================================
|
| 251 |
+
shutdown_requested = False # Global shutdown flag
|
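For reference, the mapping these settings describe can be read as one small function. The sketch below is illustrative only: the helper names are hypothetical, the production logic lives in the engine modules rather than in config.py, and it assumes `from config.config import *` plus the `vaderSentiment` package.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

_analyzer = SentimentIntensityAnalyzer()

def smoothed_compound(history):
    """Smooth the last few compound scores per the settings above."""
    recent = history[-SENTIMENT_SMOOTHING_WINDOW:]  # assumes non-empty history
    if SENTIMENT_SMOOTHING_METHOD == "rolling":
        return sum(recent) / len(recent)
    # "exp_decay": the newest score gets the largest weight (0.5)
    pairs = list(zip(reversed(recent), SENTIMENT_EXP_DECAY_WEIGHTS))
    return sum(s * w for s, w in pairs) / sum(w for _, w in pairs)

def vader_adjusted_params(chunk_text, history):
    """new_param = base_param + (compound_score * sensitivity), then clamp."""
    history.append(_analyzer.polarity_scores(chunk_text)["compound"])
    compound = smoothed_compound(history) if ENABLE_SENTIMENT_SMOOTHING else history[-1]

    def clamp(value, lo, hi):
        return max(lo, min(hi, value))

    return {
        "exaggeration": clamp(BASE_EXAGGERATION + compound * VADER_EXAGGERATION_SENSITIVITY,
                              TTS_PARAM_MIN_EXAGGERATION, TTS_PARAM_MAX_EXAGGERATION),
        "cfg_weight": clamp(BASE_CFG_WEIGHT + compound * VADER_CFG_WEIGHT_SENSITIVITY,
                            TTS_PARAM_MIN_CFG_WEIGHT, TTS_PARAM_MAX_CFG_WEIGHT),
        "temperature": clamp(BASE_TEMPERATURE + compound * VADER_TEMPERATURE_SENSITIVITY,
                             TTS_PARAM_MIN_TEMPERATURE, TTS_PARAM_MAX_TEMPERATURE),
    }

A strongly negative chunk (compound near -1.0) thus lands at roughly exaggeration 0.17, cfg_weight 0.18, and temperature 0.55, all safely inside the clamps.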
config/config.py~
ADDED
|
@@ -0,0 +1,251 @@
[251 lines omitted: config/config.py~ is a stray editor backup whose contents duplicate config/config.py above.]
gradio_main_interface.py
CHANGED

@@ -16,14 +16,15 @@ ARCHITECTURE:
 
 AVAILABLE TABS:
 1. Convert Book (Tab 1) - FUNCTIONAL: Main TTS conversion interface
-2.
+2. Configuration (Tab 2) - FUNCTIONAL: System configuration settings
 3. Voice Analysis (Tab 3) - PLACEHOLDER: Voice sample analysis tools
-4.
-5.
+4. Combine Audio (Tab 4) - FUNCTIONAL: Audio file combination tools
+5. Prepare Text (Tab 5) - FUNCTIONAL: Text preparation and chunking
 6. Settings (Tab 6) - FUNCTIONAL: Configuration management
-7. Chunk Tools (Tab 7) -
-8.
-9.
+7. Chunk Tools (Tab 7) - FUNCTIONAL: Chunk editing and repair
+8. JSON Generate (Tab 8) - FUNCTIONAL: Direct JSON-to-audiobook generation
+9. Diagnostics (Tab 9) - FUNCTIONAL: Parallel processing performance diagnostics
+10. System Monitor (Tab 10) - PLACEHOLDER: Performance monitoring
 
 DEPLOYMENT MODES:
 - LOCAL: python3 gradio_main_interface.py (development)

@@ -96,6 +97,13 @@ except ImportError as e:
     print(f"⚠️ Tab 8 (JSON Generate) not available: {e}")
     TAB8_AVAILABLE = False
 
+try:
+    from gradio_tabs.tab_diagnostics import create_diagnostics_tab
+    TAB_DIAGNOSTICS_AVAILABLE = True
+except ImportError as e:
+    print(f"⚠️ Diagnostics tab not available: {e}")
+    TAB_DIAGNOSTICS_AVAILABLE = False
+
 def create_placeholder_tab(tab_name, tab_number):
     """Create a placeholder tab for future implementation"""
     with gr.Column():

@@ -185,11 +193,19 @@
         with gr.Tab("8. JSON Generate"):
             create_placeholder_tab("JSON Generate", 8)
 
-
-
+        # Tab 9: Diagnostics (Working)
+        if TAB_DIAGNOSTICS_AVAILABLE:
+            with gr.Tab("9. Diagnostics"):
+                create_diagnostics_tab()
+        else:
+            with gr.Tab("9. Diagnostics"):
+                create_placeholder_tab("System Diagnostics", 9)
+
+        with gr.Tab("10. System Monitor"):
+            create_placeholder_tab("System Monitor", 10)
 
-        with gr.Tab("
-            create_placeholder_tab("About",
+        with gr.Tab("11. About"):
+            create_placeholder_tab("About", 11)
 
         # Footer
         gr.Markdown("""

@@ -211,6 +227,7 @@
     print(f"  Tab 6 (Settings): {'✅ Available' if TAB6_AVAILABLE else '❌ Not Available'}")
     print(f"  Tab 7 (Chunk Tools): {'✅ Available' if TAB7_AVAILABLE else '❌ Not Available'}")
     print(f"  Tab 8 (JSON Generate): {'✅ Available' if TAB8_AVAILABLE else '❌ Not Available'}")
+    print(f"  Tab 9 (Diagnostics): {'✅ Available' if TAB_DIAGNOSTICS_AVAILABLE else '❌ Not Available'}")
     print("  Other Tabs: 🚧 Placeholder (Coming Soon)")
     print("-" * 50)
gradio_tabs/gradio_imports.py
ADDED
|
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""
+Common import utilities for Gradio tabs - HuggingFace deployment compatibility
+"""
+
+import os
+import sys
+
+def safe_import(module_name, package=None):
+    """Safely import modules with HuggingFace deployment compatibility"""
+    try:
+        if package:
+            return __import__(f"{package}.{module_name}", fromlist=[module_name])
+        else:
+            return __import__(module_name, fromlist=[''])
+    except ImportError:
+        # Try adding parent directory to path for HuggingFace deployment
+        current_dir = os.path.dirname(__file__)
+        parent_dir = os.path.join(current_dir, '..')
+        if parent_dir not in sys.path:
+            sys.path.append(parent_dir)
+        try:
+            if package:
+                return __import__(f"{package}.{module_name}", fromlist=[module_name])
+            else:
+                return __import__(module_name, fromlist=[''])
+        except ImportError:
+            raise
+
+def safe_import_config():
+    """Safely import config module and return all config variables"""
+    try:
+        config_module = safe_import('config', 'config')
+        # Return dictionary of all config variables
+        return {name: getattr(config_module, name) for name in dir(config_module) if not name.startswith('_')}, True
+    except ImportError as e:
+        print(f"⚠️ Config not available: {e} - using defaults")
+        return {}, False
+
+def get_default_config():
+    """Return default configuration values for when config is not available"""
+    return {
+        'AUDIOBOOK_ROOT': 'Audiobook',
+        'TEXT_INPUT_ROOT': 'Text_Input',
+        'VOICE_SAMPLES_DIR': 'Voice_Samples',
+        'MAX_WORKERS': 2,
+        'BATCH_SIZE': 100,
+        'MIN_CHUNK_WORDS': 5,
+        'MAX_CHUNK_WORDS': 25,
+        'ENABLE_NORMALIZATION': True,
+        'TARGET_LUFS': -16,
+        'ENABLE_AUDIO_TRIMMING': True,
+        'SPEECH_ENDPOINT_THRESHOLD': 0.005,
+        'TRIMMING_BUFFER_MS': 100,
+        'TTS_PARAM_MIN_EXAGGERATION': 0.0,
+        'TTS_PARAM_MAX_EXAGGERATION': 2.0,
+        'TTS_PARAM_MIN_CFG_WEIGHT': 0.0,
+        'TTS_PARAM_MAX_CFG_WEIGHT': 1.0,
+        'TTS_PARAM_MIN_TEMPERATURE': 0.0,
+        'TTS_PARAM_MAX_TEMPERATURE': 5.0,
+        'DEFAULT_EXAGGERATION': 0.5,
+        'DEFAULT_CFG_WEIGHT': 0.5,
+        'DEFAULT_TEMPERATURE': 0.8,
+        'VADER_EXAGGERATION_SENSITIVITY': 0.3,
+        'VADER_CFG_WEIGHT_SENSITIVITY': 0.3,
+        'VADER_TEMPERATURE_SENSITIVITY': 0.3,
+        'SILENCE_CHAPTER_START': 1000,
+        'SILENCE_CHAPTER_END': 1500,
+        'SILENCE_SECTION_BREAK': 800,
+        'SILENCE_PARAGRAPH_END': 500,
+        'SILENCE_COMMA': 200,
+        'SILENCE_PERIOD': 400,
+        'SILENCE_QUESTION_MARK': 500,
+        'SILENCE_EXCLAMATION': 500,
+        'CHUNK_END_SILENCE_MS': 0,
+        'ENABLE_SENTIMENT_SMOOTHING': True,
+        'SENTIMENT_SMOOTHING_WINDOW': 3,
+        'SENTIMENT_SMOOTHING_METHOD': 'gaussian',
+        'BASE_EXAGGERATION': 0.5,
+        'BASE_CFG_WEIGHT': 0.5,
+        'BASE_TEMPERATURE': 0.8,
+        'DEFAULT_MIN_P': 0.1,
+        'DEFAULT_TOP_P': 0.9,
+        'DEFAULT_REPETITION_PENALTY': 1.0
+    }
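A quick usage sketch for these helpers (illustrative; not part of the committed files):

from gradio_tabs.gradio_imports import safe_import_config, get_default_config

config, loaded = safe_import_config()  # ({settings...}, True) on success
if not loaded:
    config = get_default_config()      # fall back to the bundled defaults

max_workers = config['MAX_WORKERS']    # 2 with the defaults above

The two-step pattern means every tab can consume one plain dict regardless of whether the real config package imports cleanly on HuggingFace.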
gradio_tabs/tab1_convert_book.py
CHANGED

@@ -71,8 +71,7 @@ conversion_state = {
     'vram_usage': '-- GB',
     'current_chunk': '--',
     'eta': '--',
-    'elapsed': '--'
-    'needs_refresh': False
+    'elapsed': '--'
 }
 
 def parse_progress_stats(output_line):

@@ -533,6 +532,14 @@
                        value=2.0,
                        info="Reduce repetition"
                    )
+
+                    # NEW: TTS Inference Batch Size
+                    tts_batch_size = gr.Slider(
+                        label="TTS Inference Batch Size (VADER Off)",
+                        minimum=1, maximum=64, step=1,
+                        value=16,  # Default value
+                        info="Number of chunks to process simultaneously when VADER is disabled for speed."
+                    )
 
             # Action Buttons and Status
             with gr.Row():

@@ -779,7 +786,8 @@
                regen_enabled_val, max_attempts_val, quality_thresh_val,
                sentiment_smooth_val, smooth_window_val, smooth_method_val,
                mfcc_val, output_val, spectral_thresh_val, output_thresh_val,
-                exag_val, cfg_val, temp_val, min_p_val, top_p_val, rep_penalty_val):
+                exag_val, cfg_val, temp_val, min_p_val, top_p_val, rep_penalty_val,
+                tts_batch_size_val):
            """Start the actual book conversion - file upload version"""
 
            # Validation

@@ -882,7 +890,8 @@
                'vader_enabled': vader_val,
                'asr_enabled': asr_val,
                'asr_config': asr_config,
-                'add_to_batch': add_to_batch_val
+                'add_to_batch': add_to_batch_val,
+                'tts_batch_size': tts_batch_size_val
            }
 
            # Set conversion state

@@ -905,8 +914,6 @@
            if result['success']:
                conversion_state['status'] = '✅ Conversion completed successfully!'
                conversion_state['progress'] = 100
-                # Trigger automatic refresh of audiobook dropdowns
-                conversion_state['needs_refresh'] = True
            else:
                conversion_state['status'] = f"❌ Conversion failed: {result.get('error', 'Unknown error')}"
                conversion_state['progress'] = 0

@@ -974,7 +981,8 @@
                regeneration_enabled, max_attempts, quality_threshold,
                sentiment_smoothing, smoothing_window, smoothing_method,
                mfcc_validation, output_validation, spectral_threshold, output_threshold,
-                exaggeration, cfg_weight, temperature, min_p, top_p, repetition_penalty
+                exaggeration, cfg_weight, temperature, min_p, top_p, repetition_penalty,
+                tts_batch_size
            ],
            outputs=[status_display, progress_display, audio_player, audiobook_selector, m4b_file_selector]
        )

@@ -1060,21 +1068,6 @@
            if audiobook_choices['choices']:
                latest_audiobook = load_selected_audiobook(audiobook_choices['choices'][0])
 
-            return (
-                conversion_state['status'],
-                conversion_state['progress'],
-                latest_audiobook,
-                audiobook_choices,
-                m4b_choices
-            )
-        elif conversion_state.get('needs_refresh', False):
-            # Auto-refresh requested
-            conversion_state['needs_refresh'] = False
-            audiobook_choices, m4b_choices = update_audiobook_dropdowns_after_conversion()
-            latest_audiobook = None
-            if audiobook_choices['choices']:
-                latest_audiobook = load_selected_audiobook(audiobook_choices['choices'][0])
-
            return (
                conversion_state['status'],
                conversion_state['progress'],
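The new tts_batch_size value travels to the engine inside config_params. As a rough sketch of what it governs: when VADER is off, the chunk list can be grouped into fixed-size batches and synthesized with one model call per group. The helper below is illustrative only, not the engine's actual API, and the 'text' field on a chunk is an assumption.

def iter_batches(chunks, batch_size):
    """Yield successive fixed-size groups of chunks for batched inference."""
    for start in range(0, len(chunks), batch_size):
        yield chunks[start:start + batch_size]

# for batch in iter_batches(all_chunks, config_params['tts_batch_size']):
#     texts = [chunk['text'] for chunk in batch]  # one batched TTS call per group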
gradio_tabs/tab1_convert_book.py.20250811-120000.bak
ADDED
|
@@ -0,0 +1,1173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Gradio Tab 1: Convert Book
|
| 4 |
+
Exact replica of PyQt5 GUI Tab 1 functionality
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
import threading
|
| 11 |
+
import subprocess
|
| 12 |
+
import tempfile
|
| 13 |
+
import json
|
| 14 |
+
import warnings
|
| 15 |
+
import re
|
| 16 |
+
import time
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 19 |
+
|
| 20 |
+
# Suppress CUDA deprecation warnings
|
| 21 |
+
warnings.filterwarnings("ignore", category=FutureWarning, message=".*torch.backends.cuda.sdp_kernel.*")
|
| 22 |
+
warnings.filterwarnings("ignore", category=FutureWarning, message=".*sdp_kernel.*")
|
| 23 |
+
|
| 24 |
+
# Import ChatterboxTTS modules and ensure all config variables are available
|
| 25 |
+
# First set defaults, then try to import from config
|
| 26 |
+
DEFAULT_EXAGGERATION = 0.4
|
| 27 |
+
DEFAULT_CFG_WEIGHT = 0.5
|
| 28 |
+
DEFAULT_TEMPERATURE = 0.9
|
| 29 |
+
TTS_PARAM_MIN_EXAGGERATION = 0.0
|
| 30 |
+
TTS_PARAM_MAX_EXAGGERATION = 2.0
|
| 31 |
+
TTS_PARAM_MIN_CFG_WEIGHT = 0.0
|
| 32 |
+
TTS_PARAM_MAX_CFG_WEIGHT = 1.0
|
| 33 |
+
TTS_PARAM_MIN_TEMPERATURE = 0.0
|
| 34 |
+
TTS_PARAM_MAX_TEMPERATURE = 5.0
|
| 35 |
+
ENABLE_REGENERATION_LOOP = True
|
| 36 |
+
MAX_REGENERATION_ATTEMPTS = 3
|
| 37 |
+
QUALITY_THRESHOLD = 0.7
|
| 38 |
+
ENABLE_SENTIMENT_SMOOTHING = True
|
| 39 |
+
SENTIMENT_SMOOTHING_WINDOW = 3
|
| 40 |
+
SENTIMENT_SMOOTHING_METHOD = "rolling"
|
| 41 |
+
ENABLE_MFCC_VALIDATION = False
|
| 42 |
+
ENABLE_OUTPUT_VALIDATION = False
|
| 43 |
+
SPECTRAL_ANOMALY_THRESHOLD = 0.8
|
| 44 |
+
OUTPUT_VALIDATION_THRESHOLD = 0.85
|
| 45 |
+
|
| 46 |
+
# Try to import config and override defaults if available
|
| 47 |
+
try:
|
| 48 |
+
from config.config import *
|
| 49 |
+
CONFIG_AVAILABLE = True
|
| 50 |
+
print("β
Config loaded successfully")
|
| 51 |
+
except ImportError:
|
| 52 |
+
print("β οΈ Config not available - using defaults")
|
| 53 |
+
CONFIG_AVAILABLE = False
|
| 54 |
+
|
| 55 |
+
# Import the actual conversion functions from GUI
|
| 56 |
+
try:
|
| 57 |
+
# We need to import the actual conversion logic
|
| 58 |
+
import importlib.util
|
| 59 |
+
gui_spec = importlib.util.spec_from_file_location("chatterbox_gui", "chatterbox_gui.py")
|
| 60 |
+
gui_module = importlib.util.module_from_spec(gui_spec)
|
| 61 |
+
# We'll access the GUI's conversion methods
|
| 62 |
+
GUI_AVAILABLE = True
|
| 63 |
+
except Exception as e:
|
| 64 |
+
print(f"β οΈ GUI module not available: {e}")
|
| 65 |
+
GUI_AVAILABLE = False
|
| 66 |
+
|
| 67 |
+
# Global state for conversion with enhanced stats
|
| 68 |
+
conversion_state = {
|
| 69 |
+
'running': False,
|
| 70 |
+
'progress': 0,
|
| 71 |
+
'status': 'Ready',
|
| 72 |
+
'thread': None,
|
| 73 |
+
'realtime_factor': '--',
|
| 74 |
+
'vram_usage': '-- GB',
|
| 75 |
+
'current_chunk': '--',
|
| 76 |
+
'eta': '--',
|
| 77 |
+
'elapsed': '--'
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
def parse_progress_stats(output_line):
|
| 81 |
+
"""Parse progress statistics from TTS engine output"""
|
| 82 |
+
# Look for progress pattern: "π Chunk 2/13 | β± Elapsed: 0:01:31 | ETA: 0:09:54 | Remaining: 0:08:23 | Realtime: 0.11x | VRAM: 3.3GB"
|
| 83 |
+
progress_pattern = r'π Chunk (\d+)/(\d+).*?Realtime: ([\d.]+)x.*?VRAM: ([\d.]+)GB'
|
| 84 |
+
match = re.search(progress_pattern, output_line)
|
| 85 |
+
|
| 86 |
+
if match:
|
| 87 |
+
current_chunk = int(match.group(1))
|
| 88 |
+
total_chunks = int(match.group(2))
|
| 89 |
+
realtime_factor = f"{match.group(3)}x"
|
| 90 |
+
vram_usage = f"{match.group(4)} GB"
|
| 91 |
+
|
| 92 |
+
# Update global state
|
| 93 |
+
conversion_state['current_chunk'] = f"{current_chunk}/{total_chunks}"
|
| 94 |
+
conversion_state['realtime_factor'] = realtime_factor
|
| 95 |
+
conversion_state['vram_usage'] = vram_usage
|
| 96 |
+
conversion_state['progress'] = int((current_chunk / total_chunks) * 100) if total_chunks > 0 else 0
|
| 97 |
+
|
| 98 |
+
print(f"π Stats Updated: Chunk {current_chunk}/{total_chunks}, {realtime_factor}, {vram_usage}")
|
| 99 |
+
return True
|
| 100 |
+
else:
|
| 101 |
+
# Try alternative patterns in case the format is different
|
| 102 |
+
alt_pattern = r'Chunk (\d+)/(\d+).*?Realtime: ([\d.]+)x.*?VRAM: ([\d.]+)GB'
|
| 103 |
+
alt_match = re.search(alt_pattern, output_line)
|
| 104 |
+
if alt_match:
|
| 105 |
+
current_chunk = int(alt_match.group(1))
|
| 106 |
+
total_chunks = int(alt_match.group(2))
|
| 107 |
+
realtime_factor = f"{alt_match.group(3)}x"
|
| 108 |
+
vram_usage = f"{alt_match.group(4)} GB"
|
| 109 |
+
|
| 110 |
+
conversion_state['current_chunk'] = f"{current_chunk}/{total_chunks}"
|
| 111 |
+
conversion_state['realtime_factor'] = realtime_factor
|
| 112 |
+
conversion_state['vram_usage'] = vram_usage
|
| 113 |
+
conversion_state['progress'] = int((current_chunk / total_chunks) * 100) if total_chunks > 0 else 0
|
| 114 |
+
|
| 115 |
+
print(f"π Stats Updated: Chunk {current_chunk}/{total_chunks}, {realtime_factor}, {vram_usage}")
|
| 116 |
+
return True
|
| 117 |
+
|
| 118 |
+
return False
|
| 119 |
+
|
| 120 |
+
def get_progress_stats():
|
| 121 |
+
"""Get current progress statistics for UI update"""
|
| 122 |
+
return (
|
| 123 |
+
conversion_state['realtime_factor'],
|
| 124 |
+
conversion_state['vram_usage'],
|
| 125 |
+
conversion_state['current_chunk'],
|
| 126 |
+
conversion_state['progress']
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
def get_book_folders():
|
| 130 |
+
"""Get available book folders from Text_Input directory"""
|
| 131 |
+
text_input_dir = Path("Text_Input")
|
| 132 |
+
if not text_input_dir.exists():
|
| 133 |
+
return []
|
| 134 |
+
|
| 135 |
+
folders = []
|
| 136 |
+
for item in text_input_dir.iterdir():
|
| 137 |
+
if item.is_dir():
|
| 138 |
+
folders.append(item.name) # Show only folder name, not full path
|
| 139 |
+
|
| 140 |
+
return sorted(folders)
|
| 141 |
+
|
| 142 |
+
def get_text_files_in_folder(folder_name):
|
| 143 |
+
"""Get text files in selected book folder"""
|
| 144 |
+
if not folder_name:
|
| 145 |
+
return []
|
| 146 |
+
|
| 147 |
+
# Build full path from folder name
|
| 148 |
+
folder = Path("Text_Input") / folder_name
|
| 149 |
+
if not folder.exists():
|
| 150 |
+
return []
|
| 151 |
+
|
| 152 |
+
text_files = []
|
| 153 |
+
for file in folder.glob("*.txt"):
|
| 154 |
+
text_files.append(file.name)
|
| 155 |
+
|
| 156 |
+
return sorted(text_files)
|
| 157 |
+
|
| 158 |
+
def get_voice_samples():
|
| 159 |
+
"""Get available voice samples from Voice_Samples directory"""
|
| 160 |
+
voice_dir = Path("Voice_Samples")
|
| 161 |
+
if not voice_dir.exists():
|
| 162 |
+
return []
|
| 163 |
+
|
| 164 |
+
voices = []
|
| 165 |
+
for file in voice_dir.glob("*.wav"):
|
| 166 |
+
voices.append(file.name) # Show only filename, not full path
|
| 167 |
+
|
| 168 |
+
return sorted(voices)
|
| 169 |
+
|
| 170 |
+
def find_generated_audiobook(book_folder_path, voice_sample_path):
|
| 171 |
+
"""Find the generated audiobook files"""
|
| 172 |
+
try:
|
| 173 |
+
book_folder = Path(book_folder_path)
|
| 174 |
+
voice_file = Path(voice_sample_path)
|
| 175 |
+
voice_name = voice_file.stem
|
| 176 |
+
|
| 177 |
+
# Look in Output/ directory first (final audiobooks)
|
| 178 |
+
output_dir = Path("Output")
|
| 179 |
+
if output_dir.exists():
|
| 180 |
+
# Look for M4B files with voice name
|
| 181 |
+
for m4b_file in output_dir.glob(f"*[{voice_name}]*.m4b"):
|
| 182 |
+
if m4b_file.exists():
|
| 183 |
+
return str(m4b_file), "M4B audiobook"
|
| 184 |
+
|
| 185 |
+
# Look for WAV files with voice name
|
| 186 |
+
for wav_file in output_dir.glob(f"*[{voice_name}]*.wav"):
|
| 187 |
+
if wav_file.exists():
|
| 188 |
+
return str(wav_file), "WAV audiobook"
|
| 189 |
+
|
| 190 |
+
# Look in Audiobook/ directory (processing output)
|
| 191 |
+
audiobook_dir = Path("Audiobook") / book_folder.name
|
| 192 |
+
if audiobook_dir.exists():
|
| 193 |
+
# Look for M4B files
|
| 194 |
+
for m4b_file in audiobook_dir.glob(f"*[{voice_name}]*.m4b"):
|
| 195 |
+
if m4b_file.exists():
|
| 196 |
+
return str(m4b_file), "M4B audiobook"
|
| 197 |
+
|
| 198 |
+
# Look for WAV files
|
| 199 |
+
for wav_file in audiobook_dir.glob(f"*[{voice_name}]*.wav"):
|
| 200 |
+
if wav_file.exists():
|
| 201 |
+
return str(wav_file), "WAV audiobook"
|
| 202 |
+
|
| 203 |
+
# Look for combined files
|
| 204 |
+
for combined_file in audiobook_dir.glob("*_combined.*"):
|
| 205 |
+
if combined_file.suffix in ['.wav', '.m4b', '.mp3']:
|
| 206 |
+
return str(combined_file), f"{combined_file.suffix.upper()[1:]} combined audiobook"
|
| 207 |
+
|
| 208 |
+
return None, "No audiobook found"
|
| 209 |
+
|
| 210 |
+
except Exception as e:
|
| 211 |
+
print(f"Error finding audiobook: {e}")
|
| 212 |
+
return None, f"Error: {str(e)}"
|
| 213 |
+
|
| 214 |
+
def run_book_conversion(book_path, text_file_path, voice_path, tts_params, quality_params, config_params):
|
| 215 |
+
"""Run the actual book conversion - Direct call to TTS engine with progress monitoring"""
|
| 216 |
+
try:
|
| 217 |
+
# Import the real TTS engine function directly (avoid interface.py)
|
| 218 |
+
from modules.tts_engine import process_book_folder
|
| 219 |
+
|
| 220 |
+
# Extract enable_asr from tts_params (matching GUI exactly)
|
| 221 |
+
enable_asr = tts_params.get('enable_asr', False)
|
| 222 |
+
|
| 223 |
+
print(f"π Starting book conversion with GUI parameters")
|
| 224 |
+
print(f"π Book: {book_path}")
|
| 225 |
+
print(f"π Text file: {text_file_path}")
|
| 226 |
+
print(f"π€ Voice: {voice_path}")
|
| 227 |
+
print(f"ποΈ TTS Params: {tts_params}")
|
| 228 |
+
print(f"π¬ Quality Params: {quality_params}")
|
| 229 |
+
print(f"βοΈ Config Params: {config_params}")
|
| 230 |
+
|
| 231 |
+
# Set up progress callback function
|
| 232 |
+
def progress_callback(current_chunk, total_chunks, realtime_factor, vram_usage):
|
| 233 |
+
"""Callback function to update progress from TTS engine"""
|
| 234 |
+
conversion_state['current_chunk'] = f"{current_chunk}/{total_chunks}"
|
| 235 |
+
conversion_state['realtime_factor'] = f"{realtime_factor}x"
|
| 236 |
+
conversion_state['vram_usage'] = f"{vram_usage} GB"
|
| 237 |
+
conversion_state['progress'] = int((current_chunk / total_chunks) * 100) if total_chunks > 0 else 0
|
| 238 |
+
print(f"π Progress: {current_chunk}/{total_chunks} ({conversion_state['progress']}%) - {realtime_factor}x - {vram_usage}GB")
|
| 239 |
+
|
| 240 |
+
# Add progress callback to config params
|
| 241 |
+
config_params['progress_callback'] = progress_callback
|
| 242 |
+
|
| 243 |
+
# Convert string paths to Path objects (required by TTS engine)
|
| 244 |
+
book_dir_path = Path(book_path)
|
| 245 |
+
voice_path_obj = Path(voice_path)
|
| 246 |
+
|
| 247 |
+
# Auto-detect device with fallback to CPU
|
| 248 |
+
import torch
|
| 249 |
+
if torch.cuda.is_available():
|
| 250 |
+
device = "cuda"
|
| 251 |
+
print("β
Using CUDA GPU for processing")
|
| 252 |
+
else:
|
| 253 |
+
device = "cpu"
|
| 254 |
+
print("π» Using CPU for processing (no GPU available)")
|
| 255 |
+
|
| 256 |
+
# Direct call to TTS engine (function only accepts: book_dir, voice_path, tts_params, device, skip_cleanup)
|
| 257 |
+
result = process_book_folder(
|
| 258 |
+
book_dir=book_dir_path,
|
| 259 |
+
voice_path=voice_path_obj,
|
| 260 |
+
tts_params=tts_params,
|
| 261 |
+
device=device,
|
| 262 |
+
skip_cleanup=False
|
| 263 |
+
)
|
| 264 |
+
|
| 265 |
+
print(f"β
Conversion completed successfully")
|
| 266 |
+
return {'success': True, 'result': result}
|
| 267 |
+
|
| 268 |
+
except Exception as e:
|
| 269 |
+
print(f"β Conversion failed: {e}")
|
| 270 |
+
import traceback
|
| 271 |
+
traceback.print_exc()
|
| 272 |
+
return {'success': False, 'error': str(e)}
|
| 273 |
+
|
| 274 |
+
def regenerate_m4b_file(selected_m4b, playback_speed):
|
| 275 |
+
"""Regenerate M4B file with new playback speed"""
|
| 276 |
+
if not selected_m4b:
|
| 277 |
+
return "β Please select an M4B file first", None
|
| 278 |
+
|
| 279 |
+
try:
|
| 280 |
+
print(f"π Regenerating M4B: {selected_m4b} at {playback_speed}x speed")
|
| 281 |
+
|
| 282 |
+
# Import M4B regeneration tools
|
| 283 |
+
from tools.combine_only import apply_playback_speed_to_m4b
|
| 284 |
+
|
| 285 |
+
# Find the M4B file path
|
| 286 |
+
audiobook_root = Path("Audiobook")
|
| 287 |
+
m4b_path = None
|
| 288 |
+
|
| 289 |
+
for book_dir in audiobook_root.iterdir():
|
| 290 |
+
if book_dir.is_dir():
|
| 291 |
+
for m4b_file in book_dir.glob("*.m4b"):
|
| 292 |
+
if m4b_file.name == selected_m4b:
|
| 293 |
+
m4b_path = m4b_file
|
| 294 |
+
break
|
| 295 |
+
if m4b_path:
|
| 296 |
+
break
|
| 297 |
+
|
| 298 |
+
if not m4b_path:
|
| 299 |
+
return "β M4B file not found", None
|
| 300 |
+
|
| 301 |
+
# Create new filename with speed suffix
|
| 302 |
+
speed_suffix = f"_speed{playback_speed}x".replace(".", "p")
|
| 303 |
+
new_name = m4b_path.stem + speed_suffix + ".m4b"
|
| 304 |
+
output_path = m4b_path.parent / new_name
|
| 305 |
+
|
| 306 |
+
# Apply speed change
|
| 307 |
+
success = apply_playback_speed_to_m4b(str(m4b_path), str(output_path), playback_speed)
|
| 308 |
+
|
| 309 |
+
if success:
|
| 310 |
+
return f"β
Regenerated M4B at {playback_speed}x speed: {new_name}", str(output_path)
|
| 311 |
+
else:
|
| 312 |
+
return "β Failed to regenerate M4B", None
|
| 313 |
+
|
| 314 |
+
except Exception as e:
|
| 315 |
+
print(f"β M4B regeneration failed: {e}")
|
| 316 |
+
return f"β Error: {str(e)}", None
|
| 317 |
+
|
| 318 |
+
def create_convert_book_tab():
    """Create Tab 1: Convert Book with all GUI functionality"""

    with gr.Column():
        gr.Markdown("# 📖 Convert Book")
        gr.Markdown("*Main TTS conversion functionality - matches GUI Tab 1*")

        # Main Content Layout
        with gr.Row():
            # Left Column - File Uploads
            with gr.Column(scale=2):
                gr.Markdown("### 📚 Book Selection")

                # Book text file upload only
                text_file_upload = gr.File(
                    label="📖 Upload Book Text File",
                    file_types=[".txt"],
                    file_count="single",
                    interactive=True
                )

                gr.Markdown("### 🎤 Voice Selection")

                # Single voice upload with integrated playback
                voice_file_upload = gr.File(
                    label="🎤 Upload Voice Sample",
                    file_types=[".wav", ".mp3", ".m4a"],
                    file_count="single",
                    interactive=True
                )

                # Voice sample player (becomes active after upload)
                voice_audio = gr.Audio(
                    label="Voice Sample Preview",
                    interactive=False,
                    show_download_button=False,
                    visible=False
                )

            # Right Column - All Settings
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Quick Settings")

                # VADER and ASR
                vader_enabled = gr.Checkbox(
                    label="Use VADER sentiment analysis",
                    value=True,
                    info="Adjust TTS params per chunk based on emotion"
                )

                # ASR System with intelligent model selection
                with gr.Row():
                    asr_enabled = gr.Checkbox(
                        label="🤖 Enable ASR validation",
                        value=False,
                        info="Smart quality control with automatic model selection"
                    )

                # ASR Configuration (initially hidden)
                with gr.Column(visible=False) as asr_config_group:
                    gr.Markdown("#### 🔍 ASR Configuration")

                    # System analysis display
                    system_analysis = gr.Textbox(
                        label="System Analysis",
                        value="Click 'Analyze System' to detect capabilities",
                        lines=3,
                        interactive=False
                    )

                    analyze_system_btn = gr.Button(
                        "🔍 Analyze System",
                        size="sm",
                        variant="secondary"
                    )

                    # ASR Level Selection
                    asr_level = gr.Radio(
                        label="ASR Quality Level",
                        choices=[
                            ("🟢 SAFE - Fast processing, basic accuracy", "safe"),
                            ("🟡 MODERATE - Balanced speed/accuracy (recommended)", "moderate"),
                            ("🔴 INSANE - Best accuracy, may stress system", "insane")
                        ],
                        value="moderate",
                        info="Automatically selects best models for your system"
                    )

                    # Selected models display
                    selected_models = gr.Textbox(
                        label="Selected ASR Models",
                        value="Select level to see model configuration",
                        lines=2,
                        interactive=False
                    )

                # Batch processing
                add_to_batch = gr.Checkbox(
                    label="📦 Add to batch queue",
                    value=False,
                    info="Queue for batch processing"
                )

                gr.Markdown("### 🔄 Regeneration Settings")

                regeneration_enabled = gr.Checkbox(
                    label="Enable automatic chunk regeneration",
                    value=ENABLE_REGENERATION_LOOP,
                    info="Retry failed chunks automatically"
                )

                max_attempts = gr.Slider(
                    label="Max Attempts",
                    minimum=1, maximum=10, step=1,
                    value=MAX_REGENERATION_ATTEMPTS
                )

                quality_threshold = gr.Slider(
                    label="Quality Threshold",
                    minimum=0.1, maximum=1.0, step=0.05,
                    value=QUALITY_THRESHOLD
                )

                gr.Markdown("### 📈 Sentiment Smoothing")

                sentiment_smoothing = gr.Checkbox(
                    label="Enable sentiment smoothing",
                    value=ENABLE_SENTIMENT_SMOOTHING,
                    info="Smooth emotional transitions"
                )

                smoothing_window = gr.Slider(
                    label="Window Size",
                    minimum=1, maximum=10, step=1,
                    value=SENTIMENT_SMOOTHING_WINDOW
                )

                smoothing_method = gr.Dropdown(
                    label="Smoothing Method",
                    choices=["rolling", "exp_decay"],
                    value=SENTIMENT_SMOOTHING_METHOD
                )

                gr.Markdown("### 🔍 Advanced Detection")

                mfcc_validation = gr.Checkbox(
                    label="MFCC spectral analysis",
                    value=ENABLE_MFCC_VALIDATION,
                    info="Advanced audio quality detection"
                )

                output_validation = gr.Checkbox(
                    label="Output validation",
                    value=ENABLE_OUTPUT_VALIDATION,
                    info="Quality control clearinghouse for enabled checks"
                )

                spectral_threshold = gr.Slider(
                    label="Spectral Threshold",
                    minimum=0.1, maximum=1.0, step=0.05,
                    value=SPECTRAL_ANOMALY_THRESHOLD
                )

                output_threshold = gr.Slider(
                    label="Output Threshold",
                    minimum=0.1, maximum=1.0, step=0.05,
                    value=OUTPUT_VALIDATION_THRESHOLD
                )

        # TTS Parameters
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🎛️ TTS Parameters")

                exaggeration = gr.Slider(
                    label="Exaggeration",
                    minimum=TTS_PARAM_MIN_EXAGGERATION,
                    maximum=TTS_PARAM_MAX_EXAGGERATION,
                    step=0.1,
                    value=DEFAULT_EXAGGERATION,
                    info="Emotional intensity"
                )

                cfg_weight = gr.Slider(
                    label="CFG Weight",
                    minimum=TTS_PARAM_MIN_CFG_WEIGHT,
                    maximum=TTS_PARAM_MAX_CFG_WEIGHT,
                    step=0.1,
                    value=DEFAULT_CFG_WEIGHT,
                    info="Text faithfulness"
                )

                temperature = gr.Slider(
                    label="Temperature",
                    minimum=TTS_PARAM_MIN_TEMPERATURE,
                    maximum=TTS_PARAM_MAX_TEMPERATURE,
                    step=0.1,
                    value=DEFAULT_TEMPERATURE,
                    info="Creativity/randomness"
                )

            with gr.Column():
                gr.Markdown("### ⚡ Advanced Sampling")

                min_p = gr.Slider(
                    label="Min-P",
                    minimum=0.0, maximum=0.5, step=0.01,
                    value=0.05,
                    info="Minimum probability threshold"
                )

                top_p = gr.Slider(
                    label="Top-P",
                    minimum=0.5, maximum=1.0, step=0.1,
                    value=1.0,
                    info="Nucleus sampling"
                )

                repetition_penalty = gr.Slider(
                    label="Repetition Penalty",
                    minimum=1.0, maximum=3.0, step=0.1,
                    value=2.0,
                    info="Reduce repetition"
                )

                gr.Markdown("### ⚙️ Performance Settings")

                max_workers = gr.Number(
                    label="Max Workers",
                    minimum=1, maximum=8, step=1,
                    value=2,
                    info="⚠️ Only increase above 2 if CPU/GPU utilization < 70%"
                )

        # Action Buttons and Status
        with gr.Row():
            with gr.Column(scale=2):
                convert_btn = gr.Button(
                    "🚀 Start Conversion",
                    variant="primary",
                    size="lg",
                    interactive=True
                )

                # Status Display
                status_display = gr.Textbox(
                    label="Status",
                    value="⏸ Ready",
                    interactive=False,
                    lines=1
                )

                progress_display = gr.Number(
                    label="Progress %",
                    value=0,
                    interactive=False,
                    precision=0
                )

            with gr.Column(scale=1):
                gr.Markdown("### 📊 Processing Stats")

                realtime_factor = gr.Textbox(
                    label="Realtime Factor",
                    value="--",
                    interactive=False
                )

                vram_usage = gr.Textbox(
                    label="VRAM Usage",
                    value="-- GB",
                    interactive=False
                )

                current_chunk = gr.Textbox(
                    label="Current Chunk",
                    value="--",
                    interactive=False
                )

        # Regenerate M4B Section (moved above audiobook player)
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🔄 Regenerate M4B")

                with gr.Row():
                    with gr.Column(scale=2):
                        m4b_file_selector = gr.Dropdown(
                            label="Select M4B File to Regenerate",
                            choices=[],
                            value=None,
                            interactive=True,
                            info="Choose from generated audiobook files"
                        )

                    with gr.Column(scale=1):
                        playback_speed = gr.Slider(
                            label="Playback Speed",
                            minimum=0.5,
                            maximum=2.0,
                            step=0.1,
                            value=1.0,
                            info="Speed adjustment for regeneration"
                        )

                regenerate_m4b_btn = gr.Button(
                    "🔄 Regenerate M4B",
                    variant="secondary",
                    size="lg"
                )

        # Generated Audiobook Player (simplified, play-only)
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🎧 Generated Audiobook Player")

                # Audiobook file selector dropdown
                audiobook_selector = gr.Dropdown(
                    label="Select Audiobook",
                    choices=[],
                    value=None,
                    interactive=True,
                    info="Choose from session audiobooks"
                )

                # Main audio player - play only, no upload
                audio_player = gr.Audio(
                    label="Audiobook Player",
                    value=None,
                    interactive=False,
                    show_download_button=True,
                    show_share_button=False,
                    waveform_options=gr.WaveformOptions(
                        show_controls=True,
                        show_recording_waveform=False,
                        skip_length=10
                    )
                )

    # Event Handlers
    def handle_voice_upload(voice_file):
        """Handle voice file upload and show player"""
        if voice_file is None:
            return gr.update(value=None, visible=False)

        # Show the voice player with uploaded file
        return gr.update(value=voice_file, visible=True)

    def get_session_audiobooks():
        """Get list of M4B files from current session, sorted by modification time (newest first)"""
        audiobooks = []

        # Look in Audiobook directory for M4B files
        audiobook_root = Path("Audiobook")
        if audiobook_root.exists():
            for book_dir in audiobook_root.iterdir():
                if book_dir.is_dir():
                    # Look for M4B files in book directory
                    for m4b_file in book_dir.glob("*.m4b"):
                        # st_mtime (last modification) stands in for creation time here
                        creation_time = m4b_file.stat().st_mtime
                        audiobooks.append((str(m4b_file), m4b_file.name, creation_time))

        # Also check Output directory
        output_root = Path("Output")
        if output_root.exists():
            for m4b_file in output_root.glob("*.m4b"):
                creation_time = m4b_file.stat().st_mtime
                audiobooks.append((str(m4b_file), m4b_file.name, creation_time))

        # Sort newest first
        audiobooks.sort(key=lambda x: x[2], reverse=True)

        # Return just path and name (drop the timestamp)
        return [(ab[0], ab[1]) for ab in audiobooks]
    def update_audiobook_dropdowns(latest_file=None):
        """Update audiobook dropdowns - after conversion both show the latest file; after regeneration only playback updates"""
        audiobooks = get_session_audiobooks()
        choices = [ab[1] for ab in audiobooks]  # Just filenames for display

        # Determine what to set as selected
        if latest_file:
            # Use specific file if provided
            selected_file = latest_file
        elif choices:
            # Default to newest file (first in sorted list)
            selected_file = choices[0]
        else:
            selected_file = None

        return (
            gr.update(choices=choices, value=selected_file),  # audiobook_selector (playback)
            gr.update(choices=choices, value=selected_file)   # m4b_file_selector (regeneration source)
        )

    def update_audiobook_dropdowns_after_conversion():
        """Update both dropdowns to show the newest generated file after conversion"""
        return update_audiobook_dropdowns()

    def update_playback_only(new_file_name):
        """Update only the playback dropdown after regeneration"""
        audiobooks = get_session_audiobooks()
        choices = [ab[1] for ab in audiobooks]

        return (
            gr.update(choices=choices, value=new_file_name),  # audiobook_selector (playback) - new file
            gr.update()  # m4b_file_selector (regeneration) - no change
        )

    def load_selected_audiobook(selected_audiobook):
        """Load selected audiobook into player"""
        if not selected_audiobook:
            return None

        # Find the full path for the selected audiobook
        audiobooks = get_session_audiobooks()
        for full_path, filename in audiobooks:
            if filename == selected_audiobook:
                return full_path

        return None
    def handle_asr_toggle(asr_enabled_val):
        """Show/hide ASR configuration when ASR is toggled"""
        return gr.update(visible=asr_enabled_val)

    def analyze_system():
        """Analyze system capabilities and return summary"""
        try:
            from modules.system_detector import get_system_profile, print_system_summary, categorize_system

            profile = get_system_profile()
            categories = categorize_system(profile)

            summary = f"🖥️ System Profile:\n"
            summary += f"VRAM: {profile['gpu']['total_mb']:,}MB total, {profile['available_vram_after_tts']:,}MB available after TTS ({categories['vram']} class)\n"
            summary += f"RAM: {profile['ram']['total_mb']:,}MB total, {profile['ram']['available_mb']:,}MB available ({categories['ram']} class)\n"
            summary += f"CPU: {profile['cpu_cores']} cores ({categories['cpu']} class)"

            if not profile['has_gpu']:
                summary += f"\n⚠️ No CUDA GPU detected - ASR will run on CPU only"

            return summary

        except Exception as e:
            return f"❌ Error analyzing system: {str(e)}"

    def update_asr_models(asr_level_val):
        """Update ASR model display based on selected level"""
        try:
            from modules.system_detector import get_system_profile, recommend_asr_models

            profile = get_system_profile()
            recommendations = recommend_asr_models(profile)

            if asr_level_val not in recommendations:
                return "❌ Invalid ASR level selected"

            config = recommendations[asr_level_val]
            primary = config['primary']
            fallback = config['fallback']

            result = f"Primary: {primary['model']} on {primary['device'].upper()}\n"
            result += f"Fallback: {fallback['model']} on {fallback['device'].upper()}"

            if asr_level_val == 'insane':
                result += f"\n⚠️ WARNING: INSANE mode may cause memory pressure"

            return result

        except Exception as e:
            return f"❌ Error getting models: {str(e)}"
    def start_conversion(text_file_upload, voice_file_upload,
                         vader_val, asr_val, asr_level_val, add_to_batch_val,
                         regen_enabled_val, max_attempts_val, quality_thresh_val,
                         sentiment_smooth_val, smooth_window_val, smooth_method_val,
                         mfcc_val, output_val, spectral_thresh_val, output_thresh_val,
                         exag_val, cfg_val, temp_val, min_p_val, top_p_val, rep_penalty_val,
                         max_workers_val):
        """Start the actual book conversion - file upload version"""

        # Validation (five return values to match the five click outputs)
        if not text_file_upload:
            return "❌ Please upload a text file", 0, None, gr.update(), gr.update()
        if not voice_file_upload:
            return "❌ Please upload a voice sample", 0, None, gr.update(), gr.update()

        # Check if already running
        if conversion_state['running']:
            return "⚠️ Conversion already in progress", conversion_state['progress'], None, gr.update(), gr.update()

        try:
            # Create temporary book structure from uploads
            import tempfile
            import shutil
            from datetime import datetime

            # Generate unique book name from text file
            text_filename = Path(text_file_upload).name
            book_name = text_filename.replace('.txt', '').replace(' ', '_')
            timestamp = datetime.now().strftime("%H%M%S")
            unique_book_name = f"{book_name}_{timestamp}"

            # Create directory structure
            text_input_dir = Path("Text_Input")
            text_input_dir.mkdir(exist_ok=True)

            book_dir = text_input_dir / unique_book_name
            book_dir.mkdir(exist_ok=True)

            # Copy uploaded files to expected locations
            text_dest = book_dir / f"{unique_book_name}.txt"
            shutil.copy2(text_file_upload, text_dest)

            voice_samples_dir = Path("Voice_Samples")
            voice_samples_dir.mkdir(exist_ok=True)

            voice_filename = Path(voice_file_upload).name
            voice_dest = voice_samples_dir / voice_filename
            shutil.copy2(voice_file_upload, voice_dest)

            print(f"📁 Created book structure: {book_dir}")
            print(f"📄 Text file: {text_dest}")
            print(f"🎤 Voice file: {voice_dest}")

        except Exception as e:
            return f"❌ Error setting up files: {e}", 0, None, gr.update(), gr.update()

        # Build ASR configuration first
        asr_config = {'enabled': False}
        if asr_val:
            try:
                from modules.system_detector import get_system_profile, recommend_asr_models
                profile = get_system_profile()
                recommendations = recommend_asr_models(profile)

                if asr_level_val in recommendations:
                    selected_config = recommendations[asr_level_val]
                    primary = selected_config['primary']
                    fallback = selected_config['fallback']

                    asr_config = {
                        'enabled': True,
                        'level': asr_level_val,
                        'primary_model': primary['model'],
                        'primary_device': primary['device'],
                        'fallback_model': fallback['model'],
                        'fallback_device': fallback['device']
                    }
            except Exception as e:
                print(f"⚠️ Error configuring ASR: {e}")
                asr_config = {'enabled': False}

        # Prepare parameters (matching GUI structure exactly)
        tts_params = {
            'exaggeration': exag_val,
            'cfg_weight': cfg_val,
            'temperature': temp_val,
            'min_p': min_p_val,
            'top_p': top_p_val,
            'repetition_penalty': rep_penalty_val,
            'enable_asr': asr_config.get('enabled', False),  # Match GUI pattern
            'max_workers': int(max_workers_val)  # User-defined worker count
        }

        quality_params = {
            'regeneration_enabled': regen_enabled_val,
            'max_attempts': max_attempts_val,
            'quality_threshold': quality_thresh_val,
            'sentiment_smoothing': sentiment_smooth_val,
            'smoothing_window': smooth_window_val,
            'smoothing_method': smooth_method_val,
            'mfcc_validation': mfcc_val,
            'output_validation': output_val,
            'spectral_threshold': spectral_thresh_val,
            'output_threshold': output_thresh_val
        }

        config_params = {
            'vader_enabled': vader_val,
            'asr_enabled': asr_val,
            'asr_config': asr_config,
            'add_to_batch': add_to_batch_val
        }

        # Set conversion state
        conversion_state['running'] = True
        conversion_state['progress'] = 0
        conversion_state['status'] = 'Starting conversion...'
        conversion_state['current_book'] = book_dir.name  # Track current book

        try:
            # Run conversion using the modular backend in a separate thread
            import threading

            def run_conversion_thread():
                try:
                    result = run_book_conversion(
                        str(book_dir), str(text_dest), str(voice_dest),
                        tts_params, quality_params, config_params
                    )

                    if result['success']:
                        conversion_state['status'] = '🎉 CONVERSION COMPLETE! M4B audiobook ready for playback.'
                        conversion_state['progress'] = 100
                        conversion_state['auto_refresh_needed'] = True  # Flag for auto-refresh
                    else:
                        conversion_state['status'] = f"❌ Conversion failed: {result.get('error', 'Unknown error')}"
                        conversion_state['progress'] = 0

                except Exception as e:
                    conversion_state['status'] = f"❌ Error: {str(e)}"
                    conversion_state['progress'] = 0
                finally:
                    # The worker thread owns the 'running' flag from here on;
                    # clearing it in the outer function would immediately undo
                    # the flag and break the "already in progress" check.
                    conversion_state['running'] = False

            # Start conversion thread
            thread = threading.Thread(target=run_conversion_thread)
            thread.start()

            # Return immediate response - user will need to refresh to see final results
            return (
                "🚀 Conversion started in background...",
                5,  # Initial progress
                None,
                gr.update(),
                gr.update()
            )

        except Exception as e:
            conversion_state['status'] = f"❌ Error: {str(e)}"
            conversion_state['running'] = False
            return conversion_state['status'], 0, None, gr.update(), gr.update()
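
`conversion_state` is a plain module-level dict mutated from both the Gradio request thread and the background worker, and polled by the timer further down. Single-key updates are effectively atomic in CPython, but multi-field updates (status plus progress plus flags) can in principle be observed half-applied. A possible hardening, not part of this commit; the helper name is illustrative only:

    # Optional hardening (sketch): guard multi-field updates to the shared
    # conversion_state dict so the polling callback never sees a half-updated
    # status/progress pair.
    import threading

    state_lock = threading.Lock()

    def set_state(**fields):
        """Atomically apply several conversion_state updates."""
        with state_lock:
            conversion_state.update(fields)

    # e.g. inside run_conversion_thread:
    # set_state(status='CONVERSION COMPLETE!', progress=100, auto_refresh_needed=True)
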
    # Connect event handlers

    # ASR event handlers
    asr_enabled.change(
        handle_asr_toggle,
        inputs=[asr_enabled],
        outputs=[asr_config_group]
    )

    analyze_system_btn.click(
        analyze_system,
        inputs=[],
        outputs=[system_analysis]
    )

    asr_level.change(
        update_asr_models,
        inputs=[asr_level],
        outputs=[selected_models]
    )

    # Voice upload handler
    voice_file_upload.change(
        handle_voice_upload,
        inputs=[voice_file_upload],
        outputs=[voice_audio]
    )

    # Main conversion handler
    convert_btn.click(
        start_conversion,
        inputs=[
            text_file_upload, voice_file_upload,
            vader_enabled, asr_enabled, asr_level, add_to_batch,
            regeneration_enabled, max_attempts, quality_threshold,
            sentiment_smoothing, smoothing_window, smoothing_method,
            mfcc_validation, output_validation, spectral_threshold, output_threshold,
            exaggeration, cfg_weight, temperature, min_p, top_p, repetition_penalty,
            max_workers
        ],
        outputs=[status_display, progress_display, audio_player, audiobook_selector, m4b_file_selector]
    )

    # Audiobook selector handler
    audiobook_selector.change(
        load_selected_audiobook,
        inputs=[audiobook_selector],
        outputs=[audio_player]
    )

    # M4B regeneration handler
    def handle_m4b_regeneration(selected_m4b, speed):
        """Handle M4B regeneration and update player"""
        status_msg, new_m4b_path = regenerate_m4b_file(selected_m4b, speed)

        if new_m4b_path:
            # Load the new M4B in the player
            new_file_name = Path(new_m4b_path).name
            new_audio = load_selected_audiobook(new_file_name)
            # Update only playback dropdown, keep regeneration dropdown on source file
            audiobook_choices, m4b_choices = update_playback_only(new_file_name)
            return status_msg, new_audio, audiobook_choices, m4b_choices
        else:
            return status_msg, None, gr.update(), gr.update()

    regenerate_m4b_btn.click(
        handle_m4b_regeneration,
        inputs=[m4b_file_selector, playback_speed],
        outputs=[status_display, audio_player, audiobook_selector, m4b_file_selector]
    )
    # Progress monitoring with file-based approach
    def get_current_stats():
        """Get current progress statistics by monitoring output files"""
        try:
            if conversion_state['running']:
                # Look for generated audio chunks to estimate progress
                book_name = conversion_state.get('current_book', 'unknown')
                audiobook_root = Path("Audiobook") / book_name / "TTS" / "audio_chunks"

                if audiobook_root.exists():
                    chunk_files = list(audiobook_root.glob("chunk_*.wav"))
                    current_chunks = len(chunk_files)

                    # Try to estimate total from JSON if available
                    json_path = Path("Text_Input") / f"{book_name}_chunks.json"
                    total_chunks = 0
                    if json_path.exists():
                        import json
                        with open(json_path, 'r') as f:
                            data = json.load(f)
                            total_chunks = len(data)

                    if total_chunks > 0:
                        progress = int((current_chunks / total_chunks) * 100)
                        conversion_state['progress'] = progress
                        conversion_state['current_chunk'] = f"{current_chunks}/{total_chunks}"

                        return (
                            conversion_state.get('realtime_factor', '--'),
                            conversion_state.get('vram_usage', '-- GB'),
                            f"{current_chunks}/{total_chunks}",
                            progress
                        )

            return (
                conversion_state.get('realtime_factor', '--'),
                conversion_state.get('vram_usage', '-- GB'),
                conversion_state.get('current_chunk', '--'),
                conversion_state.get('progress', 0)
            )
        except Exception as e:
            print(f"Error getting stats: {e}")
            return "--", "-- GB", "--", conversion_state.get('progress', 0)

    def auto_check_completion():
        """Automatically check for completion and refresh interface"""
        # First get current stats
        stats = get_current_stats()

        # Check if conversion just completed and needs auto-refresh
        if (not conversion_state['running'] and
                conversion_state['progress'] == 100 and
                conversion_state.get('auto_refresh_needed', False)):

            # Clear the auto-refresh flag
            conversion_state['auto_refresh_needed'] = False
            print("🔄 Auto-detected completion! Refreshing interface...")

            # Get completion results
            status, progress, audio, audiobook_choices, m4b_choices = get_status_and_results()

            # Return combined stats + completion results
            return (
                stats[0],           # realtime_factor
                stats[1],           # vram_usage
                stats[2],           # current_chunk
                100,                # progress (completed)
                status,             # completion status
                audio,              # audio player
                audiobook_choices,  # audiobook dropdown
                m4b_choices         # m4b dropdown
            )
        else:
            # Return stats + current status (no completion)
            return (
                stats[0],  # realtime_factor
                stats[1],  # vram_usage
                stats[2],  # current_chunk
                stats[3],  # progress
                conversion_state.get('status', '⏸ Ready'),  # current status
                gr.update(),  # no audio update
                gr.update(),  # no audiobook update
                gr.update()   # no m4b update
            )

    def get_status_and_results():
        """Get conversion status and results after completion"""
        if not conversion_state['running'] and conversion_state['progress'] == 100:
            # Conversion completed, update dropdowns
            audiobook_choices, m4b_choices = update_audiobook_dropdowns_after_conversion()
            latest_audiobook = None
            if audiobook_choices['choices']:
                latest_audiobook = load_selected_audiobook(audiobook_choices['choices'][0])

            return (
                conversion_state['status'],
                conversion_state['progress'],
                latest_audiobook,
                audiobook_choices,
                m4b_choices
            )
        else:
            return (
                conversion_state['status'],
                conversion_state['progress'],
                None,
                gr.update(),
                gr.update()
            )

    # Create refresh buttons
    with gr.Row():
        refresh_stats_btn = gr.Button("🔄 Refresh Stats", size="sm", variant="secondary")
        check_completion_btn = gr.Button("🔍 Check Completion", size="sm", variant="secondary")

    # Auto-refresh timer (checks every 5 seconds during conversion)
    auto_timer = gr.Timer(5.0)  # 5 second interval

    refresh_stats_btn.click(
        auto_check_completion,
        outputs=[realtime_factor, vram_usage, current_chunk, progress_display, status_display, audio_player, audiobook_selector, m4b_file_selector]
    )

    check_completion_btn.click(
        get_status_and_results,
        outputs=[status_display, progress_display, audio_player, audiobook_selector, m4b_file_selector]
    )

    # Auto-timer for progress monitoring and completion detection
    auto_timer.tick(
        auto_check_completion,
        outputs=[realtime_factor, vram_usage, current_chunk, progress_display, status_display, audio_player, audiobook_selector, m4b_file_selector]
    )

    return {
        'convert_button': convert_btn,
        'status_display': status_display,
        'progress': progress_display
    }

if __name__ == "__main__":
    # Test the tab
    with gr.Blocks() as demo:
        create_convert_book_tab()

    demo.launch()
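
All live feedback in this tab flows through one polling callback: `gr.Timer` fires every five seconds, calls `auto_check_completion`, and fans the result out to eight components. A stripped-down, self-contained illustration of that pattern (a simplified sketch, not the tab's actual wiring):

    # Minimal standalone illustration of the gr.Timer polling pattern above.
    import gradio as gr
    import time

    started = time.time()

    def poll():
        # The real tab reads conversion_state and counts chunk_*.wav files here.
        return f"{time.time() - started:.0f}s elapsed"

    with gr.Blocks() as demo:
        status = gr.Textbox(label="Status")
        timer = gr.Timer(5.0)               # fire every 5 seconds
        timer.tick(poll, outputs=[status])  # each tick refreshes the outputs

    # demo.launch()
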
gradio_tabs/tab2_configuration.py
CHANGED

@@ -11,15 +11,23 @@ import json
 from pathlib import Path
 from typing import Dict, Any, Tuple, List

-# Import configuration
+# Import configuration with HuggingFace deployment compatibility
 try:
-    from
-    CONFIG_AVAILABLE =
+    from .gradio_imports import safe_import_config, get_default_config
+    config_vars, CONFIG_AVAILABLE = safe_import_config()
+    if CONFIG_AVAILABLE:
+        print("✅ Config module loaded successfully")
+        # Update local variables with config values
+        locals().update(config_vars)
+    else:
+        print("⚠️ Config not available - using defaults")
+        # Get default values
+        default_config = get_default_config()
+        locals().update(default_config)
 except ImportError as e:
-    print(f"⚠️
+    print(f"⚠️ Import system not available: {e} - using fallback defaults")
     CONFIG_AVAILABLE = False
-    #
+    # Fallback default values if gradio_imports not available
     MAX_WORKERS = 2
     BATCH_SIZE = 100
     MIN_CHUNK_WORDS = 5

@@ -391,11 +399,19 @@
         'CHUNK_END_SILENCE_MS': int(values[30]) if values[29] else 0
     }

-    # Import the config module and update values
+    # Import the config module and update values using safe import
+    try:
+        from .gradio_imports import safe_import
+        config_module = safe_import('config', 'config')
+        for key, value in config_values.items():
+            if hasattr(config_module, key):
+                setattr(config_module, key, value)
+    except ImportError:
+        # Fallback to direct import
+        from config import config
+        for key, value in config_values.items():
+            if hasattr(config, key):
+                setattr(config, key, value)

     return "✅ Configuration saved successfully!\n🔄 Settings updated in memory. Restart application to persist changes."

@@ -450,44 +466,51 @@
     if not CONFIG_AVAILABLE:
         return "❌ Configuration module not available"

-    # Reload config module
+    # Reload config module using safe import
     import importlib
+    try:
+        from .gradio_imports import safe_import
+        config_module = safe_import('config', 'config')
+        importlib.reload(config_module)
+    except ImportError:
+        # Fallback to direct import
+        from config import config
+        config_module = config
+        importlib.reload(config)

     # Return reloaded values
     return (
+        config_module.MAX_WORKERS,
+        config_module.BATCH_SIZE,
+        config_module.MIN_CHUNK_WORDS,
+        config_module.MAX_CHUNK_WORDS,
+        config_module.ENABLE_NORMALIZATION,
+        config_module.TARGET_LUFS,
+        config_module.ENABLE_AUDIO_TRIMMING,
+        config_module.SPEECH_ENDPOINT_THRESHOLD,
+        config_module.TRIMMING_BUFFER_MS,
+        config_module.TTS_PARAM_MIN_EXAGGERATION,
+        config_module.TTS_PARAM_MAX_EXAGGERATION,
+        config_module.TTS_PARAM_MIN_CFG_WEIGHT,
+        config_module.TTS_PARAM_MAX_CFG_WEIGHT,
+        config_module.TTS_PARAM_MIN_TEMPERATURE,
+        config_module.TTS_PARAM_MAX_TEMPERATURE,
+        config_module.DEFAULT_EXAGGERATION,
+        config_module.DEFAULT_CFG_WEIGHT,
+        config_module.DEFAULT_TEMPERATURE,
+        config_module.VADER_EXAGGERATION_SENSITIVITY,
+        config_module.VADER_CFG_WEIGHT_SENSITIVITY,
+        config_module.VADER_TEMPERATURE_SENSITIVITY,
+        config_module.SILENCE_CHAPTER_START,
+        config_module.SILENCE_CHAPTER_END,
+        config_module.SILENCE_SECTION_BREAK,
+        config_module.SILENCE_PARAGRAPH_END,
+        config_module.SILENCE_COMMA,
+        config_module.SILENCE_PERIOD,
+        config_module.SILENCE_QUESTION_MARK,
+        config_module.SILENCE_EXCLAMATION,
+        config_module.CHUNK_END_SILENCE_MS > 0,
+        config_module.CHUNK_END_SILENCE_MS,
         "✅ Configuration reloaded from file"
     )
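
`safe_import_config` and `safe_import` come from the new `gradio_tabs/gradio_imports.py`, which this commit adds but which is not shown in this excerpt. Judging by the call sites above, `safe_import_config` returns a `(values_dict, available_flag)` pair and `safe_import` returns a module object; a plausible sketch under those assumptions, not the file's actual contents. Note also that the `locals().update(...)` calls in the new hunk work only because they run at module scope, where `locals()` is the module's `globals()`:

    # Plausible shape for gradio_tabs/gradio_imports.py, inferred from the
    # call sites above (hypothetical sketch).
    import importlib

    def safe_import(module_name, package=None):
        """Import a module, falling back to a package-qualified name."""
        try:
            return importlib.import_module(module_name)
        except ImportError:
            return importlib.import_module(f"{package}.{module_name}")

    def safe_import_config():
        """Collect UPPER_CASE settings from config/config.py as (values, ok)."""
        try:
            config = safe_import("config.config", "config")
            values = {name: getattr(config, name) for name in dir(config) if name.isupper()}
            return values, True
        except ImportError:
            return {}, False
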
gradio_tabs/tab_diagnostics.py
ADDED

@@ -0,0 +1,558 @@
#!/usr/bin/env python3
"""
Gradio Diagnostics Tab
Run parallel processing diagnostics through web interface
"""

import gradio as gr
import time
import threading
import multiprocessing
import concurrent.futures
import os
import sys
import torch
from pathlib import Path
import io
from contextlib import redirect_stdout

# Try to import psutil, fallback if not available
try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

class DiagnosticRunner:
    def __init__(self):
        self.running = False

    def test_basic_multiprocessing(self):
        """Test 1: Basic multiprocessing capability"""
        output = []
        output.append("=== TEST 1: Basic Multiprocessing ===")

        def simple_task(n):
            return n * n

        try:
            # Sequential
            start = time.time()
            results_seq = [simple_task(i) for i in range(100)]
            seq_time = time.time() - start
            output.append(f"Sequential: {seq_time:.3f}s")

            # Parallel
            start = time.time()
            with multiprocessing.Pool(processes=4) as pool:
                results_par = pool.map(simple_task, range(100))
            par_time = time.time() - start
            output.append(f"Parallel (4 workers): {par_time:.3f}s")
            output.append(f"Speedup: {seq_time/par_time:.2f}x")

        except Exception as e:
            output.append(f"ERROR: {e}")

        output.append("")
        return "\n".join(output)
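
One caveat when reading Test 1 (and the process-based parts of Tests 2 and 6): `simple_task` is defined inside a method, and process pools must pickle the task function. Under Linux's default fork start method this works, but on spawn-based platforms (Windows, macOS, some hosted runtimes) it raises a pickling error, which these tests surface as ERROR rather than a timing. A spawn-safe variant would hoist the worker to module level; a sketch with an illustrative name:

    # Spawn-safe variant (sketch): the worker must live at module level so it
    # can be pickled by spawn-based multiprocessing.
    def simple_task_global(n):
        return n * n

    def run_pool_test():
        import multiprocessing, time
        start = time.time()
        with multiprocessing.Pool(processes=4) as pool:
            results = pool.map(simple_task_global, range(100))
        return time.time() - start, results
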
    def test_thread_vs_process(self):
        """Test 2: Threading vs Processing"""
        output = []
        output.append("=== TEST 2: Threading vs Processing ===")

        def cpu_task(n):
            total = 0
            for i in range(n * 1000):
                total += i * i
            return total

        try:
            tasks = [1000] * 8

            # Sequential
            start = time.time()
            seq_results = [cpu_task(t) for t in tasks]
            seq_time = time.time() - start
            output.append(f"Sequential: {seq_time:.3f}s")

            # Threading
            start = time.time()
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                thread_results = list(executor.map(cpu_task, tasks))
            thread_time = time.time() - start
            output.append(f"ThreadPool: {thread_time:.3f}s, speedup: {seq_time/thread_time:.2f}x")

            # Processing
            start = time.time()
            with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
                process_results = list(executor.map(cpu_task, tasks))
            process_time = time.time() - start
            output.append(f"ProcessPool: {process_time:.3f}s, speedup: {seq_time/process_time:.2f}x")

        except Exception as e:
            output.append(f"ERROR: {e}")

        output.append("")
        return "\n".join(output)
    def test_gpu_access(self):
        """Test 3: GPU sharing capability"""
        output = []
        output.append("=== TEST 3: GPU Access ===")

        if not torch.cuda.is_available():
            output.append("No CUDA available - skipping GPU test")
            output.append("")
            return "\n".join(output)

        def gpu_task(worker_id):
            try:
                device = torch.device("cuda")
                x = torch.randn(1000, 1000, device=device)
                y = torch.randn(1000, 1000, device=device)
                start = time.time()
                for _ in range(10):
                    z = torch.mm(x, y)
                duration = time.time() - start
                return f"Worker {worker_id}: {duration:.3f}s"
            except Exception as e:
                return f"Worker {worker_id}: ERROR - {e}"

        try:
            # Sequential GPU access
            start = time.time()
            seq_results = [gpu_task(i) for i in range(4)]
            seq_time = time.time() - start
            output.append("Sequential GPU:")
            for result in seq_results:
                output.append(f"  {result}")
            output.append(f"Total sequential time: {seq_time:.3f}s")

            # Parallel GPU access
            start = time.time()
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                par_results = list(executor.map(gpu_task, range(4)))
            par_time = time.time() - start
            output.append("Parallel GPU:")
            for result in par_results:
                output.append(f"  {result}")
            output.append(f"Total parallel time: {par_time:.3f}s")

        except Exception as e:
            output.append(f"ERROR: {e}")

        output.append("")
        return "\n".join(output)
    def test_model_loading(self):
        """Test 4: Model loading overhead"""
        output = []
        output.append("=== TEST 4: Model Loading Simulation ===")

        def load_model():
            time.sleep(0.5)  # 500ms loading time
            return {"model": "loaded", "size": "large"}

        def task_with_model_loading(worker_id):
            start = time.time()
            model = load_model()
            processing_time = 0.1
            time.sleep(processing_time)
            total_time = time.time() - start
            return f"Worker {worker_id}: {total_time:.3f}s"

        try:
            output.append("Each worker loads model:")
            start = time.time()
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                results = list(executor.map(task_with_model_loading, range(4)))
            total_time = time.time() - start

            for result in results:
                output.append(f"  {result}")
            output.append(f"Total time with per-worker loading: {total_time:.3f}s")

            shared_load_time = 0.5
            processing_time = 0.1 * 4
            simulated_shared_time = shared_load_time + processing_time
            output.append(f"Simulated shared model time: {simulated_shared_time:.3f}s")
            output.append(f"Overhead from per-worker loading: {total_time - simulated_shared_time:.3f}s")

        except Exception as e:
            output.append(f"ERROR: {e}")

        output.append("")
        return "\n".join(output)
    def test_environment_info(self):
        """Test 5: Environment information"""
        output = []
        output.append("=== TEST 5: Environment Info ===")

        try:
            output.append(f"Python version: {sys.version}")
            output.append(f"Platform: {sys.platform}")
            output.append(f"CPU cores: {multiprocessing.cpu_count()}")

            if PSUTIL_AVAILABLE:
                output.append(f"CPU usage: {psutil.cpu_percent()}%")
                output.append(f"Memory: {psutil.virtual_memory().percent}% used")
            else:
                output.append("psutil not available - limited system info")

            if torch.cuda.is_available():
                output.append(f"CUDA available: Yes")
                output.append(f"CUDA devices: {torch.cuda.device_count()}")
                output.append(f"Current device: {torch.cuda.current_device()}")
                output.append(f"Device name: {torch.cuda.get_device_name()}")
                if hasattr(torch.cuda, 'memory_summary'):
                    output.append("GPU Memory:")
                    output.append(torch.cuda.memory_summary(abbreviated=True))
            else:
                output.append("CUDA available: No")

            mp_vars = [
                'OMP_NUM_THREADS', 'MKL_NUM_THREADS', 'OPENBLAS_NUM_THREADS',
                'VECLIB_MAXIMUM_THREADS', 'NUMEXPR_NUM_THREADS'
            ]
            output.append("Threading environment variables:")
            for var in mp_vars:
                value = os.environ.get(var, 'Not set')
                output.append(f"  {var}: {value}")

        except Exception as e:
            output.append(f"ERROR: {e}")

        output.append("")
        return "\n".join(output)
    def test_worker_creation(self):
        """Test 6: Worker creation monitoring"""
        output = []
        output.append("=== TEST 6: Worker Creation ===")

        def monitored_task(worker_id):
            pid = os.getpid()
            tid = threading.get_ident()
            return f"Worker {worker_id}: PID={pid}, TID={tid}"

        try:
            output.append("Main process:")
            output.append(f"  PID: {os.getpid()}")
            output.append(f"  TID: {threading.get_ident()}")

            output.append("ThreadPoolExecutor workers:")
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                results = list(executor.map(monitored_task, range(4)))
            for result in results:
                output.append(f"  {result}")

            output.append("ProcessPoolExecutor workers:")
            with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
                results = list(executor.map(monitored_task, range(4)))
            for result in results:
                output.append(f"  {result}")

        except Exception as e:
            output.append(f"ERROR: {e}")

        output.append("")
        return "\n".join(output)
    def test_tts_model_performance(self):
        """Test 7: Real TTS model performance"""
        output = []
        output.append("=== TEST 7: TTS Model Performance ===")

        try:
            # Import TTS components
            sys.path.append(str(Path(__file__).parent.parent))
            from modules.tts_engine import load_optimized_model, detect_deployment_environment

            # Detect environment
            env = detect_deployment_environment()
            output.append(f"🌍 Environment: {env}")

            # Test 1: Model loading time
            output.append("\n--- MODEL LOADING TEST ---")
            device = "cuda" if torch.cuda.is_available() else "cpu"
            output.append(f"🚀 Loading model on {device}...")

            start_time = time.time()
            model = load_optimized_model(device)
            load_time = time.time() - start_time
            output.append(f"⏱️ Model load time: {load_time:.2f}s")

            # Test 2: Single inference timing
            output.append("\n--- SINGLE INFERENCE TEST ---")
            test_text = "Hello world, this is a test."

            # Warmup run
            try:
                with torch.no_grad():
                    _ = model.generate(test_text, exaggeration=0.5, cfg_weight=0.5, temperature=0.7)
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                output.append("✅ Warmup completed")
            except Exception as e:
                output.append(f"⚠️ Warmup failed: {e}")

            # Timed run
            start_time = time.time()
            try:
                with torch.no_grad():
                    audio = model.generate(test_text, exaggeration=0.5, cfg_weight=0.5, temperature=0.7)
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                inference_time = time.time() - start_time

                # Calculate realtime factor
                if hasattr(audio, 'shape'):
                    sample_rate = getattr(model, 'sr', 24000)
                    audio_duration = audio.shape[-1] / sample_rate
                    realtime_factor = audio_duration / inference_time if inference_time > 0 else 0
                    output.append(f"⏱️ Inference time: {inference_time:.3f}s")
                    output.append(f"🎵 Audio duration: {audio_duration:.3f}s")
                    output.append(f"📊 Realtime factor: {realtime_factor:.2f}x")

                    # Check if this matches your slow performance
                    if realtime_factor < 0.5:
                        output.append("⚠️ WARNING: Very slow realtime factor!")
                        output.append("   This matches your reported slow performance")
                    elif realtime_factor > 1.0:
                        output.append("✅ Good realtime factor - issue may be elsewhere")
                else:
                    output.append(f"⏱️ Inference time: {inference_time:.3f}s")
                    output.append("⚠️ Could not determine audio duration")

            except Exception as e:
                output.append(f"❌ Inference failed: {e}")

            # Test 3: Multiple sequential runs (simulating current problem)
            output.append("\n--- SEQUENTIAL PROCESSING TEST ---")
            sequential_times = []
            for i in range(3):
                start_time = time.time()
                try:
                    with torch.no_grad():
                        _ = model.generate(f"Test run number {i+1}.", exaggeration=0.5, cfg_weight=0.5, temperature=0.7)
                    if torch.cuda.is_available():
                        torch.cuda.synchronize()
                    run_time = time.time() - start_time
                    sequential_times.append(run_time)
                    output.append(f"  Run {i+1}: {run_time:.3f}s")
                except Exception as e:
                    output.append(f"  Run {i+1} failed: {e}")

            if sequential_times:
                avg_time = sum(sequential_times) / len(sequential_times)
                output.append(f"📊 Average sequential time: {avg_time:.3f}s")

                # Check consistency
                if max(sequential_times) - min(sequential_times) > 0.5:
                    output.append("⚠️ High variance in processing times - possible memory issues")

            # Test 4: Threading test with actual model
            output.append("\n--- THREADING WITH TTS MODEL TEST ---")
            try:
                def tts_worker(text_idx):
                    try:
                        start = time.time()
                        with torch.no_grad():
                            _ = model.generate(f"Threading test {text_idx}.",
                                               exaggeration=0.5, cfg_weight=0.5, temperature=0.7)
                        if torch.cuda.is_available():
                            torch.cuda.synchronize()
                        return time.time() - start
                    except Exception as e:
                        return f"Error: {e}"

                # Test with 2 workers (like current setup)
                start_time = time.time()
                with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                    futures = [executor.submit(tts_worker, i) for i in range(4)]
                    thread_results = [f.result() for f in futures]

                total_thread_time = time.time() - start_time
                output.append(f"⏱️ Threading (2 workers, 4 tasks): {total_thread_time:.3f}s")

                successful_times = [r for r in thread_results if isinstance(r, float)]
                if successful_times:
                    output.append(f"📊 Successful tasks: {len(successful_times)}/4")
                    output.append(f"📊 Average task time: {sum(successful_times)/len(successful_times):.3f}s")

                    # Compare with sequential
                    if sequential_times:
                        expected_sequential = avg_time * 4
                        speedup = expected_sequential / total_thread_time
                        output.append(f"📈 Threading speedup: {speedup:.2f}x")

                        if speedup < 1.2:
                            output.append("⚠️ Threading provides minimal speedup")
                            output.append("   This explains your slow HuggingFace performance!")
                        else:
                            output.append("✅ Threading working well")
                else:
                    output.append("❌ All threading tasks failed")
                    for i, result in enumerate(thread_results):
                        output.append(f"  Task {i+1}: {result}")

            except Exception as e:
                output.append(f"❌ Threading test failed: {e}")

            # Test 5: Model reloading overhead
            output.append("\n--- MODEL RELOADING TEST ---")
            try:
                # Simulate what might be happening in your slow processing
                reload_times = []
                for i in range(3):
                    # Delete and reload model
                    del model
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    start_time = time.time()
                    model = load_optimized_model(device)
                    # Single inference after reload
                    with torch.no_grad():
                        _ = model.generate("Reload test.", exaggeration=0.5, cfg_weight=0.5, temperature=0.7)
                    if torch.cuda.is_available():
                        torch.cuda.synchronize()
                    reload_time = time.time() - start_time
                    reload_times.append(reload_time)
                    output.append(f"  Reload + inference {i+1}: {reload_time:.3f}s")

                avg_reload_time = sum(reload_times) / len(reload_times)
                output.append(f"📊 Average reload + inference: {avg_reload_time:.3f}s")

                if sequential_times and avg_reload_time > avg_time * 2:
                    output.append("⚠️ Model reloading adds significant overhead")
                    output.append("   Workers may be reloading models per chunk!")

            except Exception as e:
                output.append(f"❌ Model reloading test failed: {e}")

            # Cleanup
            try:
                del model
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                output.append("\n✅ Model cleanup completed")
            except:
                pass

        except Exception as e:
            output.append(f"❌ TTS performance test failed: {e}")
            import traceback
            output.append(f"Traceback: {traceback.format_exc()}")

        output.append("")
        return "\n".join(output)
def run_all_diagnostics(self, progress=gr.Progress()):
|
| 454 |
+
"""Run all diagnostic tests"""
|
| 455 |
+
if self.running:
|
| 456 |
+
return "Diagnostics already running..."
|
| 457 |
+
|
| 458 |
+
self.running = True
|
| 459 |
+
|
| 460 |
+
try:
|
| 461 |
+
results = []
|
| 462 |
+
results.append("π Parallel Processing Diagnostic Tool")
|
| 463 |
+
results.append("=" * 50)
|
| 464 |
+
results.append("")
|
| 465 |
+
|
| 466 |
+
# Run each test with progress updates
|
| 467 |
+
progress(0.1, desc="Environment Info...")
|
| 468 |
+
results.append(self.test_environment_info())
|
| 469 |
+
|
| 470 |
+
progress(0.2, desc="Basic Multiprocessing...")
|
| 471 |
+
results.append(self.test_basic_multiprocessing())
|
| 472 |
+
|
| 473 |
+
progress(0.4, desc="Thread vs Process...")
|
| 474 |
+
results.append(self.test_thread_vs_process())
|
| 475 |
+
|
| 476 |
+
progress(0.6, desc="GPU Access...")
|
| 477 |
+
results.append(self.test_gpu_access())
|
| 478 |
+
|
| 479 |
+
progress(0.8, desc="Model Loading...")
|
| 480 |
+
results.append(self.test_model_loading())
|
| 481 |
+
|
| 482 |
+
progress(0.85, desc="Worker Creation...")
|
| 483 |
+
results.append(self.test_worker_creation())
|
| 484 |
+
|
| 485 |
+
progress(0.95, desc="TTS Model Performance...")
|
| 486 |
+
results.append(self.test_tts_model_performance())
|
| 487 |
+
|
| 488 |
+
progress(1.0, desc="Complete!")
|
| 489 |
+
|
| 490 |
+
results.append("π Diagnostic complete!")
|
| 491 |
+
results.append("")
|
| 492 |
+
results.append("ANALYSIS:")
|
| 493 |
+
results.append("- If basic multiprocessing is slow: Environment blocks parallelism")
|
| 494 |
+
results.append("- If threading faster than processing: Use ThreadPoolExecutor")
|
| 495 |
+
results.append("- If GPU parallel time >> sequential: GPU contention issue")
|
| 496 |
+
results.append("- If model loading overhead high: Need model sharing strategy")
|
| 497 |
+
results.append("- If same PID for all workers: Using threads, not processes")
|
| 498 |
+
results.append("- If TTS realtime factor < 0.5x: Severe performance bottleneck")
|
| 499 |
+
results.append("- If model reloading overhead high: Workers reloading models per chunk")
|
| 500 |
+
|
| 501 |
+
return "\n".join(results)
|
| 502 |
+
|
| 503 |
+
finally:
|
| 504 |
+
self.running = False
|
| 505 |
+
|
| 506 |
+
# Create global diagnostic runner
|
| 507 |
+
diagnostic_runner = DiagnosticRunner()
|
| 508 |
+
|
| 509 |
+
def create_diagnostics_tab():
|
| 510 |
+
"""Create the diagnostics tab interface"""
|
| 511 |
+
|
| 512 |
+
with gr.Column():
|
| 513 |
+
gr.Markdown("# π System Diagnostics")
|
| 514 |
+
gr.Markdown("*Test parallel processing capabilities and identify performance bottlenecks*")
|
| 515 |
+
|
| 516 |
+
with gr.Row():
|
| 517 |
+
run_diagnostics_btn = gr.Button("π Run Full Diagnostics", variant="primary", size="lg")
|
| 518 |
+
tts_diagnostics_btn = gr.Button("π€ TTS Performance Test", variant="secondary", size="lg")
|
| 519 |
+
|
| 520 |
+
with gr.Row():
|
| 521 |
+
diagnostic_output = gr.Textbox(
|
| 522 |
+
label="Diagnostic Results",
|
| 523 |
+
lines=30,
|
| 524 |
+
max_lines=50,
|
| 525 |
+
interactive=False,
|
| 526 |
+
show_copy_button=True
|
| 527 |
+
)
|
| 528 |
+
|
| 529 |
+
# Button click handlers
|
| 530 |
+
run_diagnostics_btn.click(
|
| 531 |
+
diagnostic_runner.run_all_diagnostics,
|
| 532 |
+
outputs=[diagnostic_output]
|
| 533 |
+
)
|
| 534 |
+
|
| 535 |
+
tts_diagnostics_btn.click(
|
| 536 |
+
diagnostic_runner.test_tts_model_performance,
|
| 537 |
+
outputs=[diagnostic_output]
|
| 538 |
+
)
|
| 539 |
+
|
| 540 |
+
# Instructions
|
| 541 |
+
with gr.Accordion("π How to Interpret Results", open=False):
|
| 542 |
+
gr.Markdown("""
|
| 543 |
+
**Key Metrics to Look For:**
|
| 544 |
+
|
| 545 |
+
1. **Basic Multiprocessing Speedup**: Should be > 2x with 4 workers
|
| 546 |
+
2. **ThreadPool vs ProcessPool**: Which is faster indicates best approach
|
| 547 |
+
3. **GPU Sequential vs Parallel**: Large difference indicates contention
|
| 548 |
+
4. **Model Loading Overhead**: High overhead means workers reload models
|
| 549 |
+
5. **Worker PIDs**: Same PID = threads, different PID = processes
|
| 550 |
+
|
| 551 |
+
**Common Issues:**
|
| 552 |
+
- **No speedup**: Environment blocks multiprocessing
|
| 553 |
+
- **GPU parallel slower**: GPU memory contention
|
| 554 |
+
- **High model loading overhead**: Need shared model architecture
|
| 555 |
+
- **Threading faster than processing**: Use ThreadPoolExecutor for TTS
|
| 556 |
+
""")
|
| 557 |
+
|
| 558 |
+
return {}
|
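Note: `create_diagnostics_tab()` only builds the components; a host app still has to mount it. A minimal sketch of that wiring, assuming the module path used by this repo's tab files and an otherwise empty `gr.Blocks` app (both assumptions, not part of this commit):

```python
# Sketch only: mounting the diagnostics tab in a Gradio app.
# The import path and tab label are assumptions.
import gradio as gr

from gradio_tabs.tab_diagnostics import create_diagnostics_tab  # assumed module path

with gr.Blocks(title="ChatterboxTTS") as demo:
    with gr.Tab("Diagnostics"):
        create_diagnostics_tab()

if __name__ == "__main__":
    demo.launch()
```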
hold/chatterbox (copy).tar.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:311c3484631f207ce85dbbd14b41b04453783279315c5e5a669a168710d8b934
size 7728541
modules/tts_engine.py
CHANGED
@@ -1,45 +1,6 @@
 """
-
-
-
-OVERVIEW:
-This is the heart of the ChatterboxTTS system, responsible for loading TTS models,
-processing audio chunks, and managing the complete text-to-speech pipeline.
-It handles voice embedding caching, memory optimization, and parallel processing
-for efficient audiobook generation.
-
-MAIN COMPONENTS:
-1. MODEL MANAGEMENT: Loading, caching, and optimizing ChatterboxTTS models
-2. VOICE PROCESSING: Voice sample analysis and embedding caching
-3. CHUNK PROCESSING: Individual text chunk → audio conversion
-4. MEMORY OPTIMIZATION: VRAM management and garbage collection
-5. PARALLEL PROCESSING: Multi-threaded chunk processing with producer-consumer pattern
-6. PERFORMANCE MONITORING: Real-time progress tracking and ETA calculations
-
-CRITICAL PERFORMANCE FEATURES:
-- Voice embedding caching (5-10% speed improvement)
-- GPU persistence mode for faster model loading
-- In-memory processing pipeline (eliminates temp files)
-- Producer-consumer threading for parallel processing
-- Automatic memory management and VRAM monitoring
-- Model reinitialization every 500 chunks for stability
-
-WORKFLOW:
-Text Chunks → Voice Embedding → TTS Processing → Audio Generation →
-Quality Validation → Silence Insertion → Final WAV Output
-
-TECHNICAL DETAILS:
-- Supports ChatterboxTTS models with custom voice cloning
-- Handles variable TTS parameters (temperature, CFG, exaggeration)
-- Implements VADER sentiment-driven parameter adjustment
-- Memory-safe processing with configurable VRAM thresholds
-- Automatic fallback for CUDA memory issues
-
-USAGE CONTEXTS:
-- Called by main processing scripts (GenTTS_Claude.py)
-- Used by JSON generation utilities
-- Integrated with chunk repair tools
-- Supports both GUI and CLI interfaces
+TTS Engine Module
+Handles ChatterboxTTS interface, model loading, and chunk processing coordination
 """
 
 import torch
@@ -48,16 +9,11 @@ import time
 import logging
 import shutil
 import sys
-import os
-import subprocess
-import psutil
 import numpy as np
 from datetime import timedelta
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 import torchaudio as ta
-import queue
-import threading
 
 from config.config import *
 from modules.text_processor import smart_punctuate, sentence_chunk_text, detect_content_boundaries
@@ -98,6 +54,16 @@ from modules.file_manager import (
 )
 from modules.progress_tracker import setup_logging, log_chunk_progress, log_run
 
+# Global shutdown flag
+shutdown_requested = False
+
+# Console colors
+RED = '\033[91m'
+GREEN = '\033[92m'
+YELLOW = '\033[93m'
+CYAN = '\033[96m'
+RESET = '\033[0m'
+
 # ============================================================================
 # MEMORY AND MODEL MANAGEMENT
 # ============================================================================
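The hunk above only declares `shutdown_requested = False`; nothing in this diff shows what sets it. A minimal sketch of one way a caller could flip it on Ctrl+C (the signal handler is an assumption, not part of the commit):

```python
# Sketch only: installs a SIGINT handler that flips the module-level flag the
# batch loop polls. The handler itself is an assumption; only shutdown_requested
# and the color constants come from the diff above.
import signal

import modules.tts_engine as tts_engine

def _request_shutdown(signum, frame):
    tts_engine.shutdown_requested = True  # polled before each batch is submitted
    print(f"{tts_engine.YELLOW}Shutdown requested - finishing in-flight work{tts_engine.RESET}")

signal.signal(signal.SIGINT, _request_shutdown)
```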
@@ -126,186 +92,11 @@ def monitor_vram_usage(operation_name=""):
 
         if allocated > VRAM_SAFETY_THRESHOLD:
             logging.warning(f"⚠️ High VRAM usage during {operation_name}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved")
-
+            optimize_memory_usage()
 
         return allocated, reserved
     return 0, 0
 
-# ============================================================================
-# PERFORMANCE OPTIMIZATION UTILITIES
-# ============================================================================
-
-def detect_deployment_environment():
-    """Detect deployment environment for optimization adaptation"""
-    if os.getenv("RUNPOD_POD_ID"):
-        return "runpod"
-    elif os.getenv("SPACE_ID"):  # Hugging Face Spaces
-        return "huggingface"
-    elif os.path.exists("/.dockerenv"):
-        return "container"
-    elif torch.cuda.is_available():
-        return "local_gpu"
-    else:
-        return "local_cpu"
-
-def get_available_memory():
-    """Get available system memory in MB"""
-    try:
-        memory = psutil.virtual_memory()
-        return memory.available // (1024 * 1024)
-    except:
-        return 8192  # Safe default of 8GB
-
-def has_nvidia_smi():
-    """Check if nvidia-smi is available"""
-    try:
-        subprocess.run(['nvidia-smi', '--version'], capture_output=True, check=True)
-        return True
-    except:
-        return False
-
-def enable_gpu_persistence_mode():
-    """Enable GPU persistence mode with proper fallbacks"""
-    if not ENABLE_GPU_PERSISTENCE_MODE:
-        return False
-
-    try:
-        if torch.cuda.is_available() and has_nvidia_smi():
-            for attempt in range(GPU_PERSISTENCE_RETRY_COUNT):
-                result = subprocess.run(['nvidia-smi', '-pm', '1'],
-                                        capture_output=True, text=True)
-                if result.returncode == 0:
-                    logging.info("✅ GPU persistence mode enabled")
-                    return True
-                elif "Insufficient permissions" in result.stderr:
-                    logging.warning("⚠️ GPU persistence mode failed (insufficient privileges)")
-                    break
-                time.sleep(0.5)  # Brief delay between attempts
-
-            logging.warning("🔄 Continuing with standard GPU power management")
-        else:
-            logging.info("ℹ️ GPU persistence mode not applicable (no NVIDIA GPU detected)")
-    except Exception as e:
-        logging.warning(f"⚠️ GPU persistence mode failed: {e}")
-
-    return False
-
-def setup_cuda_memory_pool():
-    """Configure CUDA memory pool for enhanced performance and reduced fragmentation"""
-    if not ENABLE_CUDA_MEMORY_POOL or not torch.cuda.is_available():
-        return False
-
-    try:
-        # Get current device and memory info
-        device = torch.cuda.current_device()
-        total_memory = torch.cuda.get_device_properties(device).total_memory
-        total_memory_gb = total_memory / (1024**3)
-
-        deployment_env = detect_deployment_environment()
-
-        # Adaptive pool sizing based on environment and available memory
-        if ENABLE_ADAPTIVE_MEMORY_POOL:
-            if deployment_env == "runpod":
-                pool_fraction = min(CUDA_MEMORY_POOL_FRACTION, 0.85)  # More conservative on RunPod
-            elif deployment_env == "huggingface":
-                pool_fraction = min(CUDA_MEMORY_POOL_FRACTION, 0.75)  # Very conservative on HF Spaces
-            elif total_memory_gb < 8:
-                pool_fraction = min(CUDA_MEMORY_POOL_FRACTION, 0.8)  # Conservative for <8GB GPUs
-            else:
-                pool_fraction = CUDA_MEMORY_POOL_FRACTION  # Use full config for high-memory GPUs
-        else:
-            pool_fraction = CUDA_MEMORY_POOL_FRACTION
-
-        # Calculate pool size
-        pool_size = int(total_memory * pool_fraction)
-        pool_size_gb = pool_size / (1024**3)
-
-        # Configure memory pool allocator settings
-        # Set memory pool to reduce fragmentation and improve allocation speed
-        if hasattr(torch.cuda, 'memory') and hasattr(torch.cuda.memory, 'set_per_process_memory_fraction'):
-            torch.cuda.memory.set_per_process_memory_fraction(pool_fraction, device)
-            logging.info(f"✅ CUDA memory pool configured: {pool_size_gb:.1f}GB ({pool_fraction*100:.0f}% of {total_memory_gb:.1f}GB)")
-
-        # Configure allocator settings for better memory management
-        if hasattr(torch.cuda, 'empty_cache'):
-            # Clear any existing allocations before setting up pool
-            torch.cuda.empty_cache()
-
-        # Enable memory pool optimizations if available in PyTorch version
-        try:
-            # Try to enable expandable segments for better memory utilization
-            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
-            logging.info("✅ CUDA expandable segments enabled")
-        except:
-            pass  # Not available in all PyTorch versions
-
-        # Warm up the memory pool with a small allocation
-        try:
-            warmup_tensor = torch.zeros(1024, 1024, device=device)
-            del warmup_tensor
-            torch.cuda.empty_cache()
-            logging.info("✅ CUDA memory pool warmed up")
-        except Exception as e:
-            logging.warning(f"⚠️ Memory pool warmup failed: {e}")
-
-        logging.info(f"🚀 CUDA memory pool setup complete - environment: {deployment_env}")
-        return True
-
-    except Exception as e:
-        logging.error(f"❌ CUDA memory pool setup failed: {e}")
-        return False
-
-def optimize_cuda_memory_usage():
-    """Advanced CUDA memory optimization for better performance"""
-    if not torch.cuda.is_available():
-        return
-
-    try:
-        # More aggressive cleanup for memory pool systems
-        torch.cuda.empty_cache()
-
-        # Synchronize to ensure all operations complete before cleanup
-        torch.cuda.synchronize()
-
-        # Additional memory pool optimization if available
-        if hasattr(torch.cuda, 'reset_peak_memory_stats'):
-            torch.cuda.reset_peak_memory_stats()
-
-    except Exception as e:
-        logging.warning(f"⚠️ CUDA memory optimization failed: {e}")
-
-# Global voice embedding cache
-_voice_embedding_cache = {}
-_cache_memory_usage = 0
-
-def get_voice_cache_key(voice_path, exaggeration):
-    """Generate cache key for voice embeddings"""
-    try:
-        # Use file path and modification time for cache invalidation
-        stat = os.stat(voice_path)
-        return f"{voice_path}:{stat.st_mtime}:{exaggeration}"
-    except:
-        return f"{voice_path}:{exaggeration}"
-
-def clear_voice_embedding_cache():
-    """Clear voice embedding cache to free memory"""
-    global _voice_embedding_cache, _cache_memory_usage
-    _voice_embedding_cache.clear()
-    _cache_memory_usage = 0
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    logging.info("🗑️ Voice embedding cache cleared")
-
-def estimate_cache_memory_mb(conds_object):
-    """Estimate memory usage of cached voice embeddings in MB"""
-    try:
-        if hasattr(conds_object, 't3') and hasattr(conds_object.t3, 'voice_embed'):
-            # Rough estimate based on typical voice embedding sizes
-            return 50  # Typical voice embedding ~50MB
-        return 30  # Conservative estimate
-    except:
-        return 30
-
 def get_optimal_workers():
     """Dynamic worker allocation based on VRAM usage"""
     if not USE_DYNAMIC_WORKERS:
@@ -401,299 +192,28 @@ def get_best_available_device():
     return "cpu"
 
 def load_optimized_model(device):
-    """Load TTS model with memory optimizations
+    """Load TTS model with memory optimizations"""
     from src.chatterbox.tts import ChatterboxTTS
-
-    # Validate device availability
-    original_device = device
-    try:
-        if device == "cuda":
-            # Test CUDA availability with a small operation
-            test_tensor = torch.tensor([1.0]).to("cuda")
-            del test_tensor
-            torch.cuda.empty_cache()
-            logging.info(f"✅ CUDA device validated successfully")
-        elif device == "mps" and torch.backends.mps.is_available():
-            # Test MPS availability
-            test_tensor = torch.tensor([1.0]).to("mps")
-            del test_tensor
-            logging.info(f"✅ MPS device validated successfully")
-    except Exception as e:
-        logging.warning(f"⚠️ Device {device} failed validation: {e}")
-        logging.info("🔄 Falling back to CPU")
-        device = "cpu"
 
     try:
-        #
+        # Try to load with FP16 if supported
+        model = ChatterboxTTS.from_pretrained(device=device, torch_dtype=torch.float16)
+        logging.info("✅ Loaded model in FP16 mode (halved VRAM usage)")
+    except:
+        # Fallback to default loading
         model = ChatterboxTTS.from_pretrained(device=device)
-        logging.info(
-
-        if original_device != device:
-            logging.info(f"📝 Note: Requested {original_device.upper()} but using {device.upper()} due to availability")
-
-    except Exception as e:
-        logging.error(f"❌ Failed to load model on {device}: {e}")
-        if device != "cpu":
-            logging.info("🔄 Final fallback to CPU...")
-            device = "cpu"
-            model = ChatterboxTTS.from_pretrained(device=device)
-            logging.info("✅ Model loaded on CPU as final fallback")
-        else:
-            raise RuntimeError(f"Failed to load model even on CPU: {e}")
+        logging.info("⚠️ Using FP32 mode (FP16 not supported)")
 
     # Only apply eval() and benchmark if the model has these attributes
     if hasattr(model, 'eval'):
         model.eval()
 
-    # Set CUDNN benchmark for performance (if available
-    if
+    # Set CUDNN benchmark for performance (if available)
+    if torch.backends.cudnn.is_available():
         torch.backends.cudnn.benchmark = True
-    logging.info("✅ CUDNN benchmark enabled for performance")
-
-    # Initialize CUDA memory pool if enabled and using CUDA
-    if device == "cuda" and ENABLE_CUDA_MEMORY_POOL:
-        memory_pool_success = setup_cuda_memory_pool()
-        if memory_pool_success:
-            logging.info("🚀 CUDA memory pool optimization enabled")
-        else:
-            logging.warning("⚠️ CUDA memory pool setup failed, continuing without optimization")
 
     return model
 
-# ============================================================================
-# PRODUCER-CONSUMER PIPELINE (PHASE 4)
-# ============================================================================
-
-def chunk_producer_thread(all_chunks, chunk_queue, start_index=0, max_queue_size=10):
-    """
-    Producer thread that pre-loads chunks into a queue for worker threads to consume.
-    This eliminates chunk loading overhead during TTS processing.
-
-    Args:
-        all_chunks: List of chunk data (dict format with text, boundary_type, etc)
-        chunk_queue: Queue to place prepared chunk data
-        start_index: Index to start producing from (for resume functionality)
-        max_queue_size: Maximum queue size to prevent memory overflow
-    """
-    try:
-        logging.info(f"🏭 Producer thread started - pre-loading chunks from index {start_index}")
-
-        for i, chunk_data in enumerate(all_chunks[start_index:], start=start_index):
-            # Check if we should stop (via sentinel or shutdown)
-            if shutdown_requested:
-                break
-
-            # Handle both dictionary and tuple formats for backward compatibility
-            if isinstance(chunk_data, dict):
-                chunk_text = chunk_data["text"]
-                boundary_type = chunk_data.get("boundary_type", "none")
-                chunk_tts_params = chunk_data.get("tts_params", None)
-            else:
-                # Handle old tuple format (text, is_para_end)
-                chunk_text = chunk_data[0] if len(chunk_data) > 0 else str(chunk_data)
-                is_old_para_end = chunk_data[1] if len(chunk_data) > 1 else False
-                boundary_type = "paragraph_end" if is_old_para_end else "none"
-                chunk_tts_params = None
-
-            # Create standardized chunk package for workers
-            chunk_package = {
-                'index': i,
-                'text': chunk_text,
-                'boundary_type': boundary_type,
-                'tts_params': chunk_tts_params
-            }
-
-            # Put chunk in queue (blocks if queue is full)
-            chunk_queue.put(chunk_package, timeout=30)
-
-            # Log progress every 50 chunks to avoid spam
-            if (i + 1) % 50 == 0:
-                logging.info(f"📦 Producer queued {i + 1} chunks")
-
-        logging.info(f"✅ Producer thread completed - {len(all_chunks) - start_index} chunks queued")
-
-    except Exception as e:
-        logging.error(f"❌ Producer thread failed: {e}")
-    finally:
-        # Signal completion by adding sentinel value
-        try:
-            chunk_queue.put(None, timeout=5)  # None = end of chunks signal
-        except queue.Full:
-            logging.warning("⚠️ Could not add completion signal - queue full")
-
-def process_chunks_with_pipeline(
-    all_chunks, batch_chunks, chunk_offset, text_chunks_dir, audio_chunks_dir,
-    voice_path, tts_params, start_time, total_chunks, punc_norm, book_name,
-    log_run_func, log_path, device, model, asr_model, asr_enabled, optimal_workers,
-    accumulated_audio_duration=0.0
-):
-    """
-    Enhanced chunk processing with producer-consumer pipeline for 5-10% performance improvement.
-
-    Args:
-        all_chunks: Complete list of all chunks (for context)
-        batch_chunks: Current batch of chunks to process
-        chunk_offset: Offset for global chunk indexing
-        ... (other parameters same as original ThreadPoolExecutor pattern)
-
-    Returns:
-        Tuple of (batch_results, total_audio_duration) where:
-        - batch_results: List of (index, wav_path) tuples for successful chunks
-        - total_audio_duration: Total audio duration for batch (for progress tracking)
-    """
-    try:
-        # Create thread-safe queue with size limit to prevent memory overflow
-        max_queue_size = min(optimal_workers * 3, 20)  # 3x workers or 20, whichever is smaller
-        chunk_queue = queue.Queue(maxsize=max_queue_size)
-
-        # Start producer thread to pre-load chunks
-        producer_thread = threading.Thread(
-            target=chunk_producer_thread,
-            args=(batch_chunks, chunk_queue, 0, max_queue_size),
-            daemon=True
-        )
-        producer_thread.start()
-
-        logging.info(f"🏭 Producer-consumer pipeline started with queue size {max_queue_size}")
-
-        # Consumer pattern: workers pull from queue instead of sequential loading
-        batch_results = []
-        futures = []
-
-        with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
-            # Process chunks as they become available and handle results in real-time
-            chunks_submitted = 0
-            completed_count = 0
-            total_audio_duration = accumulated_audio_duration
-
-            # Import audio processing functions
-            from modules.audio_processor import get_chunk_audio_duration
-            from modules.progress_tracker import log_chunk_progress
-
-            while True:
-                try:
-                    # Get next chunk from producer (blocks until available)
-                    chunk_package = chunk_queue.get(timeout=10)
-
-                    # Check for completion signal
-                    if chunk_package is None:
-                        break
-
-                    # Check for shutdown request
-                    if shutdown_requested:
-                        logging.info("🛑 Shutdown requested - stopping chunk submission")
-                        break
-
-                    # Extract chunk data from package
-                    global_chunk_index = chunk_offset + chunk_package['index']
-                    chunk_text = chunk_package['text']
-                    boundary_type = chunk_package['boundary_type']
-                    chunk_tts_params = chunk_package.get('tts_params') or tts_params
-
-                    # Build context for chunk (all chunk texts)
-                    all_chunk_texts = []
-                    for cd in all_chunks:
-                        if isinstance(cd, dict):
-                            all_chunk_texts.append(cd["text"])
-                        else:
-                            all_chunk_texts.append(cd[0] if len(cd) > 0 else str(cd))
-
-                    # Submit chunk to worker thread
-                    future = executor.submit(
-                        process_one_chunk,
-                        global_chunk_index, chunk_text, text_chunks_dir, audio_chunks_dir,
-                        voice_path, chunk_tts_params, start_time, total_chunks,
-                        punc_norm, book_name, log_run_func, log_path, device,
-                        model, asr_model, all_chunk_texts, boundary_type,
-                        asr_enabled
-                    )
-                    futures.append(future)
-
-                    chunks_submitted += 1
-                    chunk_queue.task_done()
-
-                    # Check for completed futures while submitting new ones
-                    completed_futures = []
-                    for fut in futures:
-                        if fut.done():
-                            completed_futures.append(fut)
-
-                    # Process completed futures
-                    for fut in completed_futures:
-                        try:
-                            idx, wav_path = fut.result()
-                            if wav_path and wav_path.exists():
-                                batch_results.append((idx, wav_path))
-
-                                # Update totals for final batch calculation
-                                chunk_duration = get_chunk_audio_duration(wav_path)
-                                total_audio_duration += chunk_duration
-                                completed_count += 1
-
-                            futures.remove(fut)  # Remove completed future
-
-                        except Exception as e:
-                            logging.error(f"❌ Future failed during real-time processing: {e}")
-                            futures.remove(fut)
-
-                except queue.Empty:
-                    # Timeout waiting for chunks - check if producer is done
-                    if not producer_thread.is_alive():
-                        break
-                    else:
-                        # Producer still working - check for completed futures while waiting
-                        completed_futures = [fut for fut in futures if fut.done()]
-                        for fut in completed_futures:
-                            try:
-                                idx, wav_path = fut.result()
-                                if wav_path and wav_path.exists():
-                                    batch_results.append((idx, wav_path))
-
-                                    chunk_duration = get_chunk_audio_duration(wav_path)
-                                    total_audio_duration += chunk_duration
-                                    completed_count += 1
-
-                                futures.remove(fut)
-
-                            except Exception as e:
-                                logging.error(f"❌ Future failed during timeout processing: {e}")
-                                futures.remove(fut)
-                        continue
-
-                except Exception as e:
-                    logging.error(f"❌ Error in consumer loop: {e}")
-                    break
-
-        # Process any remaining futures
-        if futures:
-            for fut in as_completed(futures):
-                try:
-                    idx, wav_path = fut.result()
-                    if wav_path and wav_path.exists():
-                        batch_results.append((idx, wav_path))
-
-                        # Update batch totals
-                        chunk_duration = get_chunk_audio_duration(wav_path)
-                        total_audio_duration += chunk_duration
-                        completed_count += 1
-
-                except Exception as e:
-                    logging.error(f"❌ Final future failed: {e}")
-
-        # Wait for producer thread to complete cleanly
-        if producer_thread.is_alive():
-            producer_thread.join(timeout=5)
-
-        # Calculate batch-specific audio duration for return
-        batch_audio_duration = total_audio_duration - accumulated_audio_duration
-        logging.info(f"🏭 Producer-consumer pipeline completed: {len(batch_results)} chunks processed")
-        return batch_results, batch_audio_duration
-
-    except Exception as e:
-        logging.error(f"❌ Producer-consumer pipeline failed: {e}")
-        logging.info("🔄 Falling back to sequential processing...")
-        return [], 0.0  # Return empty results to trigger fallback
-
 # ============================================================================
 # CHUNK PROCESSING
 # ============================================================================
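For reference, the two helpers in this hunk compose like this; a minimal usage sketch (the final print is illustrative, not from the commit):

```python
# Usage sketch for the hunk above: pick a device, then load the model.
# The FP16 attempt and FP32 fallback happen inside load_optimized_model().
from modules.tts_engine import get_best_available_device, load_optimized_model

device = get_best_available_device()  # "cuda", "mps", or "cpu"
model = load_optimized_model(device)
print(f"Model ready on {device}")  # illustrative
```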
@@ -710,11 +230,86 @@ def patch_alignment_layer(tfmr, alignment_layer_idx=12):
 
     target_layer.forward = MethodType(patched_forward, target_layer)
 
+def process_batch(
+    batch, text_chunks_dir, audio_chunks_dir,
+    voice_path, tts_params, start_time, total_chunks,
+    punc_norm, basename, log_run_func, log_path, device,
+    model, asr_model, all_chunks,
+    enable_asr=None
+):
+    """
+    Process a batch of chunks using the batch-enabled TTS model.
+    """
+    from pydub import AudioSegment
+    import io
+    import soundfile as sf
+
+    # 1. Prepare batch for TTS
+    texts = [chunk_data['text'] for chunk_data in batch]
+
+    # All params are the same, so we take them from the first chunk
+    shared_tts_params = batch[0].get("tts_params", tts_params)
+    supported_params = {"exaggeration", "cfg_weight", "temperature", "min_p", "top_p", "repetition_penalty"}
+    tts_args = {k: v for k, v in shared_tts_params.items() if k in supported_params}
+
+    # 2. Generate audio in a batch
+    try:
+        with torch.no_grad():
+            wavs = model.generate_batch(texts, **tts_args)
+    except Exception as e:
+        logging.error(f"❌ Batch TTS generation failed: {e}")
+        # Fallback to individual processing for this batch
+        results = []
+        for chunk_data in batch:
+            i = chunk_data['index']
+            chunk = chunk_data['text']
+            boundary_type = chunk_data.get("boundary_type", "none")
+            chunk_tts_params = chunk_data.get("tts_params", tts_params)
+            result = process_one_chunk(i, chunk, text_chunks_dir, audio_chunks_dir, voice_path, chunk_tts_params, start_time, total_chunks, punc_norm, basename, log_run_func, log_path, device, model, asr_model, boundary_type, enable_asr)
+            results.append(result)
+        return results
+
+
+    # 3. Process and save each audio file from the batch
+    batch_results = []
+    for i, wav_tensor in enumerate(wavs):
+        chunk_data = batch[i]
+        chunk_index = chunk_data['index']
+        boundary_type = chunk_data.get("boundary_type", "none")
+        chunk_id_str = f"{chunk_index+1:05}"
+
+        if wav_tensor.dim() == 1:
+            wav_tensor = wav_tensor.unsqueeze(0)
+
+        wav_np = wav_tensor.squeeze().cpu().numpy()
+        with io.BytesIO() as wav_buffer:
+            sf.write(wav_buffer, wav_np, model.sr, format='wav')
+            wav_buffer.seek(0)
+            audio_segment = AudioSegment.from_wav(wav_buffer)
+
+        # Apply trimming and contextual silence
+        from modules.audio_processor import process_audio_with_trimming_and_silence, trim_audio_endpoint
+        if boundary_type and boundary_type != "none":
+            final_audio = process_audio_with_trimming_and_silence(audio_segment, boundary_type)
+        elif ENABLE_AUDIO_TRIMMING:
+            final_audio = trim_audio_endpoint(audio_segment)
+        else:
+            final_audio = audio_segment
+
+        # Final save
+        final_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav"
+        final_audio.export(final_path, format="wav")
+        logging.info(f"✅ Saved final chunk from batch: {final_path.name}")
+
+        batch_results.append((chunk_index, final_path))
+
+    return batch_results
+
 def process_one_chunk(
     i, chunk, text_chunks_dir, audio_chunks_dir,
     voice_path, tts_params, start_time, total_chunks,
     punc_norm, basename, log_run_func, log_path, device,
-    model, asr_model,
+    model, asr_model, boundary_type="none",
     enable_asr=None
 ):
     """Enhanced chunk processing with quality control, contextual silence, and deep cleanup"""
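`process_batch` above leans on a `model.generate_batch(texts, **tts_args)` method that is not shown in this hunk. A minimal sketch of the text-side padding such a method typically needs before a single forward pass; `tokenizer` and `pad_id` are stand-ins, not the real ChatterboxTTS internals:

```python
# Sketch of the padding step a generate_batch(texts, ...) implementation needs:
# tokenize each string, then pad to one (B, T_max) tensor with pad_sequence.
# tokenizer/pad_id are illustrative assumptions.
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_text_batch(texts, tokenizer, pad_id=0):
    rows = [torch.tensor(tokenizer.encode(t), dtype=torch.long) for t in texts]
    lengths = torch.tensor([r.size(0) for r in rows])  # true lengths, for masking
    batch = pad_sequence(rows, batch_first=True, padding_value=pad_id)
    return batch, lengths
```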
@@ -938,32 +533,13 @@ def process_one_chunk(
 
     # Enhanced regular cleanup (every chunk)
     del wav
-
+    optimize_memory_usage()
 
     # Additional per-chunk cleanup for long runs
     if (i + 1) % 50 == 0:
         torch.cuda.empty_cache()
         gc.collect()
 
-    # Show ETA progress updates during actual processing (every 2 chunks)
-    if i % 2 == 0:
-        try:
-            from modules.audio_processor import get_chunk_audio_duration
-            from modules.progress_tracker import log_chunk_progress
-
-            # Calculate running total audio duration by checking existing chunks
-            total_audio_duration = 0.0
-            for j in range(i + 1):  # Include current chunk
-                check_path = audio_chunks_dir / f"chunk_{j+1:05}.wav"
-                if check_path.exists():
-                    total_audio_duration += get_chunk_audio_duration(check_path)
-
-            # Show ETA update with accumulated audio
-            log_chunk_progress(i, total_chunks, start_time, total_audio_duration)
-        except Exception as e:
-            # Don't let ETA calculation failures break chunk processing
-            pass
-
     return i, final_path
 
 # ============================================================================
@@ -1261,13 +837,6 @@ def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=F
     log_path = output_root / "chunk_validation.log"
     total_audio_duration = 0.0
 
-    # Initialize performance optimizations
-    deployment_env = detect_deployment_environment()
-    print(f"🌍 Deployment environment: {deployment_env}")
-
-    # Enable GPU persistence mode for better performance
-    gpu_persistence_enabled = enable_gpu_persistence_mode()
-
     # Batch processing
     print(f"🔁 Processing {total_chunks} chunks in batches of {BATCH_SIZE}")
@@ -1304,45 +873,51 @@ def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=F
                 print(f"❌ ASR model loading failed completely - disabling ASR for this batch")
                 asr_enabled = False
 
+        futures = []
+        batch_results = []
+
         # Dynamic worker allocation
         optimal_workers = get_optimal_workers()
         print(f"🔧 Using {optimal_workers} workers for batch {batch_start+1}-{batch_end}")
 
-
-
-            if
-
-
-
-
-                voice_path, tts_params, start_time, total_chunks, punc_norm, book_dir.name,
-                log_run, log_path, device, model, asr_model, asr_enabled, optimal_workers,
-                total_audio_duration  # Pass accumulated duration for proper ETA calculation
-            )
-
-            # Handle tuple return from pipeline
-            if isinstance(pipeline_results, tuple) and len(pipeline_results) == 2:
-                batch_results, batch_audio_duration = pipeline_results
-                total_audio_duration += batch_audio_duration
-            else:
-                # Fallback for old return format
-                batch_results = pipeline_results
-
-            if batch_results:
-                print(f"✅ Producer-consumer pipeline completed: {len(batch_results)} chunks")
-                # Pipeline already handled progress logging internally
-
-        except Exception as e:
-            logging.error(f"❌ Producer-consumer pipeline failed: {e}")
-            if not ENABLE_PIPELINE_FALLBACK:
-                raise
-            batch_results = []  # Clear failed results
-
-        # Fallback to original sequential processing if pipeline disabled or failed
-        if not batch_results:
-            print(f"🔄 Sequential processing fallback for batch {batch_start+1}-{batch_end}")
-            futures = []
-
+        use_vader = tts_params.get('use_vader', True)
+
+        if not use_vader:
+            # --- BATCH MODE ---
+            print(f"🚀 VADER disabled. Running in high-performance batch mode.")
+            tts_batch_size = config_params.get('tts_batch_size', 16)
+            chunk_batches = [batch_chunks[i:i + tts_batch_size] for i in range(0, len(batch_chunks), tts_batch_size)]
+
+            print(f"📊 Processing {len(batch_chunks)} chunks in {len(chunk_batches)} batches of size {tts_batch_size}.")
+
+            with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
+                for batch in chunk_batches:
+                    if shutdown_requested:
+                        break
+                    futures.append(executor.submit(
+                        process_batch,
+                        batch, text_chunks_dir, audio_chunks_dir,
+                        voice_path, tts_params, start_time, total_chunks,
+                        punc_norm, book_dir.name, log_run, log_path, device,
+                        model, asr_model, all_chunks, asr_enabled
+                    ))
+
+            # Wait for batches to complete
+            for fut in as_completed(futures):
+                try:
+                    # process_batch returns a list of (idx, wav_path) tuples
+                    results_list = fut.result()
+                    for idx, wav_path in results_list:
+                        if wav_path and wav_path.exists():
+                            chunk_duration = get_chunk_audio_duration(wav_path)
+                            total_audio_duration += chunk_duration
+                            batch_results.append((idx, wav_path))
+                            log_chunk_progress(len(batch_results), total_chunks, start_time, total_audio_duration)
+                except Exception as e:
+                    logging.error(f"Future failed in batch: {e}")
+        else:
+            # --- SINGLE/NUANCED MODE ---
+            print(f"🎨 VADER enabled. Running in nuanced, single-chunk mode.")
             with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
                 for i, chunk_data in enumerate(batch_chunks):
                     global_chunk_index = batch_start + i
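The list-slice in the batch-mode branch above partitions the current batch into fixed-size TTS batches; a quick worked example of the arithmetic:

```python
# Worked example of the slicing in the hunk above: 50 chunks with
# tts_batch_size = 16 produce batches of 16, 16, 16, and 2.
batch_chunks = list(range(50))
tts_batch_size = 16
chunk_batches = [batch_chunks[i:i + tts_batch_size]
                 for i in range(0, len(batch_chunks), tts_batch_size)]
print([len(b) for b in chunk_batches])  # [16, 16, 16, 2]
```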
@@ -1366,21 +941,14 @@ def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=F
                         boundary_type = "paragraph_end" if is_old_para_end else "none"
                         chunk_tts_params = tts_params  # Fallback for old format
 
-
-                    all_chunk_texts = []
-                    for cd in all_chunks:
-                        if isinstance(cd, dict):
-                            all_chunk_texts.append(cd["text"])
-                        else:
-                            # Handle old tuple format (text, is_para_end)
-                            all_chunk_texts.append(cd[0] if len(cd) > 0 else str(cd))
+
 
                     futures.append(executor.submit(
                         process_one_chunk,
                         global_chunk_index, chunk, text_chunks_dir, audio_chunks_dir,
                         voice_path, chunk_tts_params, start_time, total_chunks,
                         punc_norm, book_dir.name, log_run, log_path, device,
-                        model, asr_model,
+                        model, asr_model, boundary_type,
                         asr_enabled
                     ))
 
@@ -1397,7 +965,7 @@ def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=F
                         total_audio_duration += chunk_duration
                         batch_results.append((idx, wav_path))
 
-                        # Update progress every
+                        # Update progress every 10 chunks within batch
                         completed_count += 1
                         if completed_count % 2 == 0:
                             log_chunk_progress(batch_start + completed_count - 1, total_chunks, start_time, total_audio_duration)
@@ -1478,4 +1046,4 @@ def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=F
     log_run("\n".join(run_log_lines), output_root / "run.log")
     print(f"📝 Run log written to: {output_root / 'run.log'}")
 
-    return final_m4b_path, combined_wav_path, run_log_lines
+    return final_m4b_path, combined_wav_path, run_log_lines
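Since `process_book_folder` reads `tts_params.get('use_vader', True)`, the GUI only needs to put the checkbox value into the params dict before dispatch. A hedged sketch of that hand-off; the function and argument names are assumptions, only the `'use_vader'` key and its default of `True` come from the diff above:

```python
# Sketch only: how a GUI handler could thread the "Use VADER" checkbox into
# tts_params. build_tts_params and its arguments are assumptions.
def build_tts_params(use_vader: bool, exaggeration: float = 0.5,
                     cfg_weight: float = 0.5, temperature: float = 0.7) -> dict:
    return {
        "exaggeration": exaggeration,
        "cfg_weight": cfg_weight,
        "temperature": temperature,
        "use_vader": use_vader,  # False selects the high-performance batch path
    }

params = build_tts_params(use_vader=False)
```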
modules/tts_engine.py.20250811-120000.bak
ADDED
|
@@ -0,0 +1,710 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modules/tts_engine.py.20250811-120000.bak
ADDED

"""
TTS Engine Module
Handles ChatterboxTTS interface, model loading, and chunk processing coordination
"""

import torch
import gc
import time
import logging
import shutil
import sys
import numpy as np  # used by the in-memory ASR path in process_one_chunk
from datetime import timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import torchaudio as ta

from config.config import *
from modules.text_processor import smart_punctuate, sentence_chunk_text, detect_content_boundaries
from modules.audio_processor import (
    smart_audio_validation, apply_smart_fade, add_chunk_end_silence,
    add_contextual_silence, pause_for_chunk_review, get_chunk_audio_duration,
    has_mid_energy_drop, apply_smart_fade_memory, smart_audio_validation_memory
)
from modules.file_manager import (
    setup_book_directories, find_book_files, ensure_voice_sample_compatibility,
    combine_audio_chunks, get_audio_files_in_directory, convert_to_m4b, add_metadata_to_m4b
)
from modules.progress_tracker import setup_logging, log_chunk_progress, log_run

def find_chunks_json_file(book_name):
    """Find the corresponding chunks JSON file for a book"""
    from config.config import AUDIOBOOK_ROOT

    # Look in the TTS processing directory
    tts_chunks_dir = AUDIOBOOK_ROOT / book_name / "TTS" / "text_chunks"
    json_path = tts_chunks_dir / "chunks_info.json"

    if json_path.exists():
        return json_path

    # Also check old Text_Input location for backwards compatibility
    text_input_dir = Path("Text_Input")
    possible_names = [
        f"{book_name}_chunks.json",
        f"{book_name.lower()}_chunks.json",
        f"{book_name.replace(' ', '_')}_chunks.json"
    ]

    for name in possible_names:
        old_json_path = text_input_dir / name
        if old_json_path.exists():
            return old_json_path

    return None

# ============================================================================
# MEMORY AND MODEL MANAGEMENT
# ============================================================================

def monitor_gpu_activity(operation_name):
    """Lightweight GPU monitoring for high-speed processing"""
    # Disabled expensive pynvml queries to free up GPU cycles
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        # Skip GPU utilization queries during production runs
        return allocated, 0
    return 0, 0

def optimize_memory_usage():
    """Aggressive memory management for 8GB VRAM"""
    torch.cuda.empty_cache()
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.ipc_collect()

def monitor_vram_usage(operation_name=""):
    """Real-time VRAM monitoring"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3

        if allocated > VRAM_SAFETY_THRESHOLD:
            logging.warning(f"⚠️ High VRAM usage during {operation_name}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved")
            optimize_memory_usage()

        return allocated, reserved
    return 0, 0

def get_optimal_workers(user_max_workers=None):
    """Dynamic worker allocation based on device type and resources"""
    # Check for user override first
    if user_max_workers is not None:
        print(f"👤 Using user-defined workers: {user_max_workers}")
        return int(user_max_workers)

    if not USE_DYNAMIC_WORKERS:
        return MAX_WORKERS

    # CPU-based worker calculation
    if not torch.cuda.is_available():
        import psutil
        cpu_cores = psutil.cpu_count(logical=False)  # Physical cores
        available_memory = psutil.virtual_memory().available / 1024**3  # GB

        # Each TTS model instance needs ~2-3GB RAM
        # Conservative estimation: allow 1 worker per 4GB available RAM
        memory_limited_workers = max(1, int(available_memory / 4))

        # CPU-based calculation: use 50% of physical cores for intensive TTS work
        cpu_limited_workers = max(1, int(cpu_cores * 0.5))

        optimal_workers = min(memory_limited_workers, cpu_limited_workers, MAX_WORKERS)
        print(f"💻 CPU mode: {cpu_cores} cores, {available_memory:.1f}GB RAM -> {optimal_workers} workers")
        return optimal_workers

    # GPU-based worker calculation (existing logic)
    allocated_vram = torch.cuda.memory_allocated() / 1024**3

    if allocated_vram < 5.0:
        return min(TEST_MAX_WORKERS, MAX_WORKERS)
    elif allocated_vram < VRAM_SAFETY_THRESHOLD:
        return min(2, MAX_WORKERS)
    else:
        return 1

def load_optimized_model(device):
    """Load TTS model with memory optimizations and device detection"""
    from chatterbox.tts import ChatterboxTTS

    # Detect available device if not specified or if CUDA not available
    if device == "cuda" and not torch.cuda.is_available():
        print("⚠️ CUDA not available, falling back to CPU")
        device = "cpu"
    elif device == "auto":
        if torch.cuda.is_available():
            device = "cuda"
            print("✅ CUDA detected, using GPU")
        else:
            device = "cpu"
            print("💻 No GPU detected, using CPU")

    print(f"🔧 Loading ChatterboxTTS model on device: {device}")

    try:
        # Load model (ChatterboxTTS.from_pretrained doesn't support torch_dtype parameter)
        model = ChatterboxTTS.from_pretrained(device=device)
        logging.info(f"✅ Loaded ChatterboxTTS model on {device}")
    except Exception as e:
        print(f"❌ Failed to load model on {device}: {e}")
        if device == "cuda":
            print("🔄 Retrying with CPU...")
            try:
                model = ChatterboxTTS.from_pretrained(device="cpu")
                logging.info("✅ Loaded model on CPU (GPU failed)")
                device = "cpu"
            except Exception as e2:
                print(f"❌ Failed to load model on CPU: {e2}")
                raise e2
        else:
            raise e

    # Only apply eval() and benchmark if the model has these attributes
    if hasattr(model, 'eval'):
        model.eval()

    # Set CUDNN benchmark for performance (if available)
    if torch.backends.cudnn.is_available():
        torch.backends.cudnn.benchmark = True

    return model

# ============================================================================
# CHUNK PROCESSING
# ============================================================================

def patch_alignment_layer(tfmr, alignment_layer_idx=12):
    """Patch alignment layer to avoid recursion"""
    from types import MethodType
    target_layer = tfmr.layers[alignment_layer_idx].self_attn
    original_forward = target_layer.forward

    def patched_forward(self, *args, **kwargs):
        kwargs['output_attentions'] = True
        return original_forward(*args, **kwargs)

    target_layer.forward = MethodType(patched_forward, target_layer)

def process_one_chunk(
    i, chunk, text_chunks_dir, audio_chunks_dir,
    voice_path, tts_params, start_time, total_chunks,
    punc_norm, basename, log_run_func, log_path, device,
    model, asr_model, all_chunks, boundary_type="none"
):
    """Enhanced chunk processing with quality control, contextual silence, and deep cleanup"""
    import difflib
    from pydub import AudioSegment

    chunk_id_str = f"{i+1:05}"
    chunk_path = text_chunks_dir / f"chunk_{chunk_id_str}.txt"
    with open(chunk_path, 'w', encoding='utf-8') as cf:
        cf.write(chunk)

    chunk_audio_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav"

    # ========================================================================
    # ENHANCED PERIODIC DEEP CLEANUP
    # ========================================================================
    cleanup_interval = CLEANUP_INTERVAL

    # Skip cleanup on model reinitialization chunks to avoid conflicts
    if (i + 1) % cleanup_interval == 0 and (i + 1) % BATCH_SIZE != 0:
        print(f"\n🧹 {YELLOW}DEEP CLEANUP at chunk {i+1}/{total_chunks}...{RESET}")

        # Enhanced VRAM monitoring before cleanup
        allocated_before = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
        reserved_before = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0

        print(f"   Before: VRAM Allocated: {allocated_before:.1f}GB | Reserved: {reserved_before:.1f}GB")

        # Bulk temp file cleanup
        print("   🗑️ Cleaning bulk temporary files...")
        temp_patterns = ["*_try*.wav", "*_pre.wav", "*_fade*.wav", "*_debug*.wav", "*_temp*.wav", "*_backup*.wav"]
        total_temp_files = 0
        for pattern in temp_patterns:
            temp_files = list(audio_chunks_dir.glob(pattern))
            for temp_file in temp_files:
                temp_file.unlink(missing_ok=True)
            total_temp_files += len(temp_files)

        if total_temp_files > 0:
            print(f"   🗑️ Removed {total_temp_files} temporary audio files")

        # Aggressive CUDA context reset
        print("   🔄 Performing aggressive CUDA context reset...")
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

        # Force CUDA context reset
        if hasattr(torch.cuda, 'reset_peak_memory_stats'):
            torch.cuda.reset_peak_memory_stats()
        if hasattr(torch._C, '_cuda_clearCublasWorkspaces'):
            torch._C._cuda_clearCublasWorkspaces()

        # Force garbage collection multiple times
        for _ in range(3):
            gc.collect()

        # Clear model cache if it has one
        if hasattr(model, 'clear_cache'):
            model.clear_cache()
        elif hasattr(model, 'reset_states'):
            model.reset_states()

        # Brief pause to let GPU settle
        time.sleep(1.0)

        # Monitor after cleanup
        allocated_after = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
        reserved_after = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0

        print(f"   After: VRAM Allocated: {allocated_after:.1f}GB | Reserved: {reserved_after:.1f}GB")
        print(f"   Freed: {allocated_before - allocated_after:.1f}GB allocated, {reserved_before - reserved_after:.1f}GB reserved")
        print(f"🧹 {GREEN}Deep cleanup complete!{RESET}\n")

    best_sim, best_asr_text = -1, ""
    wav_path_active = None
    attempt_paths = []
    mid_drop_retries = 0
    max_mid_drop_retries = 2

    for attempt_num in range(1, 3):
        logging.info(f"🔄 Starting TTS for chunk {chunk_id_str}, attempt {attempt_num}")
        try:
            tts_args = {k: v for k, v in tts_params.items() if k not in ["max_workers", "enable_asr"]}

            # monitor_gpu_activity(f"Before TTS chunk_{chunk_id_str}")  # Disabled for speed
            with torch.no_grad():
                wav = model.generate(chunk, **tts_args).detach().cpu()
            # monitor_gpu_activity(f"After TTS chunk_{chunk_id_str}")  # Disabled for speed

            if wav.dim() == 1:
                wav = wav.unsqueeze(0)

            # Retry if mid-energy drop is enabled and detected (check in memory)
            if ENABLE_MID_DROP_CHECK and has_mid_energy_drop(wav, model.sr):
                mid_drop_retries += 1
                if mid_drop_retries >= max_mid_drop_retries:
                    logging.info(f"⚠️ Mid-drop retry limit reached for {chunk_id_str}. Accepting audio.")
                else:
                    logging.info(f"⚠️ Mid-chunk noise detected in {chunk_id_str}. Retrying...")
                    continue

            # Convert tensor to AudioSegment for in-memory processing
            import io
            import soundfile as sf
            from pydub import AudioSegment

            # Convert wav tensor to AudioSegment (in memory)
            wav_np = wav.squeeze().numpy()
            with io.BytesIO() as wav_buffer:
                sf.write(wav_buffer, wav_np, model.sr, format='wav')
                wav_buffer.seek(0)
                audio_segment = AudioSegment.from_wav(wav_buffer)

            # Smart fade removed - replaced by precise audio trimming
            # Audio health validation disabled for speed

            # Note: Audio trimming will handle end-of-speech cleanup more precisely

            # ASR validation (memory-based processing) - check user setting first
            enable_asr_user = tts_params.get('enable_asr', False)
            if (enable_asr_user or ENABLE_ASR) and asr_model is not None:
                from modules.audio_processor import asr_f1_score
                # monitor_gpu_activity(f"Before ASR chunk_{chunk_id_str}")  # Disabled for speed
                try:
                    # Process ASR completely in memory - no disk writes
                    # Convert AudioSegment to numpy array for ASR
                    samples = np.array(audio_segment.get_array_of_samples())
                    if audio_segment.channels == 2:
                        samples = samples.reshape((-1, 2)).mean(axis=1)

                    # Normalize to float32 for ASR model
                    audio_np = samples.astype(np.float32) / audio_segment.max_possible_amplitude

                    # Use ASR model directly on numpy array (if supported)
                    # Note: This depends on the ASR model's input capabilities
                    result = asr_model.transcribe(audio_np)

                    if not isinstance(result, dict) or "text" not in result:
                        raise ValueError(f"Invalid ASR result type: {type(result)}")

                    asr_text = result.get("text", "").strip()
                    sim_ratio = asr_f1_score(punc_norm(chunk), asr_text)

                except Exception as e:
                    print(f"❌ ASR failed for {chunk_id_str}: {e}")
                    log_run_func(f"ASR VALIDATION FAILED - Chunk {chunk_id_str}:\nExpected:\n{chunk}\nActual:\n<ASR Failure: {e}>\nSimilarity: -1.000\n" + "="*50, log_path)
                    sim_ratio = -1.0
                    continue

                logging.info(f"ASR similarity for chunk {chunk_id_str}: {sim_ratio:.3f}")
                if sim_ratio < 0.7:
                    continue

                # Track best valid match
                best_sim = sim_ratio
                best_asr_text = asr_text
                # monitor_gpu_activity(f"After ASR chunk_{chunk_id_str}")  # Disabled for speed

            # Success - we have processed audio in memory
            final_audio = audio_segment
            break

        except Exception as e:
            import traceback
            logging.error(f"Exception during TTS attempt {attempt_num} for chunk {chunk_id_str}: {e}")
            traceback.print_exc()
            continue

    if 'final_audio' not in locals():
        logging.info(f"❌ Chunk {chunk_id_str} failed all attempts.")
        return None, None

    # Apply trimming and contextual silence in memory before final save
    from modules.audio_processor import process_audio_with_trimming_and_silence

    if boundary_type and boundary_type != "none":
        final_audio = process_audio_with_trimming_and_silence(final_audio, boundary_type)
        print(f"🔇 Added {boundary_type} silence to chunk {i+1:05}")
    else:
        # Apply trimming even without boundary type if enabled
        if ENABLE_AUDIO_TRIMMING:
            from modules.audio_processor import trim_audio_endpoint
            final_audio = trim_audio_endpoint(final_audio)

    # Note: ENABLE_CHUNK_END_SILENCE is now handled by punctuation-specific silence
    # The new system provides more precise silence based on actual punctuation

    # Final save - only disk write in entire process
    final_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav"
    final_audio.export(final_path, format="wav")
    logging.info(f"✅ Saved final chunk: {final_path.name}")

    # No intermediate file cleanup needed - all processing done in memory

    # Log details - only log ASR failures
    asr_active = enable_asr_user or ENABLE_ASR
    if asr_active and best_sim < 0.8:
        log_run_func(f"ASR VALIDATION FAILED - Chunk {chunk_id_str}:\nExpected:\n{chunk}\nActual:\n{best_asr_text}\nSimilarity: {best_sim:.3f}\n" + "="*50, log_path)
    elif not asr_active:
        log_run_func(f"Chunk {chunk_id_str}: Original text: {chunk}", log_path)

    # Silence already added in memory above - no disk processing needed

    # Enhanced regular cleanup (every chunk)
    del wav
    optimize_memory_usage()

    # Additional per-chunk cleanup for long runs
    if (i + 1) % 50 == 0:
        torch.cuda.empty_cache()
        gc.collect()

    return i, final_path

# ============================================================================
# MAIN BOOK PROCESSING FUNCTION
# ============================================================================

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wrapper.chunk_loader import save_chunks

def generate_enriched_chunks(text_file, output_dir, user_tts_params=None):
    """Reads a text file, performs VADER sentiment analysis, and returns enriched chunks."""
    analyzer = SentimentIntensityAnalyzer()

    raw_text = text_file.read_text(encoding='utf-8')
    cleaned = smart_punctuate(raw_text)
    chunks = sentence_chunk_text(cleaned)

    # Use user-provided parameters as base, or fall back to config defaults
    if user_tts_params:
        base_exaggeration = user_tts_params.get('exaggeration', BASE_EXAGGERATION)
        base_cfg_weight = user_tts_params.get('cfg_weight', BASE_CFG_WEIGHT)
        base_temperature = user_tts_params.get('temperature', BASE_TEMPERATURE)
    else:
        base_exaggeration = BASE_EXAGGERATION
        base_cfg_weight = BASE_CFG_WEIGHT
        base_temperature = BASE_TEMPERATURE

    enriched = []
    chunk_texts = [chunk_text for chunk_text, _ in chunks]

    for i, (chunk_text, is_para_end) in enumerate(chunks):
        sentiment_scores = analyzer.polarity_scores(chunk_text)
        compound_score = sentiment_scores['compound']

        exaggeration = base_exaggeration + (compound_score * VADER_EXAGGERATION_SENSITIVITY)
        cfg_weight = base_cfg_weight + (compound_score * VADER_CFG_WEIGHT_SENSITIVITY)
        temperature = base_temperature + (compound_score * VADER_TEMPERATURE_SENSITIVITY)

        # Clamp values to defined min/max
        exaggeration = round(max(TTS_PARAM_MIN_EXAGGERATION, min(exaggeration, TTS_PARAM_MAX_EXAGGERATION)), 2)
        cfg_weight = round(max(TTS_PARAM_MIN_CFG_WEIGHT, min(cfg_weight, TTS_PARAM_MAX_CFG_WEIGHT)), 2)
        temperature = round(max(TTS_PARAM_MIN_TEMPERATURE, min(temperature, TTS_PARAM_MAX_TEMPERATURE)), 2)

        boundary_type = detect_content_boundaries(chunk_text, i, chunk_texts, is_para_end)

        enriched.append({
            "index": i,
            "text": chunk_text,
            "word_count": len(chunk_text.split()),
            "boundary_type": boundary_type if boundary_type else "none",
            "sentiment_compound": compound_score,
            "tts_params": {
                "exaggeration": exaggeration,
                "cfg_weight": cfg_weight,
                "temperature": temperature
            }
        })

    output_json_path = output_dir / "chunks_info.json"
    save_chunks(output_json_path, enriched)
    return enriched

def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=False):
    """Enhanced book processing with batch processing to prevent hangs"""
    print(f"🔍 DEBUG: Entering process_book_folder with book_dir='{book_dir}', voice_path='{voice_path}'")

    from chatterbox.tts import punc_norm
    print(f"🔍 DEBUG: Successfully imported punc_norm")

    # Setup directories
    print(f"🔍 DEBUG: Calling setup_book_directories...")
    output_root, tts_dir, text_chunks_dir, audio_chunks_dir = setup_book_directories(book_dir)
    print(f"🔍 DEBUG: Directory setup complete")

    # Clean previous processing files (but skip for resume operations)
    if skip_cleanup:
        print(f"🔄 RESUME MODE: Skipping cleanup to preserve existing chunks")
        print(f"🔄 Preserving: {text_chunks_dir}, {audio_chunks_dir}")
    else:
        print(f"🧹 FRESH PROCESSING: Cleaning previous processing files...")
        import glob

        # Clear text chunks
        for txt_file in text_chunks_dir.glob("*.txt"):
            txt_file.unlink(missing_ok=True)
        for json_file in text_chunks_dir.glob("*.json"):
            json_file.unlink(missing_ok=True)

        # Clear audio chunks
        for wav_file in audio_chunks_dir.glob("*.wav"):
            wav_file.unlink(missing_ok=True)

        # Clear logs
        for log_file in output_root.glob("*.log"):
            log_file.unlink(missing_ok=True)

        print(f"✅ Cleanup complete")

    # Find book files
    print(f"🔍 DEBUG: Calling find_book_files...")
    book_files = find_book_files(book_dir)
    text_files = [book_files['text']] if book_files['text'] else []
    cover_file = book_files['cover']
    nfo_file = book_files['nfo']
    print(f"🔍 DEBUG: Found text files: {text_files}")

    if not text_files:
        logging.info(f"[{book_dir.name}] ERROR: No .txt files found in the book folder.")
        return None, None, []

    setup_logging(output_root)

    # Generate enriched chunks with VADER analysis using user parameters
    all_chunks = generate_enriched_chunks(text_files[0], text_chunks_dir, tts_params)

    # Create run_log_lines
    print(f"🔍 DEBUG: Creating run_log_lines...")
    print(f"🔍 DEBUG: voice_path type: {type(voice_path)}, value: {voice_path}")

    # Extract voice name for logging
    voice_name_for_log = voice_path.stem if hasattr(voice_path, 'stem') else Path(voice_path).stem

    run_log_lines = [
        f"\n===== Processing: {book_dir.name} =====",
        f"Voice: {voice_name_for_log}",
        f"Started: {time.strftime('%Y-%m-%d %H:%M:%S')}",
        f"Text files processed: {len(text_files)}",
        f"Total chunks generated: {len(all_chunks)}"
    ]

    start_time = time.time()
    total_chunks = len(all_chunks)
    log_path = output_root / "chunk_validation.log"
    total_audio_duration = 0.0

    # Batch processing
    print(f"🚀 Processing {total_chunks} chunks in batches of {BATCH_SIZE}")

    all_results = []

    for batch_start in range(0, total_chunks, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, total_chunks)
        batch_chunks = all_chunks[batch_start:batch_end]

        print(f"\n🚀 Processing batch: chunks {batch_start+1}-{batch_end}")

        # Fresh model for each batch
        model = load_optimized_model(device)
        compatible_voice = ensure_voice_sample_compatibility(voice_path, output_dir=tts_dir)
        model.prepare_conditionals(compatible_voice)

        # Load ASR model once per batch if needed (check user settings first, then global config)
        asr_model = None
        enable_asr_user = tts_params.get('enable_asr', False)
        if enable_asr_user or ENABLE_ASR:
            import whisper
            print(f"🎤 Loading Whisper ASR model for batch... (user setting: {enable_asr_user})")
            # Use same device as TTS model, with fallback to CPU
            asr_device = device if torch.cuda.is_available() and device == "cuda" else "cpu"
            print(f"🎤 Loading ASR model on device: {asr_device}")
            asr_model = whisper.load_model("base", device=asr_device)

        futures = []
        batch_results = []

        # Dynamic worker allocation
        user_max_workers = tts_params.get('max_workers', None)
        optimal_workers = get_optimal_workers(user_max_workers)
        print(f"🔧 Using {optimal_workers} workers for batch {batch_start+1}-{batch_end}")

        with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
            for i, chunk_data in enumerate(batch_chunks):
                global_chunk_index = batch_start + i

                # Check for shutdown request
                # (shutdown_requested is expected to be set elsewhere, e.g. by a
                # signal handler; it is not defined anywhere in this file)
                if shutdown_requested:
                    print(f"\n⏹️ {YELLOW}Stopping submission of new chunks...{RESET}")
                    break

                # Handle both dictionary and tuple formats for chunk data
                if isinstance(chunk_data, dict):
                    chunk = chunk_data["text"]
                    boundary_type = chunk_data.get("boundary_type", "none")
                    # Use chunk-specific TTS params if available, otherwise fall back to global
                    chunk_tts_params = chunk_data.get("tts_params", tts_params)
                else:
                    # Handle old tuple format (text, is_para_end) - convert to boundary_type
                    chunk = chunk_data[0] if len(chunk_data) > 0 else str(chunk_data)
                    # Convert old is_paragraph_end to boundary_type
                    is_old_para_end = chunk_data[1] if len(chunk_data) > 1 else False
                    boundary_type = "paragraph_end" if is_old_para_end else "none"
                    chunk_tts_params = tts_params  # Fallback for old format

                # Handle both dictionary and tuple formats for backward compatibility
                all_chunk_texts = []
                for cd in all_chunks:
                    if isinstance(cd, dict):
                        all_chunk_texts.append(cd["text"])
                    else:
                        # Handle old tuple format (text, is_para_end)
                        all_chunk_texts.append(cd[0] if len(cd) > 0 else str(cd))

                futures.append(executor.submit(
                    process_one_chunk,
                    global_chunk_index, chunk, text_chunks_dir, audio_chunks_dir,
                    voice_path, chunk_tts_params, start_time, total_chunks,
                    punc_norm, book_dir.name, log_run, log_path, device,
                    model, asr_model, all_chunk_texts, boundary_type
                ))

            # Wait for batch to complete
            print(f"🔄 {CYAN}Waiting for batch {batch_start+1}-{batch_end} to complete...{RESET}")
            completed_count = 0

            for fut in as_completed(futures):
                try:
                    idx, wav_path = fut.result()
                    if wav_path and wav_path.exists():
                        # Measure actual audio duration for this chunk
                        chunk_duration = get_chunk_audio_duration(wav_path)
                        total_audio_duration += chunk_duration
                        batch_results.append((idx, wav_path))

                        # Update progress every 10 chunks within batch
                        completed_count += 1
                        if completed_count % 10 == 0:
                            log_chunk_progress(batch_start + completed_count - 1, total_chunks, start_time, total_audio_duration)

                except Exception as e:
                    logging.error(f"Future failed in batch: {e}")

        # Clean up model after batch
        print(f"🧹 Cleaning up after batch {batch_start+1}-{batch_end}")
        del model
        if asr_model:
            del asr_model
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(2)

        all_results.extend(batch_results)
        print(f"✅ Batch {batch_start+1}-{batch_end} completed ({len(batch_results)} chunks)")

    # Final processing
    quarantine_dir = audio_chunks_dir / "quarantine"
    pause_for_chunk_review(quarantine_dir)

    # Collect final chunk paths
    chunk_paths = get_audio_files_in_directory(audio_chunks_dir)

    if not chunk_paths:
        logging.info(f"{RED}❌ No valid audio chunks found. Skipping concatenation and conversion.{RESET}")
        return None, None, []

    # Calculate timing
    elapsed_total = time.time() - start_time
    elapsed_td = timedelta(seconds=int(elapsed_total))

    total_audio_duration_final = sum(get_chunk_audio_duration(chunk_path) for chunk_path in chunk_paths)
    audio_duration_td = timedelta(seconds=int(total_audio_duration_final))
    realtime_factor = total_audio_duration_final / elapsed_total if elapsed_total > 0 else 0.0

    print(f"\n⏱️ TTS Processing Complete:")
    print(f"   Elapsed Time: {CYAN}{str(elapsed_td)}{RESET}")
    print(f"   Audio Duration: {GREEN}{str(audio_duration_td)}{RESET}")
    print(f"   Realtime Factor: {YELLOW}{realtime_factor:.2f}x{RESET}")

    # Combine audio
    voice_name = voice_path.stem if hasattr(voice_path, 'stem') else Path(voice_path).stem
    combined_wav_path = output_root / f"{book_dir.name} [{voice_name}].wav"
    print("\n💾 Saving WAV file...")
    combine_audio_chunks(chunk_paths, combined_wav_path)

    # M4B conversion with normalization
    temp_m4b_path = output_root / "output.m4b"
    final_m4b_path = output_root / f"{book_dir.name}[{voice_name}].m4b"
    convert_to_m4b(combined_wav_path, temp_m4b_path)
    add_metadata_to_m4b(temp_m4b_path, final_m4b_path, cover_file, nfo_file)

    logging.info(f"Audiobook created: {final_m4b_path}")

    # Add final info to run log
    run_log_lines.extend([
        f"Combined WAV: {combined_wav_path}",
        "--- Generation Settings ---",
        f"Batch Processing: Enabled ({BATCH_SIZE} chunks per batch)",
        f"ASR Enabled: {enable_asr_user or ENABLE_ASR} (user: {enable_asr_user}, global: {ENABLE_ASR})",
        f"Hum Detection: {ENABLE_HUM_DETECTION}",
        f"Dynamic Workers: {USE_DYNAMIC_WORKERS}",
        f"Voice used: {voice_name}",
        f"Exaggeration: {tts_params['exaggeration']}",
        f"CFG weight: {tts_params['cfg_weight']}",
        f"Temperature: {tts_params['temperature']}",
        f"Processing Time: {str(elapsed_td)}",
        f"Audio Duration: {str(audio_duration_td)}",
        f"Realtime Factor: {realtime_factor:.2f}x",
        f"Total Chunks: {len(chunk_paths)}"
    ])

    # Write the run log
    log_run("\n".join(run_log_lines), output_root / "run.log")
    print(f"📝 Run log written to: {output_root / 'run.log'}")

    return final_m4b_path, combined_wav_path, run_log_lines
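Two short illustrations of the file above. First, the VADER mapping in generate_enriched_chunks, assuming a base exaggeration of 0.5 and a sensitivity of 0.3 (illustrative numbers, not necessarily the config defaults):

    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    score = SentimentIntensityAnalyzer().polarity_scores("What a wonderful day!")["compound"]
    exaggeration = round(max(0.0, min(0.5 + score * 0.3, 2.0)), 2)
    # a positive chunk pushes exaggeration up, a negative one pulls it down,
    # and the result is clamped to the configured min/max

Second, a sketch of how the entry point is driven; the paths and parameter values here are placeholders, not files from this repo:

    from pathlib import Path

    m4b_path, wav_path, log_lines = process_book_folder(
        book_dir=Path("Audiobooks/MyBook"),
        voice_path=Path("voices/narrator.wav"),
        tts_params={"exaggeration": 0.5, "cfg_weight": 0.5, "temperature": 0.8,
                    "enable_asr": False, "max_workers": 2},
        device="auto",
    )
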
src/chatterbox/models/t3/t3.py
CHANGED

@@ -224,6 +224,7 @@ class T3(nn.Module):
             do_sample=True,
             temperature=0.8,
             top_p=0.8,
+            min_p=0.05,
             length_penalty=1.0,
             repetition_penalty=2.0,
             cfg_weight=0,
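The new min_p knob enables min-p sampling: a token is kept only if its probability is at least min_p times the probability of the most likely token, so the cutoff adapts to how peaked the distribution is. A minimal sketch of that filter, shown here purely for illustration (the actual sampling happens inside the generation loop that T3.inference drives):

    import torch

    def min_p_filter(logits: torch.Tensor, min_p: float = 0.05) -> torch.Tensor:
        # Keep tokens whose probability is at least min_p * max probability.
        probs = torch.softmax(logits, dim=-1)
        threshold = min_p * probs.max(dim=-1, keepdim=True).values
        return logits.masked_fill(probs < threshold, float("-inf"))
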
src/chatterbox/tts.py
CHANGED

@@ -1,5 +1,7 @@
 from dataclasses import dataclass
 from pathlib import Path
+import os
+import logging
 
 import librosa
 import torch

@@ -189,6 +191,50 @@ class ChatterboxTTS:
         return cls.from_local(Path(local_path).parent, device)
 
     def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
+        """Prepare voice conditionals with optional caching for performance optimization"""
+
+        # Try to import voice caching functions (with fallback for compatibility)
+        try:
+            from modules.tts_engine import (
+                get_voice_cache_key,
+                _voice_embedding_cache,
+                _cache_memory_usage,
+                estimate_cache_memory_mb,
+                get_available_memory,
+                clear_voice_embedding_cache
+            )
+            from config.config import (
+                ENABLE_VOICE_EMBEDDING_CACHE,
+                VOICE_CACHE_MEMORY_LIMIT_MB,
+                ENABLE_ADAPTIVE_VOICE_CACHE
+            )
+            caching_available = True
+        except ImportError:
+            caching_available = False
+            logging.warning("Voice embedding caching not available - using standard processing")
+
+        # Check cache if caching is enabled and available
+        if caching_available and ENABLE_VOICE_EMBEDDING_CACHE:
+            cache_key = get_voice_cache_key(wav_fpath, exaggeration)
+
+            # Check if we have cached embeddings
+            if cache_key in _voice_embedding_cache:
+                try:
+                    self.conds = _voice_embedding_cache[cache_key]
+                    logging.info("🚀 Using cached voice embeddings - significant speedup!")
+                    return
+                except Exception as e:
+                    logging.warning(f"⚠️ Cache retrieval failed: {e}, computing fresh embeddings")
+
+            # Check memory constraints before caching
+            available_memory = get_available_memory()
+            if ENABLE_ADAPTIVE_VOICE_CACHE and available_memory < 2048:  # Less than 2GB available
+                logging.warning("🧠 Low memory detected - disabling voice embedding cache")
+                caching_available = False
+
+        # Original embedding computation (always runs for new voices or cache misses)
+        logging.info("🎤 Computing voice embeddings (this may take a moment)")
+
         ## Load reference wav
         s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)
 

@@ -214,6 +260,30 @@ class ChatterboxTTS:
         ).to(device=self.device)
         self.conds = Conditionals(t3_cond, s3gen_ref_dict)
 
+        # Cache the computed embeddings if caching is enabled
+        if caching_available and ENABLE_VOICE_EMBEDDING_CACHE:
+            try:
+                # Check memory usage before caching
+                global _cache_memory_usage
+                estimated_size = estimate_cache_memory_mb(self.conds)
+
+                if _cache_memory_usage + estimated_size <= VOICE_CACHE_MEMORY_LIMIT_MB:
+                    cache_key = get_voice_cache_key(wav_fpath, exaggeration)
+                    _voice_embedding_cache[cache_key] = self.conds
+                    _cache_memory_usage += estimated_size
+                    logging.info(f"💾 Voice embeddings cached ({estimated_size}MB, total: {_cache_memory_usage}MB)")
+                else:
+                    logging.warning("⚠️ Cache memory limit reached - clearing old cache")
+                    clear_voice_embedding_cache()
+                    # Try caching again after clearing
+                    cache_key = get_voice_cache_key(wav_fpath, exaggeration)
+                    _voice_embedding_cache[cache_key] = self.conds
+                    _cache_memory_usage = estimated_size
+                    logging.info(f"💾 Voice embeddings cached after cleanup ({estimated_size}MB)")
+
+            except Exception as e:
+                logging.warning(f"⚠️ Caching failed: {e}, continuing without cache")
+
     def generate(
         self,
         text,

@@ -278,4 +348,68 @@
         )
         wav = wav.squeeze(0).detach().cpu().numpy()
         watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
-        return torch.from_numpy(watermarked_wav).unsqueeze(0)
+        return torch.from_numpy(watermarked_wav).unsqueeze(0)
+
+    def generate_batch(
+        self,
+        texts: list[str],
+        audio_prompt_path=None,
+        exaggeration=0.5,
+        cfg_weight=0.5,
+        temperature=0.8,
+        min_p=0.05,
+        top_p=0.8,
+        repetition_penalty=2.0,
+    ):
+        if audio_prompt_path:
+            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
+        else:
+            assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"
+
+        if exaggeration != self.conds.t3.emotion_adv[0, 0, 0]:
+            _cond: T3Cond = self.conds.t3
+            self.conds.t3 = T3Cond(
+                speaker_emb=_cond.speaker_emb,
+                cond_prompt_speech_tokens=_cond.cond_prompt_speech_tokens,
+                emotion_adv=exaggeration * torch.ones(1, 1, 1),
+            ).to(device=self.device)
+
+        norm_texts = [punc_norm(text) for text in texts]
+        text_tokens = [self.tokenizer.text_to_tokens(text) for text in norm_texts]
+
+        max_len = max(t.shape[1] for t in text_tokens)
+        text_tokens_padded = torch.stack([F.pad(t, (0, max_len - t.shape[1]), value=self.t3.hp.stop_text_token) for t in text_tokens])
+        text_tokens_padded = text_tokens_padded.squeeze(1).to(self.device)
+
+        if cfg_weight > 0.0:
+            text_tokens_padded = torch.cat([text_tokens_padded, text_tokens_padded], dim=0)
+
+        sot = self.t3.hp.start_text_token
+        text_tokens_padded = F.pad(text_tokens_padded, (1, 0), value=sot)
+
+        with torch.inference_mode():
+            speech_tokens_batch = self.t3.inference(
+                t3_cond=self.conds.t3,
+                text_tokens=text_tokens_padded,
+                max_new_tokens=1000,
+                temperature=temperature,
+                cfg_weight=cfg_weight,
+                min_p=min_p,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+            )
+
+        wavs = []
+        for speech_tokens in speech_tokens_batch:
+            speech_tokens = drop_invalid_tokens(speech_tokens)
+            speech_tokens = speech_tokens[speech_tokens < 6561]
+            speech_tokens = speech_tokens.to(self.device)
+
+            wav, _ = self.s3gen.inference(
+                speech_tokens=speech_tokens,
+                ref_dict=self.conds.gen,
+            )
+            wav = wav.squeeze(0).detach().cpu().numpy()
+            watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
+            wavs.append(torch.from_numpy(watermarked_wav).unsqueeze(0))
+        return wavs
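Two notes on the changes above. The cache plumbing imported by prepare_conditionals (get_voice_cache_key, _voice_embedding_cache, and friends) lives in modules/tts_engine.py and is not shown in this diff; one caveat worth flagging is that `global _cache_memory_usage` rebinds this module's copy of the imported name, so the counter kept in modules.tts_engine itself never sees the updates. A minimal sketch of what such a key function could look like, assuming the key only needs to identify the voice file and the exaggeration setting (this is a guess at the shape, not the actual implementation):

    import hashlib
    from pathlib import Path

    def get_voice_cache_key(wav_fpath, exaggeration: float) -> str:
        # Tie the key to the file's identity and mtime so edits invalidate it.
        p = Path(wav_fpath)
        st = p.stat()
        raw = f"{p.resolve()}|{st.st_size}|{st.st_mtime_ns}|{exaggeration:.3f}"
        return hashlib.sha256(raw.encode()).hexdigest()

And a hypothetical caller for the new generate_batch, mirroring how the per-chunk path saves its output (the file names are placeholders):

    import torchaudio as ta

    # assumes model.prepare_conditionals(voice_path) was already called
    wavs = model.generate_batch(
        ["First chunk of text.", "Second chunk of text."],
        exaggeration=0.5, cfg_weight=0.5, temperature=0.8,
    )
    for i, wav in enumerate(wavs, start=1):
        ta.save(f"chunk_{i:05}.wav", wav, model.sr)
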
src/chatterbox/tts.py.20250811-120000.bak
ADDED
@@ -0,0 +1,281 @@

from dataclasses import dataclass
from pathlib import Path

import librosa
import torch
import perth
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

from .models.t3 import T3
from .models.s3tokenizer import S3_SR, drop_invalid_tokens
from .models.s3gen import S3GEN_SR, S3Gen
from .models.tokenizers import EnTokenizer
from .models.voice_encoder import VoiceEncoder
from .models.t3.modules.cond_enc import T3Cond


REPO_ID = "ResembleAI/chatterbox"


def punc_norm(text: str) -> str:
    """
    Quick cleanup func for punctuation from LLMs or
    containing chars not seen often in the dataset
    """
    if len(text) == 0:
        return "You need to add some text for me to talk."

    # Capitalise first letter
    if text[0].islower():
        text = text[0].upper() + text[1:]

    # Remove multiple space chars
    text = " ".join(text.split())

    # Replace uncommon/llm punc
    punc_to_replace = [
        ("...", ", "),
        ("…", ", "),
        (":", ","),
        (" - ", ", "),
        (";", ", "),
        ("—", "-"),
        ("–", "-"),
        (" ,", ","),
        ("“", '"'),
        ("”", '"'),
        ("‘", "'"),
        ("’", "'"),
    ]
    for old_char_sequence, new_char in punc_to_replace:
        text = text.replace(old_char_sequence, new_char)

    # Add full stop if no ending punc
    text = text.rstrip(" ")
    sentence_enders = {".", "!", "?", "-", ","}

    # Check for punctuation at end, including inside quotes
    has_ending_punct = False
    if any(text.endswith(p) for p in sentence_enders):
        has_ending_punct = True
    elif len(text) >= 2 and text[-1] in ['"', "'"] and text[-2] in sentence_enders:
        # Check for punctuation before closing quote: ?" or .'
        has_ending_punct = True

    if not has_ending_punct:
        text += "."

    return text


@dataclass
class Conditionals:
    """
    Conditionals for T3 and S3Gen
    - T3 conditionals:
        - speaker_emb
        - clap_emb
        - cond_prompt_speech_tokens
        - cond_prompt_speech_emb
        - emotion_adv
    - S3Gen conditionals:
        - prompt_token
        - prompt_token_len
        - prompt_feat
        - prompt_feat_len
        - embedding
    """
    t3: T3Cond
    gen: dict

    def to(self, device):
        self.t3 = self.t3.to(device=device)
        for k, v in self.gen.items():
            if torch.is_tensor(v):
                self.gen[k] = v.to(device=device)
        return self

    def save(self, fpath: Path):
        arg_dict = dict(
            t3=self.t3.__dict__,
            gen=self.gen
        )
        torch.save(arg_dict, fpath)

    @classmethod
    def load(cls, fpath, map_location="cpu"):
        if isinstance(map_location, str):
            map_location = torch.device(map_location)
        kwargs = torch.load(fpath, map_location=map_location, weights_only=True)
        return cls(T3Cond(**kwargs['t3']), kwargs['gen'])


class ChatterboxTTS:
    ENC_COND_LEN = 6 * S3_SR
    DEC_COND_LEN = 10 * S3GEN_SR

    def __init__(
        self,
        t3: T3,
        s3gen: S3Gen,
        ve: VoiceEncoder,
        tokenizer: EnTokenizer,
        device: str,
        conds: Conditionals = None,
    ):
        self.sr = S3GEN_SR  # sample rate of synthesized audio
        self.t3 = t3
        self.s3gen = s3gen
        self.ve = ve
        self.tokenizer = tokenizer
        self.device = device
        self.conds = conds
        self.watermarker = perth.PerthImplicitWatermarker()

    @classmethod
    def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
        ckpt_dir = Path(ckpt_dir)

        # Always load to CPU first for non-CUDA devices to handle CUDA-saved models
        if device in ["cpu", "mps"]:
            map_location = torch.device('cpu')
        else:
            map_location = None

        ve = VoiceEncoder()
        ve.load_state_dict(
            load_file(ckpt_dir / "ve.safetensors")
        )
        ve.to(device).eval()

        t3 = T3()
        t3_state = load_file(ckpt_dir / "t3_cfg.safetensors")
        if "model" in t3_state.keys():
            t3_state = t3_state["model"][0]
        t3.load_state_dict(t3_state)
        t3.to(device).eval()

        s3gen = S3Gen()
        s3gen.load_state_dict(
            load_file(ckpt_dir / "s3gen.safetensors"), strict=False
        )
        s3gen.to(device).eval()

        tokenizer = EnTokenizer(
            str(ckpt_dir / "tokenizer.json")
        )

        conds = None
        if (builtin_voice := ckpt_dir / "conds.pt").exists():
            conds = Conditionals.load(builtin_voice, map_location=map_location).to(device)

        return cls(t3, s3gen, ve, tokenizer, device, conds=conds)

    @classmethod
    def from_pretrained(cls, device) -> 'ChatterboxTTS':
        # Check if MPS is available on macOS
        if device == "mps" and not torch.backends.mps.is_available():
            if not torch.backends.mps.is_built():
                print("MPS not available because the current PyTorch install was not built with MPS enabled.")
            else:
                print("MPS not available because the current MacOS version is not 12.3+ and/or you do not have an MPS-enabled device on this machine.")
            device = "cpu"

        for fpath in ["ve.safetensors", "t3_cfg.safetensors", "s3gen.safetensors", "tokenizer.json", "conds.pt"]:
            local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)

        return cls.from_local(Path(local_path).parent, device)

    def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
        ## Load reference wav
        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)

        ref_16k_wav = librosa.resample(s3gen_ref_wav, orig_sr=S3GEN_SR, target_sr=S3_SR)

        s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
        s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)

        # Speech cond prompt tokens
        if plen := self.t3.hp.speech_cond_prompt_len:
            s3_tokzr = self.s3gen.tokenizer
            t3_cond_prompt_tokens, _ = s3_tokzr.forward([ref_16k_wav[:self.ENC_COND_LEN]], max_len=plen)
            t3_cond_prompt_tokens = torch.atleast_2d(t3_cond_prompt_tokens).to(self.device)

        # Voice-encoder speaker embedding
        ve_embed = torch.from_numpy(self.ve.embeds_from_wavs([ref_16k_wav], sample_rate=S3_SR))
        ve_embed = ve_embed.mean(axis=0, keepdim=True).to(self.device)

        t3_cond = T3Cond(
            speaker_emb=ve_embed,
            cond_prompt_speech_tokens=t3_cond_prompt_tokens,
            emotion_adv=exaggeration * torch.ones(1, 1, 1),
        ).to(device=self.device)
        self.conds = Conditionals(t3_cond, s3gen_ref_dict)

    def generate(
        self,
        text,
        audio_prompt_path=None,
        exaggeration=0.5,
        cfg_weight=0.5,
        temperature=0.8,
        min_p=0.05,
        top_p=0.8,
        repetition_penalty=2.0,
    ):
        if audio_prompt_path:
            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
        else:
            assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"

        # Update exaggeration if needed
        if exaggeration != self.conds.t3.emotion_adv[0, 0, 0]:
            _cond: T3Cond = self.conds.t3
            self.conds.t3 = T3Cond(
                speaker_emb=_cond.speaker_emb,
                cond_prompt_speech_tokens=_cond.cond_prompt_speech_tokens,
                emotion_adv=exaggeration * torch.ones(1, 1, 1),
            ).to(device=self.device)

        # Norm and tokenize text
        text = punc_norm(text)
        text_tokens = self.tokenizer.text_to_tokens(text).to(self.device)

        if cfg_weight > 0.0:
            text_tokens = torch.cat([text_tokens, text_tokens], dim=0)  # Need two seqs for CFG

        sot = self.t3.hp.start_text_token
        eot = self.t3.hp.stop_text_token
        text_tokens = F.pad(text_tokens, (1, 0), value=sot)
        text_tokens = F.pad(text_tokens, (0, 1), value=eot)

        with torch.inference_mode():
            speech_tokens = self.t3.inference(
                t3_cond=self.conds.t3,
                text_tokens=text_tokens,
                max_new_tokens=1000,  # TODO: use the value in config
                temperature=temperature,
                cfg_weight=cfg_weight,
                min_p=min_p,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
            )
            # Extract only the conditional batch.
            speech_tokens = speech_tokens[0]

            # TODO: output becomes 1D
            speech_tokens = drop_invalid_tokens(speech_tokens)

            speech_tokens = speech_tokens[speech_tokens < 6561]

            speech_tokens = speech_tokens.to(self.device)

            wav, _ = self.s3gen.inference(
                speech_tokens=speech_tokens,
                ref_dict=self.conds.gen,
            )
            wav = wav.squeeze(0).detach().cpu().numpy()
            watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
            return torch.from_numpy(watermarked_wav).unsqueeze(0)
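For reference, tracing the punc_norm rules in the backup above by hand gives results like these:

    punc_norm("wait...what")         # -> "Wait, what."
    punc_norm("he said 'stop!'")     # -> "He said 'stop!'" (punctuation inside the quote counts as an ender)
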
test_parallel_performance.py
ADDED
@@ -0,0 +1,235 @@
#!/usr/bin/env python3
"""
Parallel Processing Performance Diagnostic Tool
Test various theories about why the HuggingFace deployment is slow.
"""

import time
import threading
import multiprocessing
import concurrent.futures
import os
import sys
import psutil
import torch
from pathlib import Path

# Worker functions dispatched via multiprocessing must be defined at module
# level so they can be pickled; functions defined inside a test would raise
# "Can't pickle local object" when sent to a worker process.
def simple_task(n):
    return n * n

def cpu_task(n):
    # CPU-intensive task
    total = 0
    for i in range(n * 1000):
        total += i * i
    return total

def monitored_task(worker_id):
    pid = os.getpid()
    tid = threading.get_ident()
    return f"Worker {worker_id}: PID={pid}, TID={tid}"

def test_basic_multiprocessing():
    """Test 1: Basic multiprocessing capability"""
    print("=== TEST 1: Basic Multiprocessing ===")

    # Sequential
    start = time.time()
    results_seq = [simple_task(i) for i in range(100)]
    seq_time = time.time() - start
    print(f"Sequential: {seq_time:.3f}s")

    # Parallel
    start = time.time()
    with multiprocessing.Pool(processes=4) as pool:
        results_par = pool.map(simple_task, range(100))
    par_time = time.time() - start
    print(f"Parallel (4 workers): {par_time:.3f}s")
    print(f"Speedup: {seq_time/par_time:.2f}x")
    print()

def test_thread_vs_process():
    """Test 2: Threading vs Processing"""
    print("=== TEST 2: Threading vs Processing ===")

    tasks = [1000] * 8

    # Sequential
    start = time.time()
    seq_results = [cpu_task(t) for t in tasks]
    seq_time = time.time() - start
    print(f"Sequential: {seq_time:.3f}s")

    # Threading
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        thread_results = list(executor.map(cpu_task, tasks))
    thread_time = time.time() - start
    print(f"ThreadPool: {thread_time:.3f}s, speedup: {seq_time/thread_time:.2f}x")

    # Processing
    start = time.time()
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        process_results = list(executor.map(cpu_task, tasks))
    process_time = time.time() - start
    print(f"ProcessPool: {process_time:.3f}s, speedup: {seq_time/process_time:.2f}x")
    print()

def test_gpu_access():
    """Test 3: GPU sharing capability"""
    print("=== TEST 3: GPU Access ===")

    if not torch.cuda.is_available():
        print("No CUDA available - skipping GPU test")
        print()
        return

    def gpu_task(worker_id):
        try:
            device = torch.device("cuda")
            # Create a small tensor operation
            x = torch.randn(1000, 1000, device=device)
            y = torch.randn(1000, 1000, device=device)
            start = time.time()
            for _ in range(10):
                z = torch.mm(x, y)
            # CUDA kernels launch asynchronously; synchronize so the timing
            # reflects actual GPU work rather than just the launch overhead.
            torch.cuda.synchronize()
            duration = time.time() - start
            return f"Worker {worker_id}: {duration:.3f}s"
        except Exception as e:
            return f"Worker {worker_id}: ERROR - {e}"

    # Sequential GPU access
    start = time.time()
    seq_results = [gpu_task(i) for i in range(4)]
    seq_time = time.time() - start
    print("Sequential GPU:")
    for result in seq_results:
        print(f"  {result}")
    print(f"Total sequential time: {seq_time:.3f}s")

    # Parallel GPU access
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        par_results = list(executor.map(gpu_task, range(4)))
    par_time = time.time() - start
    print("Parallel GPU:")
    for result in par_results:
        print(f"  {result}")
    print(f"Total parallel time: {par_time:.3f}s")
    print()

def test_model_loading():
    """Test 4: Model loading overhead"""
    print("=== TEST 4: Model Loading Simulation ===")

    # Simulate loading a heavy model
    def load_model():
        # Simulate model loading time
        time.sleep(0.5)  # 500ms loading time
        return {"model": "loaded", "size": "large"}

    def task_with_model_loading(worker_id):
        start = time.time()
        model = load_model()  # Each worker loads the model
        processing_time = 0.1  # Simulate 100ms of processing
        time.sleep(processing_time)
        total_time = time.time() - start
        return f"Worker {worker_id}: {total_time:.3f}s"

    # Test with model loading per worker
    print("Each worker loads model:")
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(task_with_model_loading, range(4)))
    total_time = time.time() - start

    for result in results:
        print(f"  {result}")
    print(f"Total time with per-worker loading: {total_time:.3f}s")

    # Compare with a shared model (simulation)
    shared_load_time = 0.5  # Load once
    processing_time = 0.1 * 4  # Process 4 items sequentially
    simulated_shared_time = shared_load_time + processing_time
    print(f"Simulated shared model time: {simulated_shared_time:.3f}s")
    print(f"Overhead from per-worker loading: {total_time - simulated_shared_time:.3f}s")
    print()

def test_environment_info():
    """Test 5: Environment information"""
    print("=== TEST 5: Environment Info ===")

    print(f"Python version: {sys.version}")
    print(f"Platform: {sys.platform}")
    print(f"CPU cores: {multiprocessing.cpu_count()}")
    print(f"CPU usage: {psutil.cpu_percent()}%")
    print(f"Memory: {psutil.virtual_memory().percent}% used")

    if torch.cuda.is_available():
        print("CUDA available: Yes")
        print(f"CUDA devices: {torch.cuda.device_count()}")
        print(f"Current device: {torch.cuda.current_device()}")
        print(f"Device name: {torch.cuda.get_device_name()}")
        if hasattr(torch.cuda, 'memory_summary'):
            print("GPU Memory:")
            print(torch.cuda.memory_summary(abbreviated=True))
    else:
        print("CUDA available: No")

    # Check for environment variables that might affect multiprocessing
    mp_vars = [
        'OMP_NUM_THREADS', 'MKL_NUM_THREADS', 'OPENBLAS_NUM_THREADS',
        'VECLIB_MAXIMUM_THREADS', 'NUMEXPR_NUM_THREADS'
    ]
    print("Threading environment variables:")
    for var in mp_vars:
        value = os.environ.get(var, 'Not set')
        print(f"  {var}: {value}")

    print()

def test_worker_creation():
    """Test 6: Worker creation monitoring"""
    print("=== TEST 6: Worker Creation ===")

    print("Main process:")
    print(f"  PID: {os.getpid()}")
    print(f"  TID: {threading.get_ident()}")

    print("ThreadPoolExecutor workers:")
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(monitored_task, range(4)))
    for result in results:
        print(f"  {result}")

    print("ProcessPoolExecutor workers:")
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(monitored_task, range(4)))
    for result in results:
        print(f"  {result}")

    print()

def main():
    print("🔍 Parallel Processing Diagnostic Tool")
    print("=" * 50)
    print()

    test_environment_info()
    test_basic_multiprocessing()
    test_thread_vs_process()
    test_gpu_access()
    test_model_loading()
    test_worker_creation()

    print("🏁 Diagnostic complete!")
    print()
    print("ANALYSIS:")
    print("- If basic multiprocessing is slow: the environment blocks parallelism")
    print("- If threading is faster than processing: use ThreadPoolExecutor")
    print("- If GPU parallel time >> sequential: GPU contention issue")
    print("- If model loading overhead is high: a model-sharing strategy is needed")
    print("- If all workers report the same PID: they are threads, not processes")

if __name__ == "__main__":
    main()
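One subtlety worth keeping in mind when extending this tool: `multiprocessing.Pool` and `ProcessPoolExecutor` pickle the callable they dispatch, which is why worker functions like `simple_task`, `cpu_task`, and `monitored_task` have to live at module level. A minimal sketch of the failure mode (the exact exception type can vary across Python versions, so it is caught broadly):

import pickle

def module_level(n):
    return n * n  # picklable: referenced by its qualified name

def demo():
    def local(n):
        return n * n  # not picklable: defined inside demo()

    pickle.dumps(module_level)  # succeeds, so a process pool can dispatch it
    try:
        pickle.dumps(local)
    except Exception as e:  # CPython raises AttributeError: Can't pickle local object ...
        print(f"cannot be sent to a worker process: {e}")

if __name__ == "__main__":
    demo()

`ThreadPoolExecutor` is exempt from this constraint, since threads share the interpreter and nothing is pickled.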
utils/generate_from_json (copy).py
ADDED
@@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""
Direct Audio Generation from JSON Tool

This script allows for generating audiobook chunks directly from a pre-existing
`chunks_info.json` file. It is intended for debugging and testing purposes,
allowing a user to manually edit the TTS parameters in the JSON file and
hear the results without the VADER analysis step.
"""

import torch
from pathlib import Path
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from datetime import timedelta

# Add the project root to the path to allow module imports.
# This file lives in utils/, so the project root is one level up.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))

from config.config import *
from modules.tts_engine import load_optimized_model, process_one_chunk, prewarm_model_with_voice
from modules.file_manager import setup_book_directories, list_voice_samples, ensure_voice_sample_compatibility
from wrapper.chunk_loader import load_chunks
from chatterbox.tts import punc_norm
from modules.progress_tracker import log_chunk_progress, log_run

def main():
    """Main function to drive the generation process."""
    print(f"{BOLD}{CYAN}--- Direct Audio Generation from JSON Tool ---{RESET}")

    # 1. Get Book Name
    book_name = input("Enter the book name (e.g., 'london'): ").strip()
    if not book_name:
        print("❌ Book name cannot be empty.")
        return

    # 2. Locate and Load JSON
    book_audio_dir = AUDIOBOOK_ROOT / book_name
    json_path = book_audio_dir / "TTS" / "text_chunks" / "chunks_info.json"

    if not json_path.exists():
        print(f"❌ Error: JSON file not found at {json_path}")
        print("Please ensure you have run the 'Prepare text file' option for this book first.")
        return

    print(f"📁 Loading chunks from: {json_path}")
    all_chunks = load_chunks(str(json_path))
    print(f"✅ Found {len(all_chunks)} chunks.")

    # 3. Select Voice
    voice_files = list_voice_samples()
    if not voice_files:
        print(f"❌ No voice samples found in {VOICE_SAMPLES_DIR}")
        return

    print("\nAvailable voices:")
    for i, voice_file in enumerate(voice_files, 1):
        print(f"  [{i}] {voice_file.stem}")

    while True:
        try:
            choice = input("Select voice number: ").strip()
            idx = int(choice) - 1
            if 0 <= idx < len(voice_files):
                voice_path = voice_files[idx]
                break
            print("Invalid selection.")
        except (ValueError, IndexError):
            print("Invalid selection.")

    # Ensure voice compatibility
    voice_path = ensure_voice_sample_compatibility(voice_path)

    # 4. Setup Environment
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    print(f"\n🚀 Using device: {device}")
    print(f"🎤 Using voice: {Path(voice_path).name}")

    # 5. Load Model
    model = load_optimized_model(device)

    # 6. Pre-warm the model to eliminate first-chunk quality variations
    print(f"🔥 Pre-warming model with voice sample: {Path(voice_path).name}")
    compatible_voice = ensure_voice_sample_compatibility(voice_path)
    # Use default TTS params for pre-warming since we don't have user params here
    model = prewarm_model_with_voice(model, compatible_voice, None)

    # 7. Process Chunks
    output_root, tts_dir, text_chunks_dir, audio_chunks_dir = setup_book_directories(Path(TEXT_INPUT_ROOT) / book_name)

    # Clean existing audio chunks
    print("🧹 Clearing old audio chunks...")
    for wav_file in audio_chunks_dir.glob("*.wav"):
        wav_file.unlink()

    start_time = time.time()
    total_chunks = len(all_chunks)
    log_path = output_root / "debug_generation.log"

    print(f"\n🚀 Generating {total_chunks} chunks...")

    with ThreadPoolExecutor(max_workers=1) as executor:  # Force sequential processing
        futures = []
        for i, chunk_data in enumerate(all_chunks):
            # Extract exaggeration from the JSON; force the others to defaults
            chunk_tts_params = {
                "exaggeration": chunk_data.get("tts_params", {}).get("exaggeration", DEFAULT_EXAGGERATION),
                "cfg_weight": DEFAULT_CFG_WEIGHT,
                "temperature": DEFAULT_TEMPERATURE
            }

            future = executor.submit(
                process_one_chunk,
                i, chunk_data['text'], text_chunks_dir, audio_chunks_dir,
                voice_path, chunk_tts_params, start_time, total_chunks,
                punc_norm, book_name, log_run, log_path, device,
                model, None, chunk_data['is_paragraph_end'], all_chunks, chunk_data['boundary_type']
            )
            futures.append(future)

        for future in as_completed(futures):
            try:
                result = future.result()
                if result:
                    idx, _ = result
                    log_chunk_progress(idx, total_chunks, start_time, 0)
            except Exception as e:
                print(f"\n❌ An error occurred while processing a chunk: {e}")

    elapsed_time = time.time() - start_time
    print(f"\n{GREEN}✅ Generation Complete!{RESET}")
    print(f"⏱️ Total time: {timedelta(seconds=int(elapsed_time))}")
    print(f"📁 Audio chunks are in: {audio_chunks_dir}")
    print("You can now use Option 3 from the main menu to combine them.")

if __name__ == "__main__":
    main()
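For reference when hand-editing, the shape of one `chunks_info.json` entry can be inferred from the lookups this script performs (`chunk_data['text']`, `['is_paragraph_end']`, `['boundary_type']`, and `tts_params.exaggeration`). The sketch below is a guess from those accesses; the authoritative schema is whatever the text-preparation step writes and `wrapper/chunk_loader.py` returns:

# Hypothetical entry, inferred from the field accesses above.
# Only "exaggeration" is honored per-chunk by this script;
# cfg_weight and temperature are forced to the config defaults.
example_chunk = {
    "text": "Call me Ishmael.",
    "is_paragraph_end": False,
    "boundary_type": "sentence",  # placeholder value
    "tts_params": {"exaggeration": 0.5},
}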