Pocket TTS: Enforce resource management.
Browse files
app.py
CHANGED
|
@@ -164,25 +164,330 @@ import gc
|
|
| 164 |
import atexit
|
| 165 |
|
| 166 |
BACKGROUND_CLEANUP_INTERVAL = 300
|
| 167 |
-
VOICE_STATE_CACHE_MAXIMUM_SIZE =
|
| 168 |
-
VOICE_STATE_CACHE_CLEANUP_THRESHOLD =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
background_cleanup_thread = None
|
| 171 |
background_cleanup_stop_event = threading.Event()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
def perform_background_cleanup_cycle():
|
|
|
|
|
|
|
| 174 |
while not background_cleanup_stop_event.is_set():
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
cleanup_expired_temporary_files()
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
def start_background_cleanup_thread():
|
| 182 |
global background_cleanup_thread
|
| 183 |
|
| 184 |
if background_cleanup_thread is None or not background_cleanup_thread.is_alive():
|
| 185 |
background_cleanup_stop_event.clear()
|
|
|
|
| 186 |
|
| 187 |
background_cleanup_thread = threading.Thread(
|
| 188 |
target=perform_background_cleanup_cycle,
|
|
@@ -194,16 +499,11 @@ def start_background_cleanup_thread():
|
|
| 194 |
|
| 195 |
def stop_background_cleanup_thread():
|
| 196 |
background_cleanup_stop_event.set()
|
|
|
|
| 197 |
|
| 198 |
if background_cleanup_thread is not None and background_cleanup_thread.is_alive():
|
| 199 |
background_cleanup_thread.join(timeout=5)
|
| 200 |
|
| 201 |
-
def force_garbage_collection():
|
| 202 |
-
gc.collect()
|
| 203 |
-
|
| 204 |
-
if torch.cuda.is_available():
|
| 205 |
-
torch.cuda.empty_cache()
|
| 206 |
-
|
| 207 |
atexit.register(stop_background_cleanup_thread)
|
| 208 |
|
| 209 |
# =============================================================================
|
|
@@ -243,6 +543,8 @@ def convert_audio_to_pcm_wav(input_path):
|
|
| 243 |
with temporary_files_lock:
|
| 244 |
temporary_files_registry[output_file.name] = time.time()
|
| 245 |
|
|
|
|
|
|
|
| 246 |
return output_file.name
|
| 247 |
|
| 248 |
except Exception as conversion_error:
|
|
@@ -285,6 +587,23 @@ class TextToSpeechManager:
|
|
| 285 |
|
| 286 |
self.voice_state_cache_access_timestamps = {}
|
| 287 |
self.voice_state_cache_lock = threading.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
def load_or_get_model(
|
| 290 |
self,
|
|
@@ -311,6 +630,8 @@ class TextToSpeechManager:
|
|
| 311 |
Returns:
|
| 312 |
TTSModel: Loaded and configured TTS model instance
|
| 313 |
"""
|
|
|
|
|
|
|
| 314 |
# Process and validate input parameters with defaults
|
| 315 |
processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
|
| 316 |
processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
|
|
@@ -327,15 +648,20 @@ class TextToSpeechManager:
|
|
| 327 |
"eos_threshold": processed_eos_threshold
|
| 328 |
}
|
| 329 |
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
|
| 338 |
-
|
| 339 |
|
| 340 |
def clear_voice_state_cache_completely(self):
|
| 341 |
with self.voice_state_cache_lock:
|
|
@@ -355,6 +681,23 @@ class TextToSpeechManager:
|
|
| 355 |
with self.voice_state_cache_lock:
|
| 356 |
|
| 357 |
if len(self.voice_state_cache) <= VOICE_STATE_CACHE_CLEANUP_THRESHOLD:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
return
|
| 359 |
|
| 360 |
sorted_voice_names_by_access_time = sorted(
|
|
@@ -395,9 +738,16 @@ class TextToSpeechManager:
|
|
| 395 |
self.voice_state_cache_access_timestamps[validated_voice] = time.time()
|
| 396 |
return self.voice_state_cache[validated_voice]
|
| 397 |
|
|
|
|
|
|
|
|
|
|
| 398 |
if len(self.voice_state_cache) >= VOICE_STATE_CACHE_MAXIMUM_SIZE:
|
| 399 |
self.evict_least_recently_used_voice_states()
|
| 400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
# Compute and cache voice state if not already cached
|
| 402 |
if validated_voice not in self.voice_state_cache:
|
| 403 |
|
|
@@ -426,6 +776,9 @@ class TextToSpeechManager:
|
|
| 426 |
Returns:
|
| 427 |
Voice state tensor extracted from the audio file
|
| 428 |
"""
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
converted_audio_path = convert_audio_to_pcm_wav(audio_file_path)
|
| 431 |
|
|
@@ -447,15 +800,23 @@ class TextToSpeechManager:
|
|
| 447 |
Returns:
|
| 448 |
torch.Tensor: Generated audio waveform
|
| 449 |
"""
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
def save_audio_to_file(self, audio_tensor):
|
| 461 |
"""
|
|
@@ -470,9 +831,14 @@ class TextToSpeechManager:
|
|
| 470 |
Returns:
|
| 471 |
str: Path to the saved temporary WAV file
|
| 472 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
# Convert tensor to numpy array for scipy
|
| 474 |
audio_numpy_data = audio_tensor.numpy()
|
| 475 |
-
audio_sample_rate = self.loaded_model.sample_rate
|
| 476 |
|
| 477 |
# Create temporary file and write audio data
|
| 478 |
output_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
|
@@ -482,6 +848,8 @@ class TextToSpeechManager:
|
|
| 482 |
with temporary_files_lock:
|
| 483 |
temporary_files_registry[output_file.name] = time.time()
|
| 484 |
|
|
|
|
|
|
|
| 485 |
return output_file.name
|
| 486 |
|
| 487 |
|
|
@@ -625,7 +993,10 @@ def perform_speech_generation(
|
|
| 625 |
global is_currently_generating, stop_generation_requested
|
| 626 |
|
| 627 |
# Run cleanup before starting new generation
|
| 628 |
-
|
|
|
|
|
|
|
|
|
|
| 629 |
|
| 630 |
# Validate text input
|
| 631 |
is_valid, validation_result = validate_text_input(text_input)
|
|
@@ -701,6 +1072,9 @@ def perform_speech_generation(
|
|
| 701 |
except gr.Error:
|
| 702 |
raise
|
| 703 |
|
|
|
|
|
|
|
|
|
|
| 704 |
except Exception as generation_error:
|
| 705 |
raise gr.Error(f"Speech generation failed: {str(generation_error)}")
|
| 706 |
|
|
@@ -712,11 +1086,15 @@ def perform_speech_generation(
|
|
| 712 |
|
| 713 |
if generated_audio_tensor is not None:
|
| 714 |
del generated_audio_tensor
|
|
|
|
| 715 |
|
| 716 |
if cloned_voice_state_tensor is not None:
|
| 717 |
del cloned_voice_state_tensor
|
|
|
|
| 718 |
|
| 719 |
-
|
|
|
|
|
|
|
| 720 |
|
| 721 |
|
| 722 |
# =============================================================================
|
|
|
|
| 164 |
import atexit
|
| 165 |
|
| 166 |
# Longest time, in seconds, the cleanup worker sleeps between wake-ups.
BACKGROUND_CLEANUP_INTERVAL = 300

# Size limits compared against len(voice_state_cache) by the cache code.
VOICE_STATE_CACHE_MAXIMUM_SIZE = 8
VOICE_STATE_CACHE_CLEANUP_THRESHOLD = 4

# Resident-memory budget in bytes (1 GiB) and the thresholds derived from it.
MAXIMUM_MEMORY_USAGE = 1024 ** 3
MEMORY_WARNING_THRESHOLD = int(0.7 * MAXIMUM_MEMORY_USAGE)
MEMORY_CRITICAL_THRESHOLD = int(0.85 * MAXIMUM_MEMORY_USAGE)
MEMORY_IDLE_TARGET = int(0.5 * MAXIMUM_MEMORY_USAGE)

# Minimum time, in seconds, between two memory checks made by the worker.
MEMORY_CHECK_INTERVAL = 30

# Worker thread handle plus the events used to stop it and to wake it early.
background_cleanup_thread = None
background_cleanup_stop_event = threading.Event()
background_cleanup_trigger_event = threading.Event()

# Held by the memory enforcement / idle reduction routines while they run.
memory_enforcement_lock = threading.Lock()

# Manager instance consulted by the memory routines; None until it is assigned.
text_to_speech_manager = None
|
| 184 |
+
|
| 185 |
+
def get_current_memory_usage():
    """
    Return the resident memory of this process in bytes.

    Three sources are tried in order, each one best-effort:

    1. the ``VmRSS`` line of ``/proc/self/status``
    2. the resident page count in ``/proc/self/statm`` times the page size
    3. ``resource.getrusage`` as a last resort

    Returns:
        int: Resident memory in bytes, or 0 when no source could be read.
    """
    try:
        with open('/proc/self/status', 'r') as status_file:
            for line in status_file:
                if line.startswith('VmRSS:'):
                    # The value on this line is expressed in kB.
                    return int(line.split()[1]) * 1024

    except (OSError, ValueError, IndexError):
        # File missing/unreadable or line malformed: try the next source.
        pass

    try:
        with open('/proc/self/statm', 'r') as statm_file:
            resident_pages = int(statm_file.read().split()[1])

        return resident_pages * os.sysconf('SC_PAGE_SIZE')

    except (OSError, ValueError, IndexError, AttributeError):
        # AttributeError covers platforms where os.sysconf does not exist.
        pass

    try:
        import platform
        import resource

        # NOTE: ru_maxrss is the *peak* resident size, so this fallback can
        # over-report the current usage. Its unit is bytes on macOS and
        # kilobytes elsewhere, hence the platform switch below.
        peak_resident_size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

        if platform.system() == "Darwin":
            return peak_resident_size

        return peak_resident_size * 1024

    except (ImportError, OSError, ValueError, AttributeError):
        # The resource module is not available on every platform.
        pass

    return 0
|
| 222 |
+
|
| 223 |
+
def check_if_generation_is_currently_active():
    """Return the ``is_currently_generating`` flag, read under ``generation_state_lock``."""
    with generation_state_lock:
        generation_flag = is_currently_generating

    return generation_flag
|
| 226 |
+
|
| 227 |
+
def is_memory_usage_within_limit():
    """Return True while measured memory is strictly below ``MAXIMUM_MEMORY_USAGE``."""
    return get_current_memory_usage() < MAXIMUM_MEMORY_USAGE
|
| 230 |
+
|
| 231 |
+
def is_memory_usage_approaching_limit():
    """Return True once measured memory has reached ``MEMORY_WARNING_THRESHOLD``."""
    return get_current_memory_usage() >= MEMORY_WARNING_THRESHOLD
|
| 234 |
+
|
| 235 |
+
def is_memory_usage_critical():
    """Return True once measured memory has reached ``MEMORY_CRITICAL_THRESHOLD``."""
    return get_current_memory_usage() >= MEMORY_CRITICAL_THRESHOLD
|
| 238 |
+
|
| 239 |
+
def is_memory_above_idle_target():
    """Return True when measured memory is strictly above ``MEMORY_IDLE_TARGET``."""
    return get_current_memory_usage() > MEMORY_IDLE_TARGET
|
| 242 |
+
|
| 243 |
+
def force_garbage_collection():
    """
    Run a full garbage collection and, when CUDA is available, release
    PyTorch's cached CUDA memory.
    """
    # A single full collection already covers the younger generations, so
    # separate generation-0 and generation-1 passes beforehand add nothing.
    gc.collect()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
|
| 251 |
+
|
| 252 |
+
def memory_cleanup():
    """
    Collect garbage, ask the C library to trim its heap, then collect again.

    The trim step loads ``libc.so.6`` and calls ``malloc_trim(0)``; any
    failure there (for example the library not being present) is ignored.
    """
    force_garbage_collection()

    try:
        import ctypes

        ctypes.CDLL("libc.so.6").malloc_trim(0)

    except Exception:
        # Best-effort only: the trim is skipped when it cannot be performed.
        pass

    force_garbage_collection()
|
| 265 |
+
|
| 266 |
+
def perform_memory_cleanup():
    """
    Routine cleanup: collect garbage, evict least-recently-used voice states
    when a manager exists, then run ``memory_cleanup()``.
    """
    force_garbage_collection()

    active_manager = text_to_speech_manager

    if active_manager is not None:
        active_manager.evict_least_recently_used_voice_states()

    memory_cleanup()
|
| 275 |
+
|
| 276 |
+
def enforce_memory_limit_if_exceeded():
    """
    Escalate cleanup until measured memory is back under the configured limits.

    The whole routine runs under ``memory_enforcement_lock``. Memory is
    re-measured after every stage:

    1. garbage collection (target: below ``MEMORY_WARNING_THRESHOLD``)
    2. evict least-recently-used voice states and run ``memory_cleanup()``
       (target: below ``MEMORY_CRITICAL_THRESHOLD``)
    3. clear the voice state cache, delete all registered temporary files
       and run ``memory_cleanup()`` (target: below ``MAXIMUM_MEMORY_USAGE``)
    4. unload the model, but only when no generation is running

    Returns:
        bool: True as soon as a stage reaches its target, False when memory
        is still at or above ``MAXIMUM_MEMORY_USAGE`` at the end or when the
        model could not be unloaded because a generation is active.
    """
    with memory_enforcement_lock:
        if get_current_memory_usage() < MEMORY_WARNING_THRESHOLD:
            return True

        force_garbage_collection()

        if get_current_memory_usage() < MEMORY_WARNING_THRESHOLD:
            return True

        if text_to_speech_manager is not None:
            text_to_speech_manager.evict_least_recently_used_voice_states()

        memory_cleanup()

        if get_current_memory_usage() < MEMORY_CRITICAL_THRESHOLD:
            return True

        if text_to_speech_manager is not None:
            text_to_speech_manager.clear_voice_state_cache_completely()

        cleanup_all_temporary_files_immediately()
        memory_cleanup()

        if get_current_memory_usage() < MAXIMUM_MEMORY_USAGE:
            return True

        # Read the generation flag now rather than at function entry: the
        # stages above take time, and unloading the model under a generation
        # that started in the meantime would make that generation fail.
        if check_if_generation_is_currently_active():
            # Memory is known to be at or above the limit at this point.
            return False

        if text_to_speech_manager is not None:
            text_to_speech_manager.unload_model_completely()

        memory_cleanup()

        return get_current_memory_usage() < MAXIMUM_MEMORY_USAGE
|
| 322 |
+
|
| 323 |
+
def perform_idle_memory_reduction():
    """
    Shrink memory towards ``MEMORY_IDLE_TARGET`` while no generation runs.

    Under ``memory_enforcement_lock`` the routine escalates through garbage
    collection, voice state eviction, a full voice cache clear and finally a
    model unload. It returns as soon as measured memory is at or below the
    target, or as soon as a generation is found to be active.
    """
    global text_to_speech_manager

    if check_if_generation_is_currently_active():
        return

    with memory_enforcement_lock:
        if get_current_memory_usage() <= MEMORY_IDLE_TARGET:
            return

        force_garbage_collection()

        if get_current_memory_usage() <= MEMORY_IDLE_TARGET:
            return

        # The two cache stages share the same shape: re-check the generation
        # flag, run the manager method, clean up, then re-measure.
        cache_stages = (
            "evict_least_recently_used_voice_states",
            "clear_voice_state_cache_completely",
        )

        for stage_method_name in cache_stages:
            if check_if_generation_is_currently_active():
                return

            if text_to_speech_manager is not None:
                getattr(text_to_speech_manager, stage_method_name)()

            memory_cleanup()

            if get_current_memory_usage() <= MEMORY_IDLE_TARGET:
                return

        if check_if_generation_is_currently_active():
            return

        if text_to_speech_manager is not None:
            text_to_speech_manager.unload_model_completely()

        memory_cleanup()
|
| 372 |
+
|
| 373 |
+
def cleanup_all_temporary_files_immediately():
# Delete every file listed in temporary_files_registry, regardless of age, and
# drop its registry entry. The whole pass runs under temporary_files_lock.
# NOTE(review): indentation is lost in this rendering, so it cannot be told
# whether the `del` below sits inside the `os.path.exists` branch. If it does,
# entries whose file is already gone are never removed from the registry —
# confirm against the real file.
|
| 374 |
+
with temporary_files_lock:
|
| 375 |
+
# Iterate over a snapshot of the keys because entries are deleted in the loop.
for file_path in list(temporary_files_registry.keys()):
|
| 376 |
+
try:
|
| 377 |
+
if os.path.exists(file_path):
|
| 378 |
+
os.remove(file_path)
|
| 379 |
+
del temporary_files_registry[file_path]
|
| 380 |
+
|
| 381 |
+
# Best-effort: any failure for one path is ignored and the loop moves on.
except Exception:
|
| 382 |
+
pass
|
| 383 |
+
|
| 384 |
+
def has_temporary_files_pending_cleanup():
    """
    Report whether any registered temporary file has outlived
    ``TEMPORARY_FILE_LIFETIME_SECONDS``.

    Returns:
        bool: True when at least one registry entry is older than the
        lifetime, False otherwise (including when the registry is empty).
    """
    with temporary_files_lock:
        if not temporary_files_registry:
            return False

        now = time.time()

        return any(
            now - registered_at > TEMPORARY_FILE_LIFETIME_SECONDS
            for registered_at in temporary_files_registry.values()
        )
|
| 397 |
+
|
| 398 |
+
def has_any_temporary_files_registered():
    """Return True when the temporary file registry holds at least one entry."""
    with temporary_files_lock:
        registry_is_populated = bool(temporary_files_registry)

    return registry_is_populated
|
| 401 |
+
|
| 402 |
+
def calculate_time_until_next_file_expiration():
    """
    Compute how long until the next registered temporary file expires.

    Returns:
        None when the registry is empty, 0 when at least one file has
        already reached its lifetime, otherwise the smallest remaining
        lifetime in seconds.
    """
    with temporary_files_lock:
        if not temporary_files_registry:
            return None

        now = time.time()

        remaining_lifetimes = [
            TEMPORARY_FILE_LIFETIME_SECONDS - (now - registered_at)
            for registered_at in temporary_files_registry.values()
        ]

    soonest_expiration = min(remaining_lifetimes)

    # An already-expired entry is reported as "due now" rather than negative.
    return 0 if soonest_expiration <= 0 else soonest_expiration
|
| 421 |
|
| 422 |
def perform_background_cleanup_cycle():
# Body of the background cleanup worker: loops until
# background_cleanup_stop_event is set, deleting expired temporary files and
# running rate-limited memory checks.
# NOTE(review): this rendering has lost all indentation, so the nesting of the
# statements below cannot be verified here — in particular whether the final
# `last_memory_check_timestamp = current_timestamp` runs on every due check or
# only when no generation is active. Confirm against the real file.
|
| 423 |
+
last_memory_check_timestamp = 0
|
| 424 |
+
|
| 425 |
while not background_cleanup_stop_event.is_set():
|
| 426 |
+
# None means no temporary file is registered; 0 means one is already due.
time_until_next_expiration = calculate_time_until_next_file_expiration()
|
| 427 |
+
current_timestamp = time.time()
|
| 428 |
+
# NOTE(review): this value is recomputed after the sleep before it is read.
time_since_last_memory_check = current_timestamp - last_memory_check_timestamp
|
| 429 |
+
|
| 430 |
+
if time_until_next_expiration is not None:
|
| 431 |
+
if time_until_next_expiration <= 0:
|
| 432 |
+
wait_duration = 1
|
| 433 |
+
|
| 434 |
+
else:
|
| 435 |
+
# Sleep until just after the next expiry, capped by both intervals.
wait_duration = min(
|
| 436 |
+
time_until_next_expiration + 1,
|
| 437 |
+
MEMORY_CHECK_INTERVAL,
|
| 438 |
+
BACKGROUND_CLEANUP_INTERVAL
|
| 439 |
+
)
|
| 440 |
+
else:
|
| 441 |
+
if is_memory_above_idle_target() and not check_if_generation_is_currently_active():
|
| 442 |
+
wait_duration = MEMORY_CHECK_INTERVAL
|
| 443 |
+
|
| 444 |
+
else:
|
| 445 |
+
# NOTE(review): clearing immediately before wait() can discard a trigger that
# was set after the expiry lookup above; the worker would then sleep for the
# full BACKGROUND_CLEANUP_INTERVAL before noticing the new file. Confirm this
# delay is acceptable.
background_cleanup_trigger_event.clear()
|
| 446 |
+
triggered = background_cleanup_trigger_event.wait(timeout=BACKGROUND_CLEANUP_INTERVAL)
|
| 447 |
+
|
| 448 |
+
if background_cleanup_stop_event.is_set():
|
| 449 |
+
break
|
| 450 |
+
|
| 451 |
+
# A trigger restarts the loop so the wait is re-planned from the registry.
if triggered:
|
| 452 |
+
continue
|
| 453 |
+
|
| 454 |
+
else:
|
| 455 |
+
# Timed out without a trigger: use the quiet period to shrink memory.
if not check_if_generation_is_currently_active():
|
| 456 |
+
perform_idle_memory_reduction()
|
| 457 |
+
|
| 458 |
+
continue
|
| 459 |
| 460 |
+
# Sleep for the planned duration; returns early when the stop event is set.
background_cleanup_stop_event.wait(timeout=wait_duration)
|
| 461 |
+
|
| 462 |
+
if background_cleanup_stop_event.is_set():
|
| 463 |
+
break
|
| 464 |
+
|
| 465 |
+
if has_temporary_files_pending_cleanup():
|
| 466 |
cleanup_expired_temporary_files()
|
| 467 |
+
|
| 468 |
+
current_timestamp = time.time()
|
| 469 |
+
time_since_last_memory_check = current_timestamp - last_memory_check_timestamp
|
| 470 |
+
|
| 471 |
+
# Memory checks run at most once per MEMORY_CHECK_INTERVAL and are skipped
# while a generation is active.
if time_since_last_memory_check >= MEMORY_CHECK_INTERVAL:
|
| 472 |
+
if not check_if_generation_is_currently_active():
|
| 473 |
+
|
| 474 |
+
if is_memory_usage_critical():
|
| 475 |
+
enforce_memory_limit_if_exceeded()
|
| 476 |
+
|
| 477 |
+
elif is_memory_above_idle_target():
|
| 478 |
+
perform_idle_memory_reduction()
|
| 479 |
+
|
| 480 |
+
last_memory_check_timestamp = current_timestamp
|
| 481 |
+
|
| 482 |
+
def trigger_background_cleanup_check():
    """Set ``background_cleanup_trigger_event`` so a worker waiting on it wakes up."""
    wake_signal = background_cleanup_trigger_event
    wake_signal.set()
|
| 484 |
|
| 485 |
def start_background_cleanup_thread():
|
| 486 |
global background_cleanup_thread
|
| 487 |
|
| 488 |
if background_cleanup_thread is None or not background_cleanup_thread.is_alive():
|
| 489 |
background_cleanup_stop_event.clear()
|
| 490 |
+
background_cleanup_trigger_event.clear()
|
| 491 |
|
| 492 |
background_cleanup_thread = threading.Thread(
|
| 493 |
target=perform_background_cleanup_cycle,
|
|
|
|
| 499 |
|
| 500 |
def stop_background_cleanup_thread():
    """
    Ask the background cleanup worker to stop and wait briefly for it.

    Both events are set: the stop event ends the worker loop, and the trigger
    event releases the worker if it is blocked waiting for a trigger. The
    join is limited to five seconds.
    """
    for shutdown_signal in (background_cleanup_stop_event, background_cleanup_trigger_event):
        shutdown_signal.set()

    worker = background_cleanup_thread

    if worker is None or not worker.is_alive():
        return

    worker.join(timeout=5)
|
| 506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
atexit.register(stop_background_cleanup_thread)
|
| 508 |
|
| 509 |
# =============================================================================
|
|
|
|
| 543 |
with temporary_files_lock:
|
| 544 |
temporary_files_registry[output_file.name] = time.time()
|
| 545 |
|
| 546 |
+
trigger_background_cleanup_check()
|
| 547 |
+
|
| 548 |
return output_file.name
|
| 549 |
|
| 550 |
except Exception as conversion_error:
|
|
|
|
| 587 |
|
| 588 |
self.voice_state_cache_access_timestamps = {}
|
| 589 |
self.voice_state_cache_lock = threading.Lock()
|
| 590 |
+
self.model_lock = threading.Lock()
|
| 591 |
+
|
| 592 |
+
def is_model_loaded(self):
    """Return True when ``self.loaded_model`` is set, checked under ``self.model_lock``."""
    with self.model_lock:
        model_is_missing = self.loaded_model is None

    return not model_is_missing
|
| 595 |
+
|
| 596 |
+
def unload_model_completely(self):
# Drop the loaded model: clears the voice state cache, releases the model
# reference, resets current_configuration and runs memory_cleanup().
# NOTE(review): indentation is lost in this rendering, so it cannot be told
# whether the configuration reset is conditional on a model being loaded, or
# whether memory_cleanup() runs while model_lock is still held — confirm
# against the real file.
|
| 597 |
+
with self.model_lock:
|
| 598 |
+
self.clear_voice_state_cache_completely()
|
| 599 |
+
|
| 600 |
+
if self.loaded_model is not None:
|
| 601 |
+
# NOTE(review): the attribute does not exist between this `del` and the
# rebinding on the next statement; any reader that does not hold model_lock
# could hit AttributeError in that window. Rebinding to None alone would
# release the reference — confirm whether the `del` is needed.
del self.loaded_model
|
| 602 |
+
self.loaded_model = None
|
| 603 |
+
|
| 604 |
+
self.current_configuration = {}
|
| 605 |
+
|
| 606 |
+
memory_cleanup()
|
| 607 |
|
| 608 |
def load_or_get_model(
|
| 609 |
self,
|
|
|
|
| 630 |
Returns:
|
| 631 |
TTSModel: Loaded and configured TTS model instance
|
| 632 |
"""
|
| 633 |
+
perform_memory_cleanup()
|
| 634 |
+
|
| 635 |
# Process and validate input parameters with defaults
|
| 636 |
processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
|
| 637 |
processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
|
|
|
|
| 648 |
"eos_threshold": processed_eos_threshold
|
| 649 |
}
|
| 650 |
|
| 651 |
+
with self.model_lock:
|
| 652 |
+
# Load new model if configuration changed or no model loaded
|
| 653 |
+
if self.loaded_model is None or self.current_configuration != requested_configuration:
|
| 654 |
+
if self.loaded_model is not None:
|
| 655 |
+
self.clear_voice_state_cache_completely()
|
| 656 |
+
del self.loaded_model
|
| 657 |
+
self.loaded_model = None
|
| 658 |
+
memory_cleanup()
|
| 659 |
|
| 660 |
+
self.loaded_model = TTSModel.load_model(**requested_configuration)
|
| 661 |
+
self.current_configuration = requested_configuration
|
| 662 |
+
self.voice_state_cache = {} # Clear cache on model change
|
| 663 |
|
| 664 |
+
return self.loaded_model
|
| 665 |
|
| 666 |
def clear_voice_state_cache_completely(self):
|
| 667 |
with self.voice_state_cache_lock:
|
|
|
|
| 681 |
with self.voice_state_cache_lock:
|
| 682 |
|
| 683 |
if len(self.voice_state_cache) <= VOICE_STATE_CACHE_CLEANUP_THRESHOLD:
|
| 684 |
+
if len(self.voice_state_cache) > 0:
|
| 685 |
+
sorted_voice_names_by_access_time = sorted(
|
| 686 |
+
self.voice_state_cache_access_timestamps.keys(),
|
| 687 |
+
key=lambda voice_name: self.voice_state_cache_access_timestamps[voice_name]
|
| 688 |
+
)
|
| 689 |
+
|
| 690 |
+
number_of_entries_to_remove = max(1, len(self.voice_state_cache) // 2)
|
| 691 |
+
|
| 692 |
+
for index in range(min(number_of_entries_to_remove, len(sorted_voice_names_by_access_time))):
|
| 693 |
+
voice_name_to_remove = sorted_voice_names_by_access_time[index]
|
| 694 |
+
voice_state_tensor = self.voice_state_cache.pop(voice_name_to_remove, None)
|
| 695 |
+
self.voice_state_cache_access_timestamps.pop(voice_name_to_remove, None)
|
| 696 |
+
|
| 697 |
+
if voice_state_tensor is not None:
|
| 698 |
+
del voice_state_tensor
|
| 699 |
+
|
| 700 |
+
force_garbage_collection()
|
| 701 |
return
|
| 702 |
|
| 703 |
sorted_voice_names_by_access_time = sorted(
|
|
|
|
| 738 |
self.voice_state_cache_access_timestamps[validated_voice] = time.time()
|
| 739 |
return self.voice_state_cache[validated_voice]
|
| 740 |
|
| 741 |
+
if is_memory_usage_approaching_limit():
|
| 742 |
+
self.evict_least_recently_used_voice_states()
|
| 743 |
+
|
| 744 |
if len(self.voice_state_cache) >= VOICE_STATE_CACHE_MAXIMUM_SIZE:
|
| 745 |
self.evict_least_recently_used_voice_states()
|
| 746 |
|
| 747 |
+
with self.model_lock:
|
| 748 |
+
if self.loaded_model is None:
|
| 749 |
+
raise RuntimeError("TTS model is not loaded. Please try again.")
|
| 750 |
+
|
| 751 |
# Compute and cache voice state if not already cached
|
| 752 |
if validated_voice not in self.voice_state_cache:
|
| 753 |
|
|
|
|
| 776 |
Returns:
|
| 777 |
Voice state tensor extracted from the audio file
|
| 778 |
"""
|
| 779 |
+
with self.model_lock:
|
| 780 |
+
if self.loaded_model is None:
|
| 781 |
+
raise RuntimeError("TTS model is not loaded. Please try again.")
|
| 782 |
|
| 783 |
converted_audio_path = convert_audio_to_pcm_wav(audio_file_path)
|
| 784 |
|
|
|
|
| 800 |
Returns:
|
| 801 |
torch.Tensor: Generated audio waveform
|
| 802 |
"""
|
| 803 |
+
with self.model_lock:
|
| 804 |
+
if self.loaded_model is None:
|
| 805 |
+
raise RuntimeError("TTS model is not loaded. Please try again.")
|
| 806 |
+
|
| 807 |
+
# Apply custom frames setting if enabled
|
| 808 |
+
processed_frames = int(frames_after_eos) if enable_custom_frames else None
|
| 809 |
+
|
| 810 |
+
generated_audio = self.loaded_model.generate_audio(
|
| 811 |
+
model_state=voice_state,
|
| 812 |
+
text_to_generate=text_content,
|
| 813 |
+
frames_after_eos=processed_frames,
|
| 814 |
+
copy_state=True
|
| 815 |
+
)
|
| 816 |
+
|
| 817 |
+
force_garbage_collection()
|
| 818 |
+
|
| 819 |
+
return generated_audio
|
| 820 |
|
| 821 |
def save_audio_to_file(self, audio_tensor):
|
| 822 |
"""
|
|
|
|
| 831 |
Returns:
|
| 832 |
str: Path to the saved temporary WAV file
|
| 833 |
"""
|
| 834 |
+
with self.model_lock:
|
| 835 |
+
if self.loaded_model is None:
|
| 836 |
+
raise RuntimeError("TTS model is not loaded. Cannot save audio.")
|
| 837 |
+
|
| 838 |
+
audio_sample_rate = self.loaded_model.sample_rate
|
| 839 |
+
|
| 840 |
# Convert tensor to numpy array for scipy
|
| 841 |
audio_numpy_data = audio_tensor.numpy()
|
|
|
|
| 842 |
|
| 843 |
# Create temporary file and write audio data
|
| 844 |
output_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
|
|
|
| 848 |
with temporary_files_lock:
|
| 849 |
temporary_files_registry[output_file.name] = time.time()
|
| 850 |
|
| 851 |
+
trigger_background_cleanup_check()
|
| 852 |
+
|
| 853 |
return output_file.name
|
| 854 |
|
| 855 |
|
|
|
|
| 993 |
global is_currently_generating, stop_generation_requested
|
| 994 |
|
| 995 |
# Run cleanup before starting new generation
|
| 996 |
+
if has_temporary_files_pending_cleanup():
|
| 997 |
+
cleanup_expired_temporary_files()
|
| 998 |
+
|
| 999 |
+
perform_memory_cleanup()
|
| 1000 |
|
| 1001 |
# Validate text input
|
| 1002 |
is_valid, validation_result = validate_text_input(text_input)
|
|
|
|
| 1072 |
except gr.Error:
|
| 1073 |
raise
|
| 1074 |
|
| 1075 |
+
except RuntimeError as runtime_error:
|
| 1076 |
+
raise gr.Error(str(runtime_error))
|
| 1077 |
+
|
| 1078 |
except Exception as generation_error:
|
| 1079 |
raise gr.Error(f"Speech generation failed: {str(generation_error)}")
|
| 1080 |
|
|
|
|
| 1086 |
|
| 1087 |
if generated_audio_tensor is not None:
|
| 1088 |
del generated_audio_tensor
|
| 1089 |
+
generated_audio_tensor = None
|
| 1090 |
|
| 1091 |
if cloned_voice_state_tensor is not None:
|
| 1092 |
del cloned_voice_state_tensor
|
| 1093 |
+
cloned_voice_state_tensor = None
|
| 1094 |
|
| 1095 |
+
memory_cleanup()
|
| 1096 |
+
|
| 1097 |
+
trigger_background_cleanup_check()
|
| 1098 |
|
| 1099 |
|
| 1100 |
# =============================================================================
|