Spaces:
Sleeping
Sleeping
File size: 8,119 Bytes
5da0109 dae9fa5 5da0109 dae9fa5 5da0109 dae9fa5 5da0109 dae9fa5 5da0109 dae9fa5 5da0109 dae9fa5 5da0109 dae9fa5 5da0109 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
import gradio as gr
from config import VOICE_MODE_CLONE
from ..core.state import (
generation_state_lock,
get_stop_generation_requested,
set_stop_generation_requested
)
from ..core.authentication import get_huggingface_token
from ..core.memory import (
has_temporary_files_pending_cleanup,
cleanup_expired_temporary_files,
perform_memory_cleanup,
memory_cleanup,
trigger_background_cleanup_check
)
from ..tts.manager import text_to_speech_manager
from ..validation.text import validate_text_input
from ..audio.validator import (
perform_comprehensive_audio_validation,
get_format_display_name
)
from ..audio.converter import prepare_audio_file_for_voice_cloning
def check_if_generating():
    """Return whether a speech generation is currently in progress.

    Returns:
        The current value of ``is_currently_generating`` from the shared
        state module, read while holding ``generation_state_lock``.
    """
    from ..core import state as global_state

    # Read the attribute off the module *inside* the lock. A
    # ``from ..core.state import is_currently_generating`` statement copies
    # the value at the moment the import runs, i.e. before the lock is taken,
    # which would leave the lock guarding nothing.
    with generation_state_lock:
        return global_state.is_currently_generating
def request_generation_stop():
    """Flag the running generation for cancellation.

    Sets the shared stop-request flag via ``set_stop_generation_requested``
    and hands back a Gradio update that makes the output component wired to
    this handler non-interactive.

    Returns:
        The result of ``gr.update(interactive=False)``.
    """
    set_stop_generation_requested(True)
    non_interactive_update = gr.update(interactive=False)
    return non_interactive_update
def validate_and_prepare_voice_clone_audio(voice_clone_audio_file):
    """Validate an uploaded voice sample and prepare it for voice cloning.

    Args:
        voice_clone_audio_file: The uploaded audio file as received from the
            caller (presumably a file path -- confirm against the UI wiring).
            Any falsy value is treated as "nothing uploaded".

    Returns:
        A 4-tuple ``(prepared_path, error_message, was_converted, audio_format)``:

        * success, WAV input: ``(prepared_path, None, False, 'wav')``
        * success, other input: ``(prepared_path, None, True, detected_format)``
        * failure: ``(None, error_message, None, audio_format)`` where
          ``audio_format`` is ``None`` when no file was supplied, ``'wav'``
          when a WAV file failed preparation, and the detected format
          otherwise.
    """
    if not voice_clone_audio_file:
        return None, "Please upload an audio file for voice cloning.", None, None

    is_valid, is_wav_format, detected_format, validation_error = (
        perform_comprehensive_audio_validation(voice_clone_audio_file)
    )

    if not is_valid:
        format_display_name = (
            get_format_display_name(detected_format) if detected_format else "Unknown"
        )
        if not validation_error:
            return (
                None,
                "The uploaded file could not be validated as a valid audio file.",
                None,
                detected_format,
            )

        # The validator reports problems as free text; map known keywords to
        # friendlier messages. The order of the checks is significant: the
        # first matching keyword wins.
        lowered_error = validation_error.lower()
        if "too short" in lowered_error:
            message = "The uploaded audio file is too short. Please upload a longer audio sample for better voice cloning results."
        elif "too long" in lowered_error:
            message = "The uploaded audio file is too long. Please upload a shorter audio sample (maximum 1 hour)."
        elif "empty" in lowered_error or "0 bytes" in lowered_error:
            message = "The uploaded audio file is empty. Please upload a valid audio file."
        elif "corrupted" in lowered_error or "truncated" in lowered_error:
            message = f"The uploaded {format_display_name} file appears to be corrupted or incomplete. Please upload a valid audio file."
        elif "unsupported" in lowered_error:
            # The validator's own wording is passed through unchanged.
            message = validation_error
        else:
            message = f"Invalid audio file: {validation_error}"
        return None, message, None, detected_format

    # The same preparation call serves WAV and non-WAV input; only the
    # reported outcome differs. The converter's own "was converted" and
    # "final format" values are not used here.
    prepared_path, preparation_error, _, _ = prepare_audio_file_for_voice_cloning(
        voice_clone_audio_file
    )

    if is_wav_format:
        if prepared_path is None:
            return None, f"Failed to process WAV file: {preparation_error}", None, 'wav'
        return prepared_path, None, False, 'wav'

    if prepared_path is None:
        format_display_name = get_format_display_name(detected_format)
        # Guard against a failure reported without a message: calling
        # ``.lower()`` on ``None`` would raise AttributeError and hide the
        # real problem from the user.
        if "no audio conversion library" in (preparation_error or "").lower():
            return None, f"Cannot convert {format_display_name} format. Please upload a WAV file directly.", None, detected_format
        return None, f"Failed to convert {format_display_name} to WAV format: {preparation_error}", None, detected_format

    return prepared_path, None, True, detected_format
def perform_speech_generation(
    text_input,
    voice_mode_selection,
    voice_preset_selection,
    voice_clone_audio_file,
    model_variant,
    lsd_decode_steps,
    temperature,
    noise_clamp,
    eos_threshold,
    frames_after_eos,
    enable_custom_frames
):
    """Generate speech for ``text_input`` and return the saved audio file.

    The function validates the text, prepares the reference audio when the
    voice mode is ``VOICE_MODE_CLONE``, claims the global "generating" flag,
    then loads the model, resolves the voice state, generates audio and saves
    it. A stop request is checked between those stages.

    Args:
        text_input: Text to synthesize; checked with ``validate_text_input``.
        voice_mode_selection: Compared against ``VOICE_MODE_CLONE`` to choose
            between cloning and a preset voice.
        voice_preset_selection: Forwarded to
            ``text_to_speech_manager.get_voice_state_for_preset`` when not
            cloning.
        voice_clone_audio_file: Uploaded reference audio; required when
            cloning.
        model_variant: Forwarded to ``load_or_get_model``.
        lsd_decode_steps: Forwarded to ``load_or_get_model``.
        temperature: Forwarded to ``load_or_get_model``.
        noise_clamp: Forwarded to ``load_or_get_model``.
        eos_threshold: Forwarded to ``load_or_get_model``.
        frames_after_eos: Forwarded to ``generate_audio``.
        enable_custom_frames: Forwarded to ``generate_audio``.

    Returns:
        The value returned by ``text_to_speech_manager.save_audio_to_file``,
        or ``None`` when a stop was requested between stages.

    Raises:
        gr.Error: If the text is invalid; if cloning is selected but no file
            was uploaded, no Hugging Face token is available, or the audio
            fails validation/preparation; if another generation is already
            running; or if model loading, generation or saving fails (the
            underlying exception is wrapped and chained).
    """
    from ..core import state as global_state

    if has_temporary_files_pending_cleanup():
        cleanup_expired_temporary_files()
    perform_memory_cleanup()

    is_valid, validation_result = validate_text_input(text_input)
    if not is_valid:
        raise gr.Error(validation_result or "Please enter valid text to generate speech.")

    prepared_audio_path = None
    is_clone_mode = voice_mode_selection == VOICE_MODE_CLONE

    if is_clone_mode:
        if not voice_clone_audio_file:
            raise gr.Error("Please upload an audio file for voice cloning.")
        if not get_huggingface_token():
            raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")

        prepared_audio_path, audio_error, was_audio_converted, original_audio_format = (
            validate_and_prepare_voice_clone_audio(voice_clone_audio_file)
        )
        if prepared_audio_path is None:
            raise gr.Error(audio_error)
        if was_audio_converted:
            format_display_name = get_format_display_name(original_audio_format)
            # Called, not raised: this only notifies the user and execution
            # continues.
            gr.Warning(f"Audio converted from {format_display_name} to WAV format for voice cloning.")

    # Claim the "generating" flag. This sits outside the try/finally below on
    # purpose: when another generation owns the flag, raising here must not
    # reset that generation's state.
    with generation_state_lock:
        if global_state.is_currently_generating:
            raise gr.Error("A generation is already in progress. Please wait.")
        global_state.is_currently_generating = True
        global_state.stop_generation_requested = False

    def stop_was_requested():
        # Read the shared stop flag while holding the state lock.
        with generation_state_lock:
            return global_state.stop_generation_requested

    generated_audio_tensor = None
    cloned_voice_state_tensor = None
    voice_state = None

    try:
        text_to_speech_manager.load_or_get_model(
            model_variant,
            temperature,
            lsd_decode_steps,
            noise_clamp,
            eos_threshold
        )
        if stop_was_requested():
            return None

        if is_clone_mode:
            cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(
                voice_clone_audio_file,
                prepared_audio_path=prepared_audio_path
            )
            voice_state = cloned_voice_state_tensor
        else:
            voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)
        if stop_was_requested():
            return None

        # The second value returned by validate_text_input is what gets
        # synthesized (presumably the cleaned text -- confirm in the
        # validator).
        generated_audio_tensor = text_to_speech_manager.generate_audio(
            validation_result,
            voice_state,
            frames_after_eos,
            enable_custom_frames
        )
        if stop_was_requested():
            return None

        return text_to_speech_manager.save_audio_to_file(generated_audio_tensor)
    except gr.Error:
        raise
    except RuntimeError as runtime_error:
        raise gr.Error(str(runtime_error)) from runtime_error
    except Exception as generation_error:
        error_message = str(generation_error)
        if "file does not start with RIFF id" in error_message:
            raise gr.Error("The audio file format is not supported. Please upload a valid WAV file or a common audio format (MP3, FLAC, OGG, M4A).") from generation_error
        if "unknown format" in error_message.lower():
            raise gr.Error("The audio file uses an unsupported encoding format. Please convert it to a standard format and try again.") from generation_error
        raise gr.Error(f"Speech generation failed: {error_message}") from generation_error
    finally:
        with generation_state_lock:
            global_state.is_currently_generating = False
            global_state.stop_generation_requested = False

        # Drop every reference this frame holds to the audio and voice-state
        # objects before the cleanup call. ``voice_state`` aliases the cloned
        # voice state, so clearing only ``cloned_voice_state_tensor`` would
        # leave that object referenced from here.
        generated_audio_tensor = None
        cloned_voice_state_tensor = None
        voice_state = None

        memory_cleanup()
        trigger_background_cleanup_check()