File size: 11,477 Bytes
5da0109
 
 
 
 
 
02b5975
5da0109
 
 
02b5975
 
 
 
 
5da0109
 
 
 
 
 
 
 
 
02b5975
5da0109
dae9fa5
02b5975
 
 
 
 
 
 
 
 
dae9fa5
5da0109
 
 
 
 
 
 
 
 
 
02b5975
 
 
 
 
 
 
 
 
 
 
dae9fa5
 
 
 
02b5975
 
 
 
 
 
dae9fa5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02b5975
 
 
dae9fa5
 
 
 
 
 
02b5975
 
dae9fa5
02b5975
 
 
 
 
dae9fa5
02b5975
 
 
 
 
 
dae9fa5
02b5975
 
dae9fa5
02b5975
dae9fa5
02b5975
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9fa5
02b5975
 
 
 
 
 
 
dae9fa5
5da0109
 
 
 
 
 
 
 
 
 
 
 
 
 
02b5975
5da0109
 
 
 
 
 
 
 
 
 
dae9fa5
 
 
 
5da0109
 
 
dae9fa5
02b5975
 
 
 
 
5da0109
 
 
dae9fa5
 
 
 
 
 
 
 
 
5da0109
 
 
02b5975
5da0109
 
 
02b5975
 
5da0109
 
 
 
02b5975
 
 
5da0109
 
 
 
 
 
 
02b5975
 
 
5da0109
 
 
 
 
dae9fa5
 
 
 
5da0109
02b5975
5da0109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02b5975
 
 
 
 
 
5da0109
02b5975
 
 
 
 
 
 
 
 
 
5da0109
 
dae9fa5
 
 
 
 
 
 
 
 
5da0109
 
02b5975
 
5da0109
 
 
 
 
02b5975
 
 
 
 
 
5da0109
 
 
02b5975
 
 
 
 
 
5da0109
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#

import gradio as gr
from config import VOICE_MODE_CLONE, MAXIMUM_VOICE_CLONE_FILE_SIZE_BYTES
from ..core.state import (
    generation_state_lock,
    get_stop_generation_requested,
    set_stop_generation_requested,
    is_audio_conversion_queue_busy,
    get_audio_conversion_waiting_count,
    acquire_generation_protection,
    release_generation_protection
)
from ..core.authentication import get_huggingface_token
from ..core.memory import (
    has_temporary_files_pending_cleanup,
    cleanup_expired_temporary_files,
    perform_memory_cleanup,
    memory_cleanup,
    trigger_background_cleanup_check
)
from ..tts.manager import text_to_speech_manager, ModelNotLoadedError, ModelLoadingError
from ..validation.text import validate_text_input
from ..audio.validator import (
    perform_voice_clone_file_validation,
    get_format_display_name,
    format_file_size_for_display,
    validate_file_size_for_voice_cloning
)
from ..audio.converter import (
    prepare_audio_file_for_voice_cloning,
    AudioConversionQueueBusyError,
    AudioConversionQueueTimeoutError
)

def check_if_generating():
    """Return whether a speech generation is currently in progress.

    The flag lives as a module attribute on the core state module and is
    rebound by ``perform_speech_generation``. It is looked up on the module
    object while ``generation_state_lock`` is held, so the read is actually
    covered by the lock (a ``from ... import is_currently_generating``
    statement placed before the ``with`` would copy the value before the
    lock is taken).

    Returns:
        The current value of ``is_currently_generating`` from the state module.
    """
    from ..core import state as global_state

    with generation_state_lock:
        return global_state.is_currently_generating

def request_generation_stop():
    """Record a stop request for the running generation.

    Returns:
        A Gradio update that marks the bound output component as
        non-interactive.
    """
    set_stop_generation_requested(True)
    non_interactive_update = gr.update(interactive=False)
    return non_interactive_update

def validate_voice_clone_file_size(voice_clone_audio_file):
    """Check the size of an optional voice-clone upload.

    Args:
        voice_clone_audio_file: Uploaded file reference; a falsy value means
            nothing was uploaded and is treated as valid.

    Returns:
        ``(True, None)`` when there is no file or the size check passes,
        otherwise ``(False, error)`` with the error reported by
        ``validate_file_size_for_voice_cloning``.
    """
    if voice_clone_audio_file:
        size_ok, size_error = validate_file_size_for_voice_cloning(voice_clone_audio_file)
        if not size_ok:
            return False, size_error

    return True, None

def _describe_voice_clone_validation_failure(validation_error, format_display_name):
    """Map a validator error string to the message shown to the user."""
    if not validation_error:
        return "The uploaded file could not be validated as a valid audio file."

    # Lower-case once; every keyword test below is case-insensitive.
    normalized_error = validation_error.lower()

    if "too short" in normalized_error:
        return "The uploaded audio file is too short. Please upload a longer audio sample for better voice cloning results."

    if "too long" in normalized_error:
        return "The uploaded audio file is too long. Please upload a shorter audio sample (maximum 1 hour)."

    if "empty" in normalized_error or "0 bytes" in normalized_error:
        return "The uploaded audio file is empty. Please upload a valid audio file."

    if "corrupted" in normalized_error or "truncated" in normalized_error:
        return f"The uploaded {format_display_name} file appears to be corrupted or incomplete. Please upload a valid audio file."

    # These validator messages are already user-readable; pass them through.
    if "unsupported" in normalized_error:
        return validation_error

    if "exceeds" in normalized_error or "maximum" in normalized_error:
        return validation_error

    return f"Invalid audio file: {validation_error}"


def validate_and_prepare_voice_clone_audio(voice_clone_audio_file):
    """Validate an uploaded voice-clone sample and prepare it for cloning.

    The file is size-checked, validated, and then handed to
    ``prepare_audio_file_for_voice_cloning`` (waiting for the conversion
    queue). A Gradio warning is shown first when the conversion queue is
    reported busy.

    Args:
        voice_clone_audio_file: Uploaded file reference; falsy means no upload.

    Returns:
        A 4-tuple ``(prepared_path, error_message, was_converted, format)``.
        On success ``prepared_path`` is set, ``error_message`` is ``None``,
        ``was_converted`` is ``False`` for WAV input and ``True`` otherwise,
        and ``format`` is ``'wav'`` or the detected source format.
        On failure ``prepared_path`` and ``was_converted`` are ``None`` and
        ``error_message`` describes the problem.
    """
    if not voice_clone_audio_file:
        return None, "Please upload an audio file for voice cloning.", None, None

    file_size_valid, file_size_error = validate_file_size_for_voice_cloning(voice_clone_audio_file)

    if not file_size_valid:
        return None, file_size_error, None, None

    is_valid, is_wav_format, detected_format, validation_error = perform_voice_clone_file_validation(voice_clone_audio_file)

    if not is_valid:
        format_display_name = get_format_display_name(detected_format) if detected_format else "Unknown"
        failure_message = _describe_voice_clone_validation_failure(validation_error, format_display_name)
        return None, failure_message, None, detected_format

    format_display_name = get_format_display_name(detected_format)

    if is_audio_conversion_queue_busy():
        waiting_count = get_audio_conversion_waiting_count()

        if waiting_count > 0:
            gr.Warning(f"Audio conversion queue is busy. Your request is queued (position: {waiting_count + 1}). Please wait...")
        else:
            gr.Warning("Audio conversion is in progress for another user. Your request has been queued. Please wait...")

    # WAV and non-WAV inputs go through the same preparation call; only the
    # interpretation of the result differs.
    try:
        prepared_path, preparation_error, _was_converted, _final_format = prepare_audio_file_for_voice_cloning(
            voice_clone_audio_file,
            wait_for_queue=True
        )
    except (AudioConversionQueueBusyError, AudioConversionQueueTimeoutError) as queue_error:
        return None, str(queue_error), None, detected_format

    if prepared_path is not None:
        if is_wav_format:
            return prepared_path, None, False, 'wav'
        return prepared_path, None, True, detected_format

    if is_wav_format:
        return None, f"Failed to process WAV file: {preparation_error}", None, 'wav'

    # Guard against a missing error description so keyword matching cannot
    # raise AttributeError on None.
    normalized_preparation_error = (preparation_error or "").lower()

    if "no audio conversion library" in normalized_preparation_error:
        return None, f"Cannot convert {format_display_name} format. Please upload a WAV file directly.", None, detected_format

    if "queue" in normalized_preparation_error or "busy" in normalized_preparation_error:
        return None, preparation_error, None, detected_format

    return None, f"Failed to convert {format_display_name} to WAV format: {preparation_error}", None, detected_format

def perform_speech_generation(
    text_input,
    voice_mode_selection,
    voice_preset_selection,
    voice_clone_audio_file,
    model_variant,
    lsd_decode_steps,
    temperature,
    noise_clamp,
    eos_threshold,
    frames_after_eos,
    enable_custom_frames
):
    """Generate speech for the given text and return the saved audio file path.

    Flow: validate the text, optionally validate and prepare the voice-clone
    sample, claim the single "generating" slot under ``generation_state_lock``,
    load the model, resolve the voice state, generate audio and save it.
    A stop request is checked after model loading, after voice-state
    resolution and after audio generation.

    Args:
        text_input: Raw text; passed to ``validate_text_input``.
        voice_mode_selection: Compared against ``VOICE_MODE_CLONE`` to choose
            between cloning and a preset voice.
        voice_preset_selection: Forwarded to ``get_voice_state_for_preset``
            when not in clone mode.
        voice_clone_audio_file: Uploaded sample, required in clone mode.
        model_variant, temperature, lsd_decode_steps, noise_clamp,
        eos_threshold: Forwarded to ``text_to_speech_manager.load_or_get_model``.
        frames_after_eos, enable_custom_frames: Forwarded to
            ``text_to_speech_manager.generate_audio``.

    Returns:
        The path returned by ``save_audio_to_file``, or ``None`` when a stop
        was requested at one of the checkpoints.

    Raises:
        gr.Error: For invalid input, a generation already in progress, model
            loading problems, or any other failure during generation.
    """
    from ..core import state as global_state

    # Clear out expired temporary files, if any are pending, before new work.
    if has_temporary_files_pending_cleanup():
        cleanup_expired_temporary_files()

    # validation_result doubles as the error message on failure and as the
    # text handed to generate_audio on success.
    is_valid, validation_result = validate_text_input(text_input)

    if not is_valid:
        if validation_result:
            raise gr.Error(validation_result)
        raise gr.Error("Please enter valid text to generate speech.")

    prepared_audio_path = None
    was_audio_converted = False
    original_audio_format = None

    # Clone-mode checks and audio preparation run BEFORE the generating flag
    # is claimed below, so they are not serialized by that flag.
    if voice_mode_selection == VOICE_MODE_CLONE:
        if not voice_clone_audio_file:
            raise gr.Error("Please upload an audio file for voice cloning.")

        file_size_valid, file_size_error = validate_voice_clone_file_size(voice_clone_audio_file)
        if not file_size_valid:
            max_size_display = format_file_size_for_display(MAXIMUM_VOICE_CLONE_FILE_SIZE_BYTES)
            raise gr.Error(f"File size exceeds maximum limit of {max_size_display}. {file_size_error}")

        # Clone mode is refused when no Hugging Face token is available.
        if not get_huggingface_token():
            raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")

        prepared_audio_path, audio_error, was_audio_converted, original_audio_format = validate_and_prepare_voice_clone_audio(voice_clone_audio_file)

        if prepared_audio_path is None:
            raise gr.Error(audio_error)

        if was_audio_converted:
            format_display_name = get_format_display_name(original_audio_format)
            gr.Warning(f"Audio converted from {format_display_name} to WAV format for voice cloning.")

    # Check-and-set under the lock so only one caller can claim the
    # generating slot; any earlier stop request is cleared at the same time.
    with generation_state_lock:
        if global_state.is_currently_generating:
            raise gr.Error("A generation is already in progress. Please wait.")

        global_state.is_currently_generating = True
        global_state.stop_generation_requested = False

    # NOTE(review): this call sits outside the try/finally below; if it can
    # raise, is_currently_generating would stay True. Confirm it cannot fail.
    acquire_generation_protection()

    # Pre-declared so the finally block can test them on every exit path.
    generated_audio_tensor = None
    cloned_voice_state_tensor = None

    try:
        perform_memory_cleanup()

        loaded_model = text_to_speech_manager.load_or_get_model(
            model_variant,
            temperature,
            lsd_decode_steps,
            noise_clamp,
            eos_threshold
        )

        if loaded_model is None:
            raise gr.Error("Failed to load TTS model. Please try again.")

        # Cancellation checkpoint 1: after model loading.
        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None

        if voice_mode_selection == VOICE_MODE_CLONE:
            cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(
                voice_clone_audio_file,
                prepared_audio_path=prepared_audio_path
            )
            voice_state = cloned_voice_state_tensor

        else:
            voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)

        # Cancellation checkpoint 2: after the voice state is resolved.
        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None

        generated_audio_tensor = text_to_speech_manager.generate_audio(
            validation_result,
            voice_state,
            frames_after_eos,
            enable_custom_frames
        )

        # Cancellation checkpoint 3: after generation, before saving to disk.
        with generation_state_lock:
            if global_state.stop_generation_requested:
                return None

        output_file_path = text_to_speech_manager.save_audio_to_file(generated_audio_tensor)

        return output_file_path

    # Errors that are already gr.Error are re-raised unchanged so the
    # handlers below do not wrap them a second time.
    except gr.Error:
        raise

    except ModelNotLoadedError as model_not_loaded_error:
        raise gr.Error(str(model_not_loaded_error))

    except ModelLoadingError as model_loading_error:
        raise gr.Error(f"Failed to load TTS model: {str(model_loading_error)}")

    except RuntimeError as runtime_error:
        error_message = str(runtime_error)
        if "not loaded" in error_message.lower():

            # The message shown depends on whether ensure_model_loaded()
            # reports success; either way this request fails.
            if text_to_speech_manager.ensure_model_loaded():
                raise gr.Error("Model was temporarily unavailable. Please try again.")

            else:
                raise gr.Error("TTS model could not be loaded. Please try again later.")

        raise gr.Error(error_message)

    except Exception as generation_error:
        error_message = str(generation_error)

        # Translate two known low-level audio decoding messages into
        # friendlier text; everything else gets a generic prefix.
        if "file does not start with RIFF id" in error_message:
            raise gr.Error("The audio file format is not supported. Please upload a valid WAV file or a common audio format (MP3, FLAC, OGG, M4A).")

        if "unknown format" in error_message.lower():
            raise gr.Error("The audio file uses an unsupported encoding format. Please convert it to a standard format and try again.")

        raise gr.Error(f"Speech generation failed: {error_message}")

    finally:
        # Runs on success, on early "return None" and on every raised error.
        release_generation_protection()

        with generation_state_lock:
            global_state.is_currently_generating = False
            global_state.stop_generation_requested = False

        # Drop local references to the audio and voice-state objects before
        # the cleanup calls below; the names are rebound to None afterwards
        # so they stay defined.
        if generated_audio_tensor is not None:
            try:
                del generated_audio_tensor

            except Exception:
                pass

            generated_audio_tensor = None

        if cloned_voice_state_tensor is not None:
            try:
                del cloned_voice_state_tensor

            except Exception:
                pass

            cloned_voice_state_tensor = None

        memory_cleanup()
        trigger_background_cleanup_check()