File size: 8,119 Bytes
5da0109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9fa5
 
 
 
 
5da0109
 
 
 
 
 
 
 
 
 
dae9fa5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5da0109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9fa5
 
 
 
5da0109
 
 
dae9fa5
5da0109
 
 
dae9fa5
 
 
 
 
 
 
 
 
5da0109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9fa5
 
 
 
5da0109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae9fa5
 
 
 
 
 
 
 
 
5da0109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#

import gradio as gr
from config import VOICE_MODE_CLONE
from ..core.state import (
    generation_state_lock,
    get_stop_generation_requested,
    set_stop_generation_requested
)
from ..core.authentication import get_huggingface_token
from ..core.memory import (
    has_temporary_files_pending_cleanup,
    cleanup_expired_temporary_files,
    perform_memory_cleanup,
    memory_cleanup,
    trigger_background_cleanup_check
)
from ..tts.manager import text_to_speech_manager
from ..validation.text import validate_text_input
from ..audio.validator import (
    perform_comprehensive_audio_validation,
    get_format_display_name
)
from ..audio.converter import prepare_audio_file_for_voice_cloning

def check_if_generating():
    """Report whether a speech generation is currently in progress.

    Returns:
        The current value of ``is_currently_generating`` on the shared
        ``core.state`` module, read while holding ``generation_state_lock``.
    """
    # Access the flag through the module object. A
    # ``from ..core.state import is_currently_generating`` statement copies the
    # value at the moment the import runs, i.e. before the lock is taken, so
    # the lock would not actually guard the read.
    from ..core import state as global_state

    with generation_state_lock:
        return global_state.is_currently_generating

def request_generation_stop():
    """Ask the running generation to stop.

    Sets the shared stop flag via ``set_stop_generation_requested(True)``.

    Returns:
        A ``gr.update`` marking the bound component as non-interactive
        (presumably the stop button; confirm against the UI wiring).
    """
    set_stop_generation_requested(True)
    disabled_component_update = gr.update(interactive=False)
    return disabled_component_update

def validate_and_prepare_voice_clone_audio(voice_clone_audio_file):
    """Validate an uploaded reference audio file and prepare it for voice cloning.

    Args:
        voice_clone_audio_file: The uploaded audio file reference. A falsy
            value means nothing was uploaded.

    Returns:
        A 4-tuple ``(prepared_path, error_message, was_converted, audio_format)``.

        * Success, WAV input: ``(prepared_path, None, False, 'wav')``.
        * Success, other input: ``(prepared_path, None, True, detected_format)``.
        * Failure: ``(None, error_message, None, audio_format)`` where
          ``error_message`` is a user-facing description and ``audio_format``
          is ``None`` (no upload), ``'wav'`` (WAV processing failure) or the
          detected format.
    """
    if not voice_clone_audio_file:
        return None, "Please upload an audio file for voice cloning.", None, None

    is_valid, is_wav_format, detected_format, validation_error = perform_comprehensive_audio_validation(voice_clone_audio_file)

    if not is_valid:
        format_display_name = get_format_display_name(detected_format) if detected_format else "Unknown"

        if not validation_error:
            return None, "The uploaded file could not be validated as a valid audio file.", None, detected_format

        # Lower-case once; the checks below are ordered and the first match wins.
        lowered_error = validation_error.lower()

        if "too short" in lowered_error:
            return None, "The uploaded audio file is too short. Please upload a longer audio sample for better voice cloning results.", None, detected_format

        if "too long" in lowered_error:
            return None, "The uploaded audio file is too long. Please upload a shorter audio sample (maximum 1 hour).", None, detected_format

        if "empty" in lowered_error or "0 bytes" in lowered_error:
            return None, "The uploaded audio file is empty. Please upload a valid audio file.", None, detected_format

        if "corrupted" in lowered_error or "truncated" in lowered_error:
            return None, f"The uploaded {format_display_name} file appears to be corrupted or incomplete. Please upload a valid audio file.", None, detected_format

        if "unsupported" in lowered_error:
            # The validator's own message is passed through unchanged here.
            return None, validation_error, None, detected_format

        return None, f"Invalid audio file: {validation_error}", None, detected_format

    format_display_name = get_format_display_name(detected_format)

    # Both WAV and non-WAV inputs go through the same preparation call; only
    # the shape of the returned tuple differs. The converter's own
    # was_converted / final_format values are intentionally not used.
    prepared_path, preparation_error, _was_converted, _final_format = prepare_audio_file_for_voice_cloning(voice_clone_audio_file)

    if prepared_path is not None:
        if is_wav_format:
            return prepared_path, None, False, 'wav'
        return prepared_path, None, True, detected_format

    # Guard against a failed preparation that reports no error text, which
    # would otherwise crash on ``.lower()`` or surface "None" to the user.
    error_text = str(preparation_error) if preparation_error else "unknown error"

    if is_wav_format:
        return None, f"Failed to process WAV file: {error_text}", None, 'wav'

    if "no audio conversion library" in error_text.lower():
        return None, f"Cannot convert {format_display_name} format. Please upload a WAV file directly.", None, detected_format

    return None, f"Failed to convert {format_display_name} to WAV format: {error_text}", None, detected_format

def perform_speech_generation(
    text_input,
    voice_mode_selection,
    voice_preset_selection,
    voice_clone_audio_file,
    model_variant,
    lsd_decode_steps,
    temperature,
    noise_clamp,
    eos_threshold,
    frames_after_eos,
    enable_custom_frames
):
    """Generate speech audio for the given text and return the output file path.

    Args:
        text_input: Raw text; checked with ``validate_text_input``.
        voice_mode_selection: Selected voice mode; compared against
            ``VOICE_MODE_CLONE`` to choose cloning over a preset.
        voice_preset_selection: Preset identifier passed to
            ``get_voice_state_for_preset`` when not cloning.
        voice_clone_audio_file: Uploaded reference audio, required in clone mode.
        model_variant: Passed to ``load_or_get_model``.
        lsd_decode_steps: Passed to ``load_or_get_model``.
        temperature: Passed to ``load_or_get_model``.
        noise_clamp: Passed to ``load_or_get_model``.
        eos_threshold: Passed to ``load_or_get_model``.
        frames_after_eos: Passed to ``generate_audio``.
        enable_custom_frames: Passed to ``generate_audio``.

    Returns:
        The path returned by ``save_audio_to_file``, or ``None`` when a stop
        was requested between pipeline stages.

    Raises:
        gr.Error: On invalid text, missing or invalid clone audio, a missing
            Hugging Face token in clone mode, a generation already in
            progress, or any failure during model loading or generation.
    """
    from ..core import state as global_state

    def stop_was_requested():
        # Read the shared stop flag while holding the state lock.
        with generation_state_lock:
            return global_state.stop_generation_requested

    if has_temporary_files_pending_cleanup():
        cleanup_expired_temporary_files()

    perform_memory_cleanup()

    is_valid, validation_result = validate_text_input(text_input)

    if not is_valid:
        # On failure validation_result is used as the error message, if any.
        raise gr.Error(validation_result or "Please enter valid text to generate speech.")

    prepared_audio_path = None
    was_audio_converted = False
    original_audio_format = None

    if voice_mode_selection == VOICE_MODE_CLONE:
        if not voice_clone_audio_file:
            raise gr.Error("Please upload an audio file for voice cloning.")

        if not get_huggingface_token():
            raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")

        prepared_audio_path, audio_error, was_audio_converted, original_audio_format = validate_and_prepare_voice_clone_audio(voice_clone_audio_file)

        if prepared_audio_path is None:
            raise gr.Error(audio_error)

        if was_audio_converted:
            format_display_name = get_format_display_name(original_audio_format)
            gr.Warning(f"Audio converted from {format_display_name} to WAV format for voice cloning.")

    # Claim the generation slot. This happens outside the try/finally below so
    # that a rejected request does not reset the flags of the generation that
    # is actually running.
    with generation_state_lock:
        if global_state.is_currently_generating:
            raise gr.Error("A generation is already in progress. Please wait.")
        global_state.is_currently_generating = True
        global_state.stop_generation_requested = False

    generated_audio_tensor = None
    cloned_voice_state_tensor = None
    voice_state = None

    try:
        text_to_speech_manager.load_or_get_model(
            model_variant,
            temperature,
            lsd_decode_steps,
            noise_clamp,
            eos_threshold
        )

        if stop_was_requested():
            return None

        if voice_mode_selection == VOICE_MODE_CLONE:
            cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(
                voice_clone_audio_file,
                prepared_audio_path=prepared_audio_path
            )
            voice_state = cloned_voice_state_tensor
        else:
            voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)

        if stop_was_requested():
            return None

        # validation_result is what the validator returned on success
        # (presumably the cleaned text; confirm in validate_text_input).
        generated_audio_tensor = text_to_speech_manager.generate_audio(
            validation_result,
            voice_state,
            frames_after_eos,
            enable_custom_frames
        )

        if stop_was_requested():
            return None

        output_file_path = text_to_speech_manager.save_audio_to_file(generated_audio_tensor)

        return output_file_path

    except gr.Error:
        # Already user-facing; propagate unchanged.
        raise

    except RuntimeError as runtime_error:
        raise gr.Error(str(runtime_error)) from runtime_error

    except Exception as generation_error:
        error_message = str(generation_error)

        if "file does not start with RIFF id" in error_message:
            raise gr.Error("The audio file format is not supported. Please upload a valid WAV file or a common audio format (MP3, FLAC, OGG, M4A).") from generation_error

        if "unknown format" in error_message.lower():
            raise gr.Error("The audio file uses an unsupported encoding format. Please convert it to a standard format and try again.") from generation_error

        raise gr.Error(f"Speech generation failed: {error_message}") from generation_error

    finally:
        with generation_state_lock:
            global_state.is_currently_generating = False
            global_state.stop_generation_requested = False

        # Drop every local reference to the generated audio and voice state
        # before the cleanup call. voice_state aliases the cloned tensor in
        # clone mode, so it is cleared too (assumes memory_cleanup() reclaims
        # unreferenced objects; confirm in core.memory).
        generated_audio_tensor = None
        cloned_voice_state_tensor = None
        voice_state = None

        memory_cleanup()
        trigger_background_cleanup_check()