hadadrjt committed on
Commit
dae9fa5
·
1 Parent(s): 5da0109

[3/?] Pocket TTS: Handle multiple format extensions for voice cloning.

Browse files
config.py CHANGED
@@ -44,6 +44,31 @@ MEMORY_CRITICAL_THRESHOLD = int(0.85 * MAXIMUM_MEMORY_USAGE)
44
  MEMORY_CHECK_INTERVAL = 30
45
  MEMORY_IDLE_TARGET = int(0.5 * MAXIMUM_MEMORY_USAGE)
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  EXAMPLE_PROMPTS = [
48
  {
49
  "text": "The quick brown fox jumps over the lazy dog near the riverbank.",
 
44
  MEMORY_CHECK_INTERVAL = 30
45
  MEMORY_IDLE_TARGET = int(0.5 * MAXIMUM_MEMORY_USAGE)
46
 
47
# Lowercase file extensions (with the leading dot) that the audio validator
# accepts for uploads. Order matters: the list is joined verbatim into the
# "Supported formats are: ..." error message.
SUPPORTED_AUDIO_EXTENSIONS = [
    ".wav", ".mp3", ".flac", ".ogg",
    ".m4a", ".aac", ".wma", ".aiff",
    ".aif", ".opus", ".webm", ".mp4",
    ".mkv", ".avi", ".mov", ".3gp",
]

# Display names keyed by format code (the extension without its dot) for the
# codes whose display name is not simply the upper-cased code.
AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES = {"m4a": "M4A/AAC", "aif": "AIFF", "3gp": "3GP"}
71
+
72
  EXAMPLE_PROMPTS = [
73
  {
74
  "text": "The quick brown fox jumps over the lazy dog near the riverbank.",
src/audio/converter.py CHANGED
@@ -3,6 +3,7 @@
3
  # SPDX-License-Identifier: Apache-2.0
4
  #
5
 
 
6
  import time
7
  import tempfile
8
  import numpy as np
@@ -10,33 +11,210 @@ import scipy.io.wavfile
10
  from ..core.state import temporary_files_registry, temporary_files_lock
11
  from ..core.memory import trigger_background_cleanup_check
12
 
13
- def convert_audio_to_pcm_wav(input_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  try:
15
  sample_rate, audio_data = scipy.io.wavfile.read(input_path)
16
 
17
- if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
18
- audio_data = np.clip(audio_data, -1.0, 1.0)
19
- audio_data = (audio_data * 32767).astype(np.int16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- elif audio_data.dtype == np.int32:
22
- audio_data = (audio_data >> 16).astype(np.int16)
 
 
 
 
23
 
24
- elif audio_data.dtype == np.uint8:
25
- audio_data = ((audio_data.astype(np.int16) - 128) * 256).astype(np.int16)
26
 
27
- elif audio_data.dtype != np.int16:
28
- audio_data = audio_data.astype(np.int16)
29
 
30
- output_file = tempfile.NamedTemporaryFile(suffix="_converted.wav", delete=False)
31
- scipy.io.wavfile.write(output_file.name, sample_rate, audio_data)
32
 
33
- with temporary_files_lock:
34
- temporary_files_registry[output_file.name] = time.time()
35
 
36
- trigger_background_cleanup_check()
37
 
38
- return output_file.name
 
39
 
40
  except Exception as conversion_error:
41
- print(f"Warning: {conversion_error}")
42
- return input_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  # SPDX-License-Identifier: Apache-2.0
4
  #
5
 
6
+ import os
7
  import time
8
  import tempfile
9
  import numpy as np
 
11
  from ..core.state import temporary_files_registry, temporary_files_lock
12
  from ..core.memory import trigger_background_cleanup_check
13
 
14
def convert_audio_data_to_pcm_int16(audio_data):
    """Return ``audio_data`` rescaled to 16-bit signed PCM samples.

    Args:
        audio_data: NumPy array of audio samples.

    Returns:
        An ``int16`` array. ``int16`` input is returned unchanged (same object).

    Scaling rules:
        * floating point: clipped to [-1.0, 1.0] and multiplied by 32767;
        * signed integers wider than 16 bits: the top 16 bits are kept;
        * ``int8``: shifted up to fill 16 bits;
        * ``uint8``: re-centred around zero and multiplied by 256;
        * wider unsigned integers: top 16 bits kept, then re-centred;
        * any other dtype: plain ``astype(np.int16)`` cast.
    """
    sample_dtype = audio_data.dtype

    if sample_dtype == np.int16:
        return audio_data

    if np.issubdtype(sample_dtype, np.floating):
        if sample_dtype.itemsize < 4:
            # float16 cannot represent values near 32767 exactly, so scale in
            # float32 instead of in the input precision.
            audio_data = audio_data.astype(np.float32)
        audio_data_clipped = np.clip(audio_data, -1.0, 1.0)
        return (audio_data_clipped * 32767).astype(np.int16)

    sample_bits = sample_dtype.itemsize * 8

    if np.issubdtype(sample_dtype, np.signedinteger):
        if sample_bits > 16:
            # int32 -> ">> 16", int64 -> ">> 48".
            return (audio_data >> (sample_bits - 16)).astype(np.int16)
        # int8: a bare cast would leave the signal 256x too quiet.
        return (audio_data.astype(np.int16) << (16 - sample_bits)).astype(np.int16)

    if np.issubdtype(sample_dtype, np.unsignedinteger):
        if sample_bits == 8:
            return ((audio_data.astype(np.int16) - 128) * 256).astype(np.int16)
        # uint16/uint32/uint64: unsigned samples are centred on mid-scale, so
        # keep the top 16 bits and shift the midpoint to zero. A bare cast
        # would wrap the upper half of the range to negative values.
        top_sixteen_bits = audio_data >> (sample_bits - 16)
        return (top_sixteen_bits.astype(np.int32) - 32768).astype(np.int16)

    return audio_data.astype(np.int16)
36
+
37
def convert_stereo_to_mono(audio_data):
    """Collapse a 2-D multi-channel sample array into a single channel.

    Arrays that are not 2-D are returned unchanged. A 2-D array is first
    oriented so that its shorter axis comes first (that axis is treated as
    the channel axis), then the channels are averaged and cast back to the
    input dtype. With a single channel row, that row is returned directly.
    """
    if audio_data.ndim != 2:
        return audio_data

    first_axis_length, second_axis_length = audio_data.shape
    channels_first = audio_data.T if first_axis_length > second_axis_length else audio_data

    if channels_first.shape[0] <= 1:
        return channels_first[0]

    averaged_channels = np.mean(channels_first, axis=0)
    return averaged_channels.astype(channels_first.dtype)
52
+
53
def register_temporary_file(file_path):
    """Record ``file_path`` in the temporary-file registry and poke the cleaner.

    The registry entry maps the path to the current ``time.time()`` value and
    is written while holding ``temporary_files_lock``.
    """
    # NOTE(review): the cleanup trigger runs after the lock is released, as in
    # the code this helper was extracted from - confirm against the repository.
    with temporary_files_lock:
        temporary_files_registry[file_path] = time.time()

    trigger_background_cleanup_check()
57
+
58
def convert_wav_file_to_pcm_format(input_path):
    """Rewrite a WAV file as mono 16-bit PCM in a new temporary file.

    Args:
        input_path: Path of the WAV file to read with ``scipy.io.wavfile``.

    Returns:
        ``(output_path, None)`` on success, or ``(None, error_message)`` when
        reading, converting or writing fails. Exceptions are never propagated.
    """
    try:
        sample_rate, audio_data = scipy.io.wavfile.read(input_path)

        if len(audio_data.shape) > 1:
            audio_data = convert_stereo_to_mono(audio_data)

        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)

        # Only the reserved path is needed; close the handle immediately so no
        # descriptor is leaked and the file can be reopened by name for writing.
        with tempfile.NamedTemporaryFile(suffix="_pcm_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
        except Exception:
            # The file is not registered for cleanup yet, so remove it here
            # rather than leaving an orphan behind.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        register_temporary_file(output_path)

        return output_path, None

    except Exception as conversion_error:
        return None, f"Failed to convert WAV to PCM format: {str(conversion_error)}"
76
+
77
def convert_audio_using_pydub(input_path, target_sample_rate=None):
    """Convert any pydub-readable file to a mono 16-bit WAV temporary file.

    Args:
        input_path: Path of the audio file to decode.
        target_sample_rate: Optional frame rate to resample to; the source
            rate is kept when ``None``.

    Returns:
        ``(output_path, None)`` on success, otherwise ``(None, error)`` where
        ``error`` is ``"pydub_library_not_available"`` if pydub cannot be
        imported, ``"ffmpeg_not_available"`` if the failure message mentions
        ffmpeg/ffprobe, or a descriptive message for any other failure.
    """
    try:
        from pydub import AudioSegment

        audio_segment = AudioSegment.from_file(input_path)

        audio_segment = audio_segment.set_channels(1)
        audio_segment = audio_segment.set_sample_width(2)

        if target_sample_rate is not None:
            audio_segment = audio_segment.set_frame_rate(target_sample_rate)

        # Reserve the path, then release our own handle so it is not leaked.
        with tempfile.NamedTemporaryFile(suffix="_pydub_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            # export() hands back the file object it wrote to (per pydub's API
            # documentation); close it so that handle is not leaked either.
            exported_file = audio_segment.export(output_path, format="wav")
            close_exported_file = getattr(exported_file, "close", None)
            if callable(close_exported_file):
                close_exported_file()
        except Exception:
            # Not registered for cleanup yet: remove the partial file ourselves.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        register_temporary_file(output_path)

        return output_path, None

    except ImportError:
        return None, "pydub_library_not_available"

    except Exception as conversion_error:
        error_message = str(conversion_error)
        if "ffmpeg" in error_message.lower() or "ffprobe" in error_message.lower():
            return None, "ffmpeg_not_available"
        return None, f"Failed to convert audio using pydub: {error_message}"
104
+
105
def convert_audio_using_soundfile(input_path):
    """Convert a soundfile-readable file to a mono 16-bit WAV temporary file.

    Args:
        input_path: Path of the audio file to decode.

    Returns:
        ``(output_path, None)`` on success, ``(None,
        "soundfile_library_not_available")`` if soundfile cannot be imported,
        or ``(None, message)`` for any other failure.
    """
    try:
        import soundfile

        audio_data, sample_rate = soundfile.read(input_path, dtype='float32')

        # Multi-channel data is averaged over axis 1 to obtain one channel.
        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)

        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)

        # Reserve the path, then release our own handle so it is not leaked.
        with tempfile.NamedTemporaryFile(suffix="_soundfile_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
        except Exception:
            # Not registered for cleanup yet: remove the partial file ourselves.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        register_temporary_file(output_path)

        return output_path, None

    except ImportError:
        return None, "soundfile_library_not_available"

    except Exception as conversion_error:
        return None, f"Failed to convert audio using soundfile: {str(conversion_error)}"
128
+
129
def convert_audio_using_librosa(input_path):
    """Convert a librosa-readable file to a mono 16-bit WAV temporary file.

    The file is loaded with ``sr=None`` and ``mono=True``, so the sample rate
    reported by librosa is the one written to the output.

    Args:
        input_path: Path of the audio file to decode.

    Returns:
        ``(output_path, None)`` on success, ``(None,
        "librosa_library_not_available")`` if librosa cannot be imported, or
        ``(None, message)`` for any other failure.
    """
    try:
        import librosa

        audio_data, sample_rate = librosa.load(input_path, sr=None, mono=True)

        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)

        # Reserve the path, then release our own handle so it is not leaked.
        with tempfile.NamedTemporaryFile(suffix="_librosa_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
        except Exception:
            # Not registered for cleanup yet: remove the partial file ourselves.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        register_temporary_file(output_path)

        return output_path, None

    except ImportError:
        return None, "librosa_library_not_available"

    except Exception as conversion_error:
        return None, f"Failed to convert audio using librosa: {str(conversion_error)}"
149
+
150
def convert_non_wav_audio_to_wav(input_path):
    """Try each conversion backend in turn until one produces a WAV file.

    Backends are attempted in the order pydub, soundfile, librosa.

    Returns:
        ``(wav_path, None, backend_name)`` for the first backend that
        succeeds, otherwise ``(None, error_message, None)``. The message
        states that no conversion library is available when every backend
        reported itself unavailable; otherwise it lists the errors of the
        backends that were available.
    """
    backends = (
        ("pydub", convert_audio_using_pydub, ("pydub_library_not_available", "ffmpeg_not_available")),
        ("soundfile", convert_audio_using_soundfile, ("soundfile_library_not_available",)),
        ("librosa", convert_audio_using_librosa, ("librosa_library_not_available",)),
    )

    attempts = []
    for backend_name, run_backend, unavailable_markers in backends:
        wav_path, backend_error = run_backend(input_path)
        if wav_path is not None:
            return wav_path, None, backend_name
        attempts.append((backend_name, backend_error, backend_error in unavailable_markers))

    if all(is_unavailable for _, _, is_unavailable in attempts):
        return None, "No audio conversion library is available on the server. Please upload a WAV file directly.", None

    # Only backends that actually ran contribute to the combined report.
    reported_errors = [
        f"{backend_name}: {backend_error}"
        for backend_name, backend_error, is_unavailable in attempts
        if not is_unavailable and backend_error
    ]

    if not reported_errors:
        return None, "Audio conversion failed. Please try uploading a different audio file or use WAV format.", None

    combined_error = " | ".join(reported_errors)
    return None, f"Audio conversion failed with all available methods. {combined_error}", None
185
+
186
def prepare_audio_file_for_voice_cloning(input_path):
    """Validate an uploaded audio file and produce a PCM WAV version of it.

    Args:
        input_path: Path of the uploaded audio file.

    Returns:
        A 4-tuple ``(prepared_path, error, was_converted, detected_format)``:

        * validation failure: ``(None, validation_error, False, detected_format)``;
        * WAV input: ``(pcm_path, None, False, 'wav')`` or
          ``(None, conversion_error, False, 'wav')``;
        * other formats: ``(path, None, True, detected_format)`` on success or
          ``(None, conversion_error, True, detected_format)`` on failure.
    """
    # Imported here rather than at module level, as in the original code.
    from .validator import perform_comprehensive_audio_validation

    is_valid, is_wav_format, detected_format, validation_error = perform_comprehensive_audio_validation(input_path)

    if not is_valid:
        return None, validation_error, False, detected_format

    if is_wav_format:
        converted_path, conversion_error = convert_wav_file_to_pcm_format(input_path)
        if converted_path is not None:
            return converted_path, None, False, 'wav'
        return None, conversion_error, False, 'wav'

    converted_path, conversion_error, _ = convert_non_wav_audio_to_wav(input_path)
    if converted_path is None:
        return None, conversion_error, True, detected_format

    # Normalise the intermediate WAV to mono int16. If that extra pass fails,
    # the intermediate file is still returned as a best-effort result.
    final_path, _ = convert_wav_file_to_pcm_format(converted_path)
    if final_path is not None:
        return final_path, None, True, detected_format
    return converted_path, None, True, detected_format
210
+
211
def convert_audio_to_pcm_wav(input_path):
    """Return a PCM WAV path for ``input_path``, or ``input_path`` on failure.

    Wraps ``prepare_audio_file_for_voice_cloning`` and returns a single path.
    A failure is reported with a printed warning and the untouched input
    path is returned.
    """
    prepared_path, failure_reason, _, _ = prepare_audio_file_for_voice_cloning(input_path)

    if prepared_path is None:
        if failure_reason:
            print(f"Warning: Audio conversion failed - {failure_reason}")
        return input_path

    return prepared_path
src/audio/validator.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ import os
7
+ import wave
8
+ from config import (
9
+ SUPPORTED_AUDIO_EXTENSIONS,
10
+ AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES
11
+ )
12
+
13
def build_format_display_names_from_supported_extensions():
    """Build the format-code -> display-name table.

    Each supported extension contributes its code (the extension without
    leading dots). The display name is the configured override when one
    exists, else the upper-cased code. An ``"unknown"`` entry is always added.
    """
    format_codes = (extension.lstrip(".") for extension in SUPPORTED_AUDIO_EXTENSIONS)

    display_names = {}
    for format_code in format_codes:
        try:
            display_names[format_code] = AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES[format_code]
        except KeyError:
            display_names[format_code] = format_code.upper()

    display_names["unknown"] = "Unknown"
    return display_names

# Computed once at import time and shared by get_format_display_name().
FORMAT_DISPLAY_NAMES = build_format_display_names_from_supported_extensions()
29
+
30
def get_audio_file_extension(file_path):
    """Return the lower-cased extension of ``file_path`` including its dot.

    Returns ``None`` for a falsy path and ``""`` when the path has no
    extension.
    """
    if file_path:
        return os.path.splitext(file_path)[1].lower()
    return None
37
+
38
def is_supported_audio_extension(file_path):
    """Tell whether the extension of ``file_path`` is in the supported list.

    A path for which no extension can be determined counts as unsupported.
    """
    detected_extension = get_audio_file_extension(file_path)
    return detected_extension is not None and detected_extension in SUPPORTED_AUDIO_EXTENSIONS
45
+
46
def validate_file_exists_and_readable(file_path):
    """Check that ``file_path`` names a readable, plausibly sized regular file.

    Returns:
        ``(True, None)`` when every check passes, otherwise ``(False, reason)``
        describing the first failing check: missing path argument, missing
        file, not a regular file, unreadable size, empty file, fewer than 44
        bytes, or an error while reading the first byte.
    """
    if not file_path:
        return False, "No audio file provided."

    if not os.path.isfile(file_path):
        if os.path.exists(file_path):
            return False, "The provided path is not a valid file."
        return False, "Audio file does not exist."

    try:
        size_in_bytes = os.path.getsize(file_path)
    except OSError as size_error:
        return False, f"Cannot read file size: {str(size_error)}"

    if size_in_bytes < 44:
        if size_in_bytes == 0:
            return False, "Audio file is empty (0 bytes)."
        return False, "Audio file is too small to be a valid audio file."

    # Reading one byte proves the file can actually be opened and read.
    try:
        with open(file_path, "rb") as readability_probe:
            readability_probe.read(1)
    except IOError as read_error:
        return False, f"Audio file is not readable: {str(read_error)}"

    return True, None
74
+
75
def detect_audio_format_from_header(file_path):
    """Guess the audio format of a file from its first 32 bytes.

    Signatures are tested in a fixed order and the first match wins. When
    none matches, the file extension is used if it is a supported one.

    Returns:
        ``(format_code, None)`` when a format is determined,
        ``("unknown", message)`` when neither header nor extension
        identifies it, or ``(None, message)`` when the file is shorter than
        4 bytes or cannot be read.

    Note:
        Any file with ``ftyp`` (or another listed atom name) at offset 4 is
        reported as ``"m4a"``, and any EBML header as ``"webm"``, whatever
        the extension is.
    """
    try:
        with open(file_path, "rb") as audio_file:
            header = audio_file.read(32)

        if len(header) < 4:
            return None, "File is too small to determine audio format."

        mp3_frame_sync_prefixes = (
            b"\xff\xfb",
            b"\xff\xfa",
            b"\xff\xf3",
            b"\xff\xf2",
            b"\xff\xe0",
            b"\xff\xe2",
            b"\xff\xe3",
        )

        # Each rule: (format code, ((offset, magic or tuple of magics), ...)).
        # Every listed magic must be present at its offset. startswith() on a
        # header shorter than offset + len(magic) is simply False.
        signature_rules = (
            ("wav", ((0, b"RIFF"), (8, b"WAVE"))),
            ("mp3", ((0, b"ID3"),)),
            ("mp3", ((0, mp3_frame_sync_prefixes),)),
            ("flac", ((0, b"fLaC"),)),
            ("ogg", ((0, b"OggS"),)),
            ("aiff", ((0, b"FORM"), (8, (b"AIFF", b"AIFC")))),
            ("m4a", ((4, b"ftyp"),)),
            ("webm", ((0, b"\x1aE\xdf\xa3"),)),
            ("m4a", ((4, (b"mdat", b"moov", b"free", b"skip", b"wide")),)),
        )

        for format_code, required_magics in signature_rules:
            if all(header.startswith(magic, offset) for offset, magic in required_magics):
                return format_code, None

        # No recognisable signature: fall back to the file name.
        extension = get_audio_file_extension(file_path)
        if extension and extension in SUPPORTED_AUDIO_EXTENSIONS:
            return extension.lstrip("."), None

        return "unknown", "Could not determine audio format from file header. The file may be corrupted or in an unsupported format."

    except IOError as io_error:
        return None, f"Error reading file header: {str(io_error)}"

    except Exception as detection_error:
        return None, f"Unexpected error detecting audio format: {str(detection_error)}"
139
+
140
def validate_wav_file_structure(file_path):
    """Check that a WAV file has sane channel, width, rate and length values.

    The file is opened with the standard-library ``wave`` module. Accepted
    ranges: 1-16 channels, 1-4 byte samples, 100-384000 Hz, at least one
    frame, and a duration between 0.1 seconds and one hour.

    Args:
        file_path: Path of the WAV file to inspect.

    Returns:
        ``(True, None)`` when the file is acceptable, otherwise
        ``(False, reason)``.

    Note:
        ``wave`` only parses PCM data. A file it rejects as IEEE float
        (format tag 3) or WAVE_FORMAT_EXTENSIBLE (tag 65534) is accepted
        here without the range checks, because the converter reads WAV files
        with ``scipy.io.wavfile`` and has a dedicated float branch; it will
        report its own error if it cannot read the file.
    """
    try:
        with wave.open(file_path, "rb") as wav_file:
            number_of_channels = wav_file.getnchannels()
            sample_width_bytes = wav_file.getsampwidth()
            sample_rate = wav_file.getframerate()
            number_of_frames = wav_file.getnframes()

            if number_of_channels < 1:
                return False, "WAV file has no audio channels."

            if number_of_channels > 16:
                return False, f"WAV file has too many channels ({number_of_channels}). Maximum supported is 16."

            if sample_width_bytes < 1:
                return False, "WAV file has invalid sample width (less than 1 byte)."

            if sample_width_bytes > 4:
                return False, f"WAV file has unsupported sample width ({sample_width_bytes} bytes). Maximum supported is 4 bytes (32-bit)."

            if sample_rate < 100:
                return False, f"WAV file has invalid sample rate ({sample_rate} Hz). Minimum supported is 100 Hz."

            if sample_rate > 384000:
                return False, f"WAV file has unsupported sample rate ({sample_rate} Hz). Maximum supported is 384000 Hz."

            if number_of_frames < 1:
                return False, "WAV file contains no audio frames."

            audio_duration_seconds = number_of_frames / sample_rate

            if audio_duration_seconds < 0.1:
                return False, f"Audio is too short ({audio_duration_seconds:.2f} seconds). Minimum duration is 0.1 seconds."

            if audio_duration_seconds > 3600:
                return False, f"Audio is too long ({audio_duration_seconds:.0f} seconds). Maximum duration is 1 hour."

            return True, None

    except wave.Error as wav_error:
        error_message = str(wav_error)

        if "file does not start with RIFF id" in error_message:
            return False, "File has .wav extension but is not a valid WAV file. It may be a different audio format renamed to .wav."

        if "unknown format" in error_message.lower():
            # wave reports "unknown format: <tag>". Let float/extensible files
            # through to the scipy-based converter; if the tag cannot be
            # parsed, keep rejecting as before.
            format_tag_text = error_message.rpartition(":")[2].strip()
            if format_tag_text.isdigit() and int(format_tag_text) in (3, 65534):
                return True, None
            return False, "WAV file uses an unsupported audio encoding format."

        return False, f"Invalid WAV file structure: {error_message}"

    except EOFError:
        return False, "WAV file is truncated or corrupted (unexpected end of file)."

    except Exception as validation_error:
        return False, f"Error validating WAV file: {str(validation_error)}"
195
+
196
def perform_comprehensive_audio_validation(file_path):
    """Run every validation stage on an uploaded audio file.

    Stages, in order: file existence/readability, supported extension,
    header-based format detection and, for WAV files, structure validation.

    Returns:
        ``(is_valid, is_wav_format, detected_format, error_message)``.
        ``error_message`` is ``None`` when the file is valid. Failures before
        or during detection report ``(False, False, None, message)``; a WAV
        structure failure reports ``(False, True, "wav", message)``.
    """
    is_readable, readability_error = validate_file_exists_and_readable(file_path)
    if not is_readable:
        return False, False, None, readability_error

    if not is_supported_audio_extension(file_path):
        rejected_extension = get_audio_file_extension(file_path)
        supported_formats_list = ", ".join(SUPPORTED_AUDIO_EXTENSIONS)
        return False, False, None, f"Unsupported file format '{rejected_extension}'. Supported formats are: {supported_formats_list}"

    detected_format, detection_error = detect_audio_format_from_header(file_path)
    if detected_format is None:
        return False, False, None, detection_error

    if detected_format != "wav":
        return True, False, detected_format, None

    # Only WAV files get the extra structural inspection.
    structure_is_valid, structure_error = validate_wav_file_structure(file_path)
    if not structure_is_valid:
        return False, True, "wav", structure_error

    return True, True, detected_format, None
222
+
223
def get_format_display_name(format_code):
    """Return the display name for ``format_code``.

    ``None`` maps to ``"Unknown"``. Codes in ``FORMAT_DISPLAY_NAMES`` use
    their table entry; any other code is upper-cased.
    """
    if format_code is None:
        return "Unknown"

    try:
        return FORMAT_DISPLAY_NAMES[format_code]
    except KeyError:
        return format_code.upper()
src/core/authentication.py CHANGED
@@ -10,14 +10,14 @@ def authenticate_huggingface():
10
  if HF_TOKEN:
11
  try:
12
  login(token=HF_TOKEN, add_to_git_credential=False)
13
- print("Authenticated with Hugging Face")
14
 
15
  except Exception as authentication_error:
16
- print(f"Hugging Face authentication failed: {authentication_error}")
17
- print("Voice cloning may not be available")
18
 
19
  else:
20
- print("Missing Hugging Face authentication required for the license agreement")
21
 
22
  def get_huggingface_token():
23
  return HF_TOKEN
 
10
  if HF_TOKEN:
11
  try:
12
  login(token=HF_TOKEN, add_to_git_credential=False)
13
+ print("Authenticated with Hugging Face", flush=True)
14
 
15
  except Exception as authentication_error:
16
+ print(f"Hugging Face authentication failed: {authentication_error}", flush=True)
17
+ print("Voice cloning may not be available", flush=True)
18
 
19
  else:
20
+ print("Missing Hugging Face authentication required for the license agreement", flush=True)
21
 
22
  def get_huggingface_token():
23
  return HF_TOKEN
src/generation/handler.py CHANGED
@@ -20,6 +20,11 @@ from ..core.memory import (
20
  )
21
  from ..tts.manager import text_to_speech_manager
22
  from ..validation.text import validate_text_input
 
 
 
 
 
23
 
24
  def check_if_generating():
25
  from ..core.state import is_currently_generating
@@ -30,6 +35,56 @@ def request_generation_stop():
30
  set_stop_generation_requested(True)
31
  return gr.update(interactive=False)
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def perform_speech_generation(
34
  text_input,
35
  voice_mode_selection,
@@ -56,12 +111,26 @@ def perform_speech_generation(
56
  raise gr.Error(validation_result)
57
  raise gr.Error("Please enter valid text to generate speech.")
58
 
 
 
 
 
59
  if voice_mode_selection == VOICE_MODE_CLONE:
60
  if not voice_clone_audio_file:
61
  raise gr.Error("Please upload an audio file for voice cloning.")
 
62
  if not get_huggingface_token():
63
  raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")
64
 
 
 
 
 
 
 
 
 
 
65
  with generation_state_lock:
66
  if global_state.is_currently_generating:
67
  raise gr.Error("A generation is already in progress. Please wait.")
@@ -85,7 +154,10 @@ def perform_speech_generation(
85
  return None
86
 
87
  if voice_mode_selection == VOICE_MODE_CLONE:
88
- cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(voice_clone_audio_file)
 
 
 
89
  voice_state = cloned_voice_state_tensor
90
  else:
91
  voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)
@@ -116,7 +188,15 @@ def perform_speech_generation(
116
  raise gr.Error(str(runtime_error))
117
 
118
  except Exception as generation_error:
119
- raise gr.Error(f"Speech generation failed: {str(generation_error)}")
 
 
 
 
 
 
 
 
120
 
121
  finally:
122
  with generation_state_lock:
 
20
  )
21
  from ..tts.manager import text_to_speech_manager
22
  from ..validation.text import validate_text_input
23
+ from ..audio.validator import (
24
+ perform_comprehensive_audio_validation,
25
+ get_format_display_name
26
+ )
27
+ from ..audio.converter import prepare_audio_file_for_voice_cloning
28
 
29
  def check_if_generating():
30
  from ..core.state import is_currently_generating
 
35
  set_stop_generation_requested(True)
36
  return gr.update(interactive=False)
37
 
38
def validate_and_prepare_voice_clone_audio(voice_clone_audio_file):
    """Validate an uploaded voice sample and convert it for voice cloning.

    Returns:
        ``(prepared_path, error_message, was_converted, format_code)``.
        On success ``error_message`` is ``None`` and ``was_converted`` is
        ``False`` for WAV input and ``True`` for any other format. On failure
        ``prepared_path`` and ``was_converted`` are ``None`` and
        ``error_message`` is a user-facing explanation.
    """
    if not voice_clone_audio_file:
        return None, "Please upload an audio file for voice cloning.", None, None

    is_valid, is_wav_format, detected_format, validation_error = perform_comprehensive_audio_validation(voice_clone_audio_file)

    if not is_valid:
        if not validation_error:
            return None, "The uploaded file could not be validated as a valid audio file.", None, detected_format

        format_display_name = get_format_display_name(detected_format) if detected_format else "Unknown"
        lowered_error = validation_error.lower()

        # Map the validator's wording onto user-facing messages; the first
        # matching keyword decides.
        if "too short" in lowered_error:
            user_message = "The uploaded audio file is too short. Please upload a longer audio sample for better voice cloning results."
        elif "too long" in lowered_error:
            user_message = "The uploaded audio file is too long. Please upload a shorter audio sample (maximum 1 hour)."
        elif "empty" in lowered_error or "0 bytes" in lowered_error:
            user_message = "The uploaded audio file is empty. Please upload a valid audio file."
        elif "corrupted" in lowered_error or "truncated" in lowered_error:
            user_message = f"The uploaded {format_display_name} file appears to be corrupted or incomplete. Please upload a valid audio file."
        elif "unsupported" in lowered_error:
            user_message = validation_error
        else:
            user_message = f"Invalid audio file: {validation_error}"

        return None, user_message, None, detected_format

    format_display_name = get_format_display_name(detected_format)

    # Both WAV and non-WAV uploads go through the same preparation call; only
    # the reporting differs.
    prepared_path, preparation_error, _, _ = prepare_audio_file_for_voice_cloning(voice_clone_audio_file)

    if is_wav_format:
        if prepared_path is None:
            return None, f"Failed to process WAV file: {preparation_error}", None, 'wav'
        return prepared_path, None, False, 'wav'

    if prepared_path is not None:
        return prepared_path, None, True, detected_format

    if "no audio conversion library" in preparation_error.lower():
        return None, f"Cannot convert {format_display_name} format. Please upload a WAV file directly.", None, detected_format

    return None, f"Failed to convert {format_display_name} to WAV format: {preparation_error}", None, detected_format
87
+
88
  def perform_speech_generation(
89
  text_input,
90
  voice_mode_selection,
 
111
  raise gr.Error(validation_result)
112
  raise gr.Error("Please enter valid text to generate speech.")
113
 
114
+ prepared_audio_path = None
115
+ was_audio_converted = False
116
+ original_audio_format = None
117
+
118
  if voice_mode_selection == VOICE_MODE_CLONE:
119
  if not voice_clone_audio_file:
120
  raise gr.Error("Please upload an audio file for voice cloning.")
121
+
122
  if not get_huggingface_token():
123
  raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")
124
 
125
+ prepared_audio_path, audio_error, was_audio_converted, original_audio_format = validate_and_prepare_voice_clone_audio(voice_clone_audio_file)
126
+
127
+ if prepared_audio_path is None:
128
+ raise gr.Error(audio_error)
129
+
130
+ if was_audio_converted:
131
+ format_display_name = get_format_display_name(original_audio_format)
132
+ gr.Warning(f"Audio converted from {format_display_name} to WAV format for voice cloning.")
133
+
134
  with generation_state_lock:
135
  if global_state.is_currently_generating:
136
  raise gr.Error("A generation is already in progress. Please wait.")
 
154
  return None
155
 
156
  if voice_mode_selection == VOICE_MODE_CLONE:
157
+ cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(
158
+ voice_clone_audio_file,
159
+ prepared_audio_path=prepared_audio_path
160
+ )
161
  voice_state = cloned_voice_state_tensor
162
  else:
163
  voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)
 
188
  raise gr.Error(str(runtime_error))
189
 
190
  except Exception as generation_error:
191
+ error_message = str(generation_error)
192
+
193
+ if "file does not start with RIFF id" in error_message:
194
+ raise gr.Error("The audio file format is not supported. Please upload a valid WAV file or a common audio format (MP3, FLAC, OGG, M4A).")
195
+
196
+ if "unknown format" in error_message.lower():
197
+ raise gr.Error("The audio file uses an unsupported encoding format. Please convert it to a standard format and try again.")
198
+
199
+ raise gr.Error(f"Speech generation failed: {error_message}")
200
 
201
  finally:
202
  with generation_state_lock:
src/tts/manager.py CHANGED
@@ -31,7 +31,6 @@ from ..core.memory import (
31
  trigger_background_cleanup_check,
32
  is_memory_usage_approaching_limit
33
  )
34
- from ..audio.converter import convert_audio_to_pcm_wav
35
 
36
  class TextToSpeechManager:
37
  def __init__(self):
@@ -178,15 +177,15 @@ class TextToSpeechManager:
178
 
179
  return self.voice_state_cache[validated_voice]
180
 
181
- def get_voice_state_for_clone(self, audio_file_path):
182
  with self.model_lock:
183
  if self.loaded_model is None:
184
  raise RuntimeError("TTS model is not loaded. Please try again.")
185
 
186
- converted_audio_path = convert_audio_to_pcm_wav(audio_file_path)
187
 
188
  return self.loaded_model.get_state_for_audio_prompt(
189
- audio_conditioning=converted_audio_path,
190
  truncate=False
191
  )
192
 
 
31
  trigger_background_cleanup_check,
32
  is_memory_usage_approaching_limit
33
  )
 
34
 
35
  class TextToSpeechManager:
36
  def __init__(self):
 
177
 
178
  return self.voice_state_cache[validated_voice]
179
 
180
+ def get_voice_state_for_clone(self, audio_file_path, prepared_audio_path=None):
181
  with self.model_lock:
182
  if self.loaded_model is None:
183
  raise RuntimeError("TTS model is not loaded. Please try again.")
184
 
185
+ audio_path_to_use = prepared_audio_path if prepared_audio_path is not None else audio_file_path
186
 
187
  return self.loaded_model.get_state_for_audio_prompt(
188
+ audio_conditioning=audio_path_to_use,
189
  truncate=False
190
  )
191