tts

Sleeping

App Files Files Community

hadadrjt committed on Jan 19

Commit

dae9fa5

1 Parent(s): 5da0109

[3/?] Pocket TTS: Handle multiple format extensions for voice cloning.

Browse files

Files changed (6) hide show

config.py +25 -0
src/audio/converter.py +196 -18
src/audio/validator.py +230 -0
src/core/authentication.py +4 -4
src/generation/handler.py +82 -2
src/tts/manager.py +3 -4

config.py CHANGED Viewed

@@ -44,6 +44,31 @@ MEMORY_CRITICAL_THRESHOLD = int(0.85 * MAXIMUM_MEMORY_USAGE)
 MEMORY_CHECK_INTERVAL = 30
 MEMORY_IDLE_TARGET = int(0.5 * MAXIMUM_MEMORY_USAGE)
 EXAMPLE_PROMPTS = [
     {
         "text": "The quick brown fox jumps over the lazy dog near the riverbank.",

 MEMORY_CHECK_INTERVAL = 30
 MEMORY_IDLE_TARGET = int(0.5 * MAXIMUM_MEMORY_USAGE)
+# File extensions (lower-case, with the leading dot) accepted for uploaded
+# audio. src/audio/validator.py tests the lower-cased extension of an upload
+# for membership in this list and builds its display-name table from it.
+SUPPORTED_AUDIO_EXTENSIONS = [
+    ".wav",
+    ".mp3",
+    ".flac",
+    ".ogg",
+    ".m4a",
+    ".aac",
+    ".wma",
+    ".aiff",
+    ".aif",
+    ".opus",
+    ".webm",
+    ".mp4",
+    ".mkv",
+    ".avi",
+    ".mov",
+    ".3gp"
+]
+# Display names keyed by extension without the dot. Formats not listed here
+# are shown as their upper-cased extension by src/audio/validator.py.
+AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES = {
+    "m4a": "M4A/AAC",
+    "aif": "AIFF",
+    "3gp": "3GP"
+}
 EXAMPLE_PROMPTS = [
     {
         "text": "The quick brown fox jumps over the lazy dog near the riverbank.",

src/audio/converter.py CHANGED Viewed

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 import time
 import tempfile
 import numpy as np
@@ -10,33 +11,210 @@ import scipy.io.wavfile
 from ..core.state import temporary_files_registry, temporary_files_lock
 from ..core.memory import trigger_background_cleanup_check
-def convert_audio_to_pcm_wav(input_path):
     try:
         sample_rate, audio_data = scipy.io.wavfile.read(input_path)
-        if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
-            audio_data = np.clip(audio_data, -1.0, 1.0)
-            audio_data = (audio_data * 32767).astype(np.int16)
-        elif audio_data.dtype == np.int32:
-            audio_data = (audio_data >> 16).astype(np.int16)
-        elif audio_data.dtype == np.uint8:
-            audio_data = ((audio_data.astype(np.int16) - 128) * 256).astype(np.int16)
-        elif audio_data.dtype != np.int16:
-            audio_data = audio_data.astype(np.int16)
-        output_file = tempfile.NamedTemporaryFile(suffix="_converted.wav", delete=False)
-        scipy.io.wavfile.write(output_file.name, sample_rate, audio_data)
-        with temporary_files_lock:
-            temporary_files_registry[output_file.name] = time.time()
-        trigger_background_cleanup_check()
-        return output_file.name
     except Exception as conversion_error:
-        print(f"Warning: {conversion_error}")
-        return input_path

 # SPDX-License-Identifier: Apache-2.0
 #
+import os
 import time
 import tempfile
 import numpy as np
 from ..core.state import temporary_files_registry, temporary_files_lock
 from ..core.memory import trigger_background_cleanup_check
+def convert_audio_data_to_pcm_int16(audio_data):
+    """Convert a NumPy array of audio samples to 16-bit signed PCM.
+
+    Floating-point samples of any width are treated as lying in
+    [-1.0, 1.0]: NaN becomes silence, out-of-range values are clipped,
+    and the result is scaled by 32767. int32 and int64 samples keep their
+    16 most significant bits, uint8 samples are re-centred around zero,
+    and int16 input is returned unchanged. Any other dtype is cast
+    directly with ``astype``.
+
+    Args:
+        audio_data: NumPy array of samples.
+
+    Returns:
+        A NumPy array of dtype int16 (the input itself when it already is).
+    """
+    sample_dtype = audio_data.dtype
+    if sample_dtype == np.int16:
+        return audio_data
+    if np.issubdtype(sample_dtype, np.floating):
+        if sample_dtype != np.float32 and sample_dtype != np.float64:
+            # float16 cannot represent 32767 exactly, so scaling in place
+            # would overflow int16 at full scale; widen before scaling.
+            audio_data = audio_data.astype(np.float64)
+        # Casting NaN to an integer type is undefined, so map it to silence.
+        finite_audio = np.nan_to_num(audio_data, nan=0.0)
+        clipped_audio = np.clip(finite_audio, -1.0, 1.0)
+        return (clipped_audio * 32767).astype(np.int16)
+    if sample_dtype == np.int32:
+        return (audio_data >> 16).astype(np.int16)
+    if sample_dtype == np.int64:
+        return (audio_data >> 48).astype(np.int16)
+    if sample_dtype == np.uint8:
+        # Unsigned 8-bit PCM is centred on 128.
+        return ((audio_data.astype(np.int16) - 128) * 256).astype(np.int16)
+    return audio_data.astype(np.int16)
+def convert_stereo_to_mono(audio_data):
+    """Collapse a 2-D multi-channel sample array into one channel.
+
+    The longer axis is taken to be the sample axis, so the array is
+    transposed when its first axis is the longer one. Channels are then
+    averaged and the result is cast back to the input dtype. Arrays that
+    are not 2-D are returned unchanged.
+
+    Args:
+        audio_data: NumPy array of samples.
+
+    Returns:
+        A 1-D array for 2-D input, otherwise ``audio_data`` itself.
+    """
+    if audio_data.ndim != 2:
+        return audio_data
+    first_axis, second_axis = audio_data.shape
+    channels_first = audio_data.T if first_axis > second_axis else audio_data
+    if channels_first.shape[0] <= 1:
+        # A single channel needs no averaging.
+        return channels_first[0]
+    averaged = channels_first.mean(axis=0)
+    return averaged.astype(channels_first.dtype)
+def register_temporary_file(file_path):
+    """Record *file_path* in the temporary-file registry.
+
+    The current time is stored against the path while holding
+    ``temporary_files_lock``; a background cleanup check is triggered
+    once the lock has been released.
+    """
+    with temporary_files_lock:
+        registered_at = time.time()
+        temporary_files_registry[file_path] = registered_at
+    trigger_background_cleanup_check()
+def convert_wav_file_to_pcm_format(input_path):
+    """Rewrite a WAV file as a mono 16-bit PCM WAV in a temporary file.
+
+    Args:
+        input_path: Path of a WAV file readable by ``scipy.io.wavfile``.
+
+    Returns:
+        ``(output_path, None)`` on success, or ``(None, error_message)``
+        when reading, converting or writing fails.
+    """
     try:
         sample_rate, audio_data = scipy.io.wavfile.read(input_path)
+        if len(audio_data.shape) > 1:
+            audio_data = convert_stereo_to_mono(audio_data)
+        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)
+        # Only the reserved name is needed. Close the handle immediately so
+        # the descriptor is not leaked and the file can be reopened by name.
+        with tempfile.NamedTemporaryFile(suffix="_pcm_converted.wav", delete=False) as output_file:
+            output_path = output_file.name
+        scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
+        register_temporary_file(output_path)
+        return output_path, None
+    except Exception as conversion_error:
+        return None, f"Failed to convert WAV to PCM format: {str(conversion_error)}"
+def convert_audio_using_pydub(input_path, target_sample_rate=None):
+    """Convert an audio file to a mono 16-bit WAV using pydub.
+
+    Args:
+        input_path: Path of the audio file to decode.
+        target_sample_rate: Optional frame rate to resample to; the source
+            rate is kept when ``None``.
+
+    Returns:
+        ``(output_path, None)`` on success. On failure ``(None, reason)``,
+        where ``reason`` is ``"pydub_library_not_available"`` if pydub
+        cannot be imported, ``"ffmpeg_not_available"`` if the error text
+        mentions ffmpeg or ffprobe, and a descriptive message otherwise.
+    """
+    try:
+        from pydub import AudioSegment
+        audio_segment = AudioSegment.from_file(input_path)
+        audio_segment = audio_segment.set_channels(1)
+        audio_segment = audio_segment.set_sample_width(2)
+        if target_sample_rate is not None:
+            audio_segment = audio_segment.set_frame_rate(target_sample_rate)
+        # Only the reserved name is needed. Close the handle immediately so
+        # the descriptor is not leaked and the file can be reopened by name.
+        with tempfile.NamedTemporaryFile(suffix="_pydub_converted.wav", delete=False) as output_file:
+            output_path = output_file.name
+        audio_segment.export(output_path, format="wav")
+        register_temporary_file(output_path)
+        return output_path, None
+    except ImportError:
+        return None, "pydub_library_not_available"
+    except Exception as conversion_error:
+        error_message = str(conversion_error)
+        if "ffmpeg" in error_message.lower() or "ffprobe" in error_message.lower():
+            return None, "ffmpeg_not_available"
+        return None, f"Failed to convert audio using pydub: {error_message}"
+def convert_audio_using_soundfile(input_path):
+    """Convert an audio file to a mono 16-bit WAV using soundfile.
+
+    The file is decoded as float32, multi-channel audio is averaged into
+    one channel, and the result is written with ``scipy.io.wavfile``.
+
+    Args:
+        input_path: Path of the audio file to decode.
+
+    Returns:
+        ``(output_path, None)`` on success. On failure ``(None, reason)``,
+        where ``reason`` is ``"soundfile_library_not_available"`` if
+        soundfile cannot be imported and a descriptive message otherwise.
+    """
+    try:
+        import soundfile
+        audio_data, sample_rate = soundfile.read(input_path, dtype='float32')
+        if len(audio_data.shape) > 1:
+            audio_data = np.mean(audio_data, axis=1)
+        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)
+        # Only the reserved name is needed. Close the handle immediately so
+        # the descriptor is not leaked and the file can be reopened by name.
+        with tempfile.NamedTemporaryFile(suffix="_soundfile_converted.wav", delete=False) as output_file:
+            output_path = output_file.name
+        scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
+        register_temporary_file(output_path)
+        return output_path, None
+    except ImportError:
+        return None, "soundfile_library_not_available"
+    except Exception as conversion_error:
+        return None, f"Failed to convert audio using soundfile: {str(conversion_error)}"
+def convert_audio_using_librosa(input_path):
+    """Convert an audio file to a mono 16-bit WAV using librosa.
+
+    The file is loaded as mono at its native sample rate (``sr=None``)
+    and written with ``scipy.io.wavfile``.
+
+    Args:
+        input_path: Path of the audio file to decode.
+
+    Returns:
+        ``(output_path, None)`` on success. On failure ``(None, reason)``,
+        where ``reason`` is ``"librosa_library_not_available"`` if librosa
+        cannot be imported and a descriptive message otherwise.
+    """
+    try:
+        import librosa
+        audio_data, sample_rate = librosa.load(input_path, sr=None, mono=True)
+        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)
+        # Only the reserved name is needed. Close the handle immediately so
+        # the descriptor is not leaked and the file can be reopened by name.
+        with tempfile.NamedTemporaryFile(suffix="_librosa_converted.wav", delete=False) as output_file:
+            output_path = output_file.name
+        scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
+        register_temporary_file(output_path)
+        return output_path, None
+    except ImportError:
+        return None, "librosa_library_not_available"
     except Exception as conversion_error:
+        return None, f"Failed to convert audio using librosa: {str(conversion_error)}"
+def convert_non_wav_audio_to_wav(input_path):
+    """Convert a non-WAV file to WAV with the first backend that succeeds.
+
+    Backends are tried in the order pydub, soundfile, librosa.
+
+    Args:
+        input_path: Path of the audio file to convert.
+
+    Returns:
+        ``(converted_path, None, backend_name)`` on success, otherwise
+        ``(None, error_message, None)``. The message says that no
+        conversion library is available when every backend reported itself
+        unavailable; otherwise it joins the errors of the backends that
+        ran and failed.
+    """
+    # Each entry: backend name, converter, error codes meaning "not installed".
+    backends = (
+        ("pydub", convert_audio_using_pydub, ("pydub_library_not_available", "ffmpeg_not_available")),
+        ("soundfile", convert_audio_using_soundfile, ("soundfile_library_not_available",)),
+        ("librosa", convert_audio_using_librosa, ("librosa_library_not_available",)),
+    )
+    failures = []
+    for backend_name, backend_function, unavailable_codes in backends:
+        converted_path, backend_error = backend_function(input_path)
+        if converted_path is not None:
+            return converted_path, None, backend_name
+        failures.append((backend_name, backend_error, backend_error in unavailable_codes))
+    if all(is_unavailable for _, _, is_unavailable in failures):
+        return None, "No audio conversion library is available on the server. Please upload a WAV file directly.", None
+    error_details = [
+        f"{backend_name}: {backend_error}"
+        for backend_name, backend_error, is_unavailable in failures
+        if not is_unavailable and backend_error
+    ]
+    if not error_details:
+        return None, "Audio conversion failed. Please try uploading a different audio file or use WAV format.", None
+    combined_error = " | ".join(error_details)
+    return None, f"Audio conversion failed with all available methods. {combined_error}", None
+def prepare_audio_file_for_voice_cloning(input_path):
+    """Validate an uploaded audio file and produce a PCM WAV from it.
+
+    WAV input is rewritten as PCM directly. Any other format is first
+    converted to WAV and then normalised to PCM; if that second step
+    fails, the WAV produced by the conversion backend is returned instead.
+
+    Args:
+        input_path: Path of the uploaded audio file.
+
+    Returns:
+        ``(prepared_path, error_message, was_converted, format_code)``.
+        ``prepared_path`` is ``None`` on failure and ``error_message`` is
+        ``None`` on success. ``was_converted`` is ``True`` only for
+        non-WAV input that passed validation.
+    """
+    # Imported here rather than at module level, as in the original code.
+    from .validator import perform_comprehensive_audio_validation
+    is_valid, is_wav_format, detected_format, validation_error = perform_comprehensive_audio_validation(input_path)
+    if not is_valid:
+        return None, validation_error, False, detected_format
+    if is_wav_format:
+        converted_path, conversion_error = convert_wav_file_to_pcm_format(input_path)
+        if converted_path is not None:
+            return converted_path, None, False, 'wav'
+        return None, conversion_error, False, 'wav'
+    converted_path, conversion_error, _ = convert_non_wav_audio_to_wav(input_path)
+    if converted_path is None:
+        return None, conversion_error, True, detected_format
+    final_path, _ = convert_wav_file_to_pcm_format(converted_path)
+    if final_path is not None:
+        return final_path, None, True, detected_format
+    # PCM normalisation is best effort: keep the backend's WAV output.
+    return converted_path, None, True, detected_format
+def convert_audio_to_pcm_wav(input_path):
+    """Return a PCM WAV path for *input_path*, or *input_path* on failure.
+
+    Wraps ``prepare_audio_file_for_voice_cloning``. When preparation fails
+    with an error message, a warning is printed before the original path
+    is returned.
+    """
+    prepared_path, failure_reason, _, _ = prepare_audio_file_for_voice_cloning(input_path)
+    if prepared_path is None:
+        if failure_reason:
+            print(f"Warning: Audio conversion failed - {failure_reason}")
+        return input_path
+    return prepared_path

src/audio/validator.py ADDED Viewed

	@@ -0,0 +1,230 @@

+#
+# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
+# SPDX-License-Identifier: Apache-2.0
+#
+import os
+import wave
+from config import (
+    SUPPORTED_AUDIO_EXTENSIONS,
+    AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES
+)
+def build_format_display_names_from_supported_extensions():
+    """Build the mapping from format code to human-readable name.
+
+    A format code is a supported extension with its leading dots removed.
+    Its name is the configured override when one exists, otherwise the
+    upper-cased code. An ``"unknown"`` entry is always added.
+
+    Returns:
+        dict mapping format code to display name.
+    """
+    format_codes = (extension.lstrip(".") for extension in SUPPORTED_AUDIO_EXTENSIONS)
+    display_names = {
+        format_code: AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES.get(format_code, format_code.upper())
+        for format_code in format_codes
+    }
+    display_names["unknown"] = "Unknown"
+    return display_names
+# Built once at import time; read by get_format_display_name.
+FORMAT_DISPLAY_NAMES = build_format_display_names_from_supported_extensions()
+def get_audio_file_extension(file_path):
+    """Return the lower-cased extension of *file_path*, dot included.
+
+    Returns ``None`` for an empty or missing path and ``""`` when the
+    path has no extension.
+    """
+    if file_path:
+        return os.path.splitext(file_path)[1].lower()
+    return None
+def is_supported_audio_extension(file_path):
+    """Return True when the extension of *file_path* is a supported one.
+
+    An empty or missing path is reported as unsupported.
+    """
+    extension = get_audio_file_extension(file_path)
+    return extension is not None and extension in SUPPORTED_AUDIO_EXTENSIONS
+def validate_file_exists_and_readable(file_path):
+    """Check that *file_path* names a readable file of at least 44 bytes.
+
+    Args:
+        file_path: Path of the uploaded file; may be empty or ``None``.
+
+    Returns:
+        ``(True, None)`` when every check passes, otherwise
+        ``(False, reason)`` for the first check that fails.
+    """
+    failure_reason = None
+    if not file_path:
+        failure_reason = "No audio file provided."
+    elif not os.path.exists(file_path):
+        failure_reason = "Audio file does not exist."
+    elif not os.path.isfile(file_path):
+        failure_reason = "The provided path is not a valid file."
+    else:
+        try:
+            byte_count = os.path.getsize(file_path)
+        except OSError as size_error:
+            return False, f"Cannot read file size: {str(size_error)}"
+        if byte_count == 0:
+            failure_reason = "Audio file is empty (0 bytes)."
+        elif byte_count < 44:
+            failure_reason = "Audio file is too small to be a valid audio file."
+        else:
+            # Read one byte to confirm the file can actually be opened.
+            try:
+                with open(file_path, "rb") as probe:
+                    probe.read(1)
+            except IOError as read_error:
+                failure_reason = f"Audio file is not readable: {str(read_error)}"
+    if failure_reason is not None:
+        return False, failure_reason
+    return True, None
+def detect_audio_format_from_header(file_path):
+    """Identify an audio file's format from its first 32 bytes.
+
+    Signatures are checked in this order: RIFF/WAVE, MP3 (ID3 tag or
+    frame sync), FLAC, Ogg, AIFF, ``ftyp`` box, EBML (Matroska/WebM),
+    then bare ``mdat``/``moov``/``free``/``skip``/``wide`` atoms. Container
+    signatures are shared by several extensions, so the file extension
+    picks the name within that family (for example ``mp4`` or ``3gp``
+    rather than always ``m4a``). If no signature matches, a supported file
+    extension is used as the answer.
+
+    Args:
+        file_path: Path of the file to inspect.
+
+    Returns:
+        ``(format_code, None)`` when a format is determined,
+        ``("unknown", message)`` when nothing matches, and
+        ``(None, message)`` when the file is shorter than 4 bytes or
+        cannot be read.
+    """
+    mp3_sync_bytes = (
+        b"\xff\xfb",
+        b"\xff\xfa",
+        b"\xff\xf3",
+        b"\xff\xf2",
+        b"\xff\xe0",
+        b"\xff\xe2",
+        b"\xff\xe3"
+    )
+    iso_media_extensions = (".m4a", ".mp4", ".mov", ".3gp")
+    matroska_extensions = (".webm", ".mkv")
+    def container_format(default_code, family_extensions):
+        # The signature only identifies the container family; use the
+        # extension to name the member, falling back to the family default.
+        extension = os.path.splitext(file_path)[1].lower()
+        if extension in family_extensions:
+            return extension.lstrip(".")
+        return default_code
+    try:
+        with open(file_path, "rb") as audio_file:
+            header_bytes = audio_file.read(32)
+        if len(header_bytes) < 4:
+            return None, "File is too small to determine audio format."
+        # Slices past the end of a short header are simply shorter and
+        # cannot equal the signatures, so no extra length guards are needed.
+        if header_bytes[:4] == b"RIFF" and header_bytes[8:12] == b"WAVE":
+            return "wav", None
+        if header_bytes[:3] == b"ID3" or header_bytes[:2] in mp3_sync_bytes:
+            return "mp3", None
+        if header_bytes[:4] == b"fLaC":
+            return "flac", None
+        if header_bytes[:4] == b"OggS":
+            return "ogg", None
+        if header_bytes[:4] == b"FORM" and header_bytes[8:12] in (b"AIFF", b"AIFC"):
+            return "aiff", None
+        if header_bytes[4:8] == b"ftyp":
+            return container_format("m4a", iso_media_extensions), None
+        if header_bytes[:4] == b"\x1aE\xdf\xa3":
+            return container_format("webm", matroska_extensions), None
+        if header_bytes[4:8] in (b"mdat", b"moov", b"free", b"skip", b"wide"):
+            return container_format("m4a", iso_media_extensions), None
+        file_extension = get_audio_file_extension(file_path)
+        if file_extension and file_extension in SUPPORTED_AUDIO_EXTENSIONS:
+            return file_extension.lstrip("."), None
+        return "unknown", "Could not determine audio format from file header. The file may be corrupted or in an unsupported format."
+    except IOError as io_error:
+        return None, f"Error reading file header: {str(io_error)}"
+    except Exception as detection_error:
+        return None, f"Unexpected error detecting audio format: {str(detection_error)}"
+def validate_wav_file_structure(file_path):
+    """Check a WAV file's parameters using the standard ``wave`` module.
+
+    Accepted ranges: 1-16 channels, 1-4 bytes per sample, 100-384000 Hz,
+    at least one frame, and a duration between 0.1 seconds and 1 hour.
+
+    Args:
+        file_path: Path of the WAV file to inspect.
+
+    Returns:
+        ``(True, None)`` when every check passes, otherwise
+        ``(False, reason)``, including when ``wave`` cannot parse the file.
+    """
+    try:
+        with wave.open(file_path, "rb") as wav_reader:
+            channel_count = wav_reader.getnchannels()
+            bytes_per_sample = wav_reader.getsampwidth()
+            frames_per_second = wav_reader.getframerate()
+            frame_count = wav_reader.getnframes()
+    except wave.Error as wav_error:
+        error_message = str(wav_error)
+        if "file does not start with RIFF id" in error_message:
+            return False, "File has .wav extension but is not a valid WAV file. It may be a different audio format renamed to .wav."
+        if "unknown format" in error_message.lower():
+            return False, "WAV file uses an unsupported audio encoding format."
+        return False, f"Invalid WAV file structure: {error_message}"
+    except EOFError:
+        return False, "WAV file is truncated or corrupted (unexpected end of file)."
+    except Exception as validation_error:
+        return False, f"Error validating WAV file: {str(validation_error)}"
+    if channel_count < 1:
+        return False, "WAV file has no audio channels."
+    if channel_count > 16:
+        return False, f"WAV file has too many channels ({channel_count}). Maximum supported is 16."
+    if bytes_per_sample < 1:
+        return False, "WAV file has invalid sample width (less than 1 byte)."
+    if bytes_per_sample > 4:
+        return False, f"WAV file has unsupported sample width ({bytes_per_sample} bytes). Maximum supported is 4 bytes (32-bit)."
+    if frames_per_second < 100:
+        return False, f"WAV file has invalid sample rate ({frames_per_second} Hz). Minimum supported is 100 Hz."
+    if frames_per_second > 384000:
+        return False, f"WAV file has unsupported sample rate ({frames_per_second} Hz). Maximum supported is 384000 Hz."
+    if frame_count < 1:
+        return False, "WAV file contains no audio frames."
+    # The sample rate is at least 100 here, so the division is safe.
+    duration_seconds = frame_count / frames_per_second
+    if duration_seconds < 0.1:
+        return False, f"Audio is too short ({duration_seconds:.2f} seconds). Minimum duration is 0.1 seconds."
+    if duration_seconds > 3600:
+        return False, f"Audio is too long ({duration_seconds:.0f} seconds). Maximum duration is 1 hour."
+    return True, None
+def perform_comprehensive_audio_validation(file_path):
+    """Run every validation step on an uploaded audio file.
+
+    Steps, in order: file existence and readability, supported extension,
+    format detection from the header, and, for WAV only, a structural
+    check of the WAV parameters.
+
+    Args:
+        file_path: Path of the uploaded file.
+
+    Returns:
+        ``(is_valid, is_wav_format, detected_format, error_message)``.
+        ``error_message`` is ``None`` when the file is valid.
+    """
+    readable, readability_error = validate_file_exists_and_readable(file_path)
+    if not readable:
+        return False, False, None, readability_error
+    if not is_supported_audio_extension(file_path):
+        file_extension = get_audio_file_extension(file_path)
+        supported_formats_list = ", ".join(SUPPORTED_AUDIO_EXTENSIONS)
+        return False, False, None, f"Unsupported file format '{file_extension}'. Supported formats are: {supported_formats_list}"
+    detected_format, detection_error = detect_audio_format_from_header(file_path)
+    if detected_format is None:
+        return False, False, None, detection_error
+    if detected_format != "wav":
+        return True, False, detected_format, None
+    structure_ok, structure_error = validate_wav_file_structure(file_path)
+    if not structure_ok:
+        return False, True, "wav", structure_error
+    return True, True, detected_format, None
+def get_format_display_name(format_code):
+    """Return the display name for *format_code*.
+
+    ``None`` maps to ``"Unknown"``. Codes missing from
+    ``FORMAT_DISPLAY_NAMES`` are shown upper-cased.
+    """
+    if format_code is None:
+        return "Unknown"
+    try:
+        return FORMAT_DISPLAY_NAMES[format_code]
+    except KeyError:
+        return format_code.upper()

src/core/authentication.py CHANGED Viewed

@@ -10,14 +10,14 @@ def authenticate_huggingface():
     if HF_TOKEN:
         try:
             login(token=HF_TOKEN, add_to_git_credential=False)
-            print("Authenticated with Hugging Face")
         except Exception as authentication_error:
-            print(f"Hugging Face authentication failed: {authentication_error}")
-            print("Voice cloning may not be available")
     else:
-        print("Missing Hugging Face authentication required for the license agreement")
 def get_huggingface_token():
     return HF_TOKEN

     if HF_TOKEN:
         try:
             login(token=HF_TOKEN, add_to_git_credential=False)
+            print("Authenticated with Hugging Face", flush=True)
         except Exception as authentication_error:
+            print(f"Hugging Face authentication failed: {authentication_error}", flush=True)
+            print("Voice cloning may not be available", flush=True)
     else:
+        print("Missing Hugging Face authentication required for the license agreement", flush=True)
 def get_huggingface_token():
     return HF_TOKEN

src/generation/handler.py CHANGED Viewed

@@ -20,6 +20,11 @@ from ..core.memory import (
 )
 from ..tts.manager import text_to_speech_manager
 from ..validation.text import validate_text_input
 def check_if_generating():
     from ..core.state import is_currently_generating
@@ -30,6 +35,56 @@ def request_generation_stop():
     set_stop_generation_requested(True)
     return gr.update(interactive=False)
 def perform_speech_generation(
     text_input,
     voice_mode_selection,
@@ -56,12 +111,26 @@ def perform_speech_generation(
             raise gr.Error(validation_result)
         raise gr.Error("Please enter valid text to generate speech.")
     if voice_mode_selection == VOICE_MODE_CLONE:
         if not voice_clone_audio_file:
             raise gr.Error("Please upload an audio file for voice cloning.")
         if not get_huggingface_token():
             raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")
     with generation_state_lock:
         if global_state.is_currently_generating:
             raise gr.Error("A generation is already in progress. Please wait.")
@@ -85,7 +154,10 @@ def perform_speech_generation(
                 return None
         if voice_mode_selection == VOICE_MODE_CLONE:
-            cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(voice_clone_audio_file)
             voice_state = cloned_voice_state_tensor
         else:
             voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)
@@ -116,7 +188,15 @@ def perform_speech_generation(
         raise gr.Error(str(runtime_error))
     except Exception as generation_error:
-        raise gr.Error(f"Speech generation failed: {str(generation_error)}")
     finally:
         with generation_state_lock:

 )
 from ..tts.manager import text_to_speech_manager
 from ..validation.text import validate_text_input
+from ..audio.validator import (
+    perform_comprehensive_audio_validation,
+    get_format_display_name
+)
+from ..audio.converter import prepare_audio_file_for_voice_cloning
 def check_if_generating():
     from ..core.state import is_currently_generating
     set_stop_generation_requested(True)
     return gr.update(interactive=False)
+def validate_and_prepare_voice_clone_audio(voice_clone_audio_file):
+    """Validate an uploaded voice sample and prepare it for cloning.
+
+    Validation errors are rewritten into user-facing messages chosen by
+    keywords found in the validator's error text.
+
+    Args:
+        voice_clone_audio_file: Path of the uploaded audio file.
+
+    Returns:
+        ``(prepared_path, error_message, was_converted, format_code)``.
+        On failure ``prepared_path`` and ``was_converted`` are ``None``.
+        On success ``error_message`` is ``None`` and ``was_converted`` is
+        ``True`` only for non-WAV input.
+    """
+    if not voice_clone_audio_file:
+        return None, "Please upload an audio file for voice cloning.", None, None
+    is_valid, is_wav_format, detected_format, validation_error = perform_comprehensive_audio_validation(voice_clone_audio_file)
+    if not is_valid:
+        if not validation_error:
+            return None, "The uploaded file could not be validated as a valid audio file.", None, detected_format
+        lowered_error = validation_error.lower()
+        # Order matters: the first matching keyword decides the message.
+        if "too short" in lowered_error:
+            user_message = "The uploaded audio file is too short. Please upload a longer audio sample for better voice cloning results."
+        elif "too long" in lowered_error:
+            user_message = "The uploaded audio file is too long. Please upload a shorter audio sample (maximum 1 hour)."
+        elif "empty" in lowered_error or "0 bytes" in lowered_error:
+            user_message = "The uploaded audio file is empty. Please upload a valid audio file."
+        elif "corrupted" in lowered_error or "truncated" in lowered_error:
+            format_label = get_format_display_name(detected_format) if detected_format else "Unknown"
+            user_message = f"The uploaded {format_label} file appears to be corrupted or incomplete. Please upload a valid audio file."
+        elif "unsupported" in lowered_error:
+            user_message = validation_error
+        else:
+            user_message = f"Invalid audio file: {validation_error}"
+        return None, user_message, None, detected_format
+    prepared_path, preparation_error, _, _ = prepare_audio_file_for_voice_cloning(voice_clone_audio_file)
+    if is_wav_format:
+        if prepared_path is None:
+            return None, f"Failed to process WAV file: {preparation_error}", None, 'wav'
+        return prepared_path, None, False, 'wav'
+    if prepared_path is not None:
+        return prepared_path, None, True, detected_format
+    format_label = get_format_display_name(detected_format)
+    if "no audio conversion library" in preparation_error.lower():
+        return None, f"Cannot convert {format_label} format. Please upload a WAV file directly.", None, detected_format
+    return None, f"Failed to convert {format_label} to WAV format: {preparation_error}", None, detected_format
 def perform_speech_generation(
     text_input,
     voice_mode_selection,
             raise gr.Error(validation_result)
         raise gr.Error("Please enter valid text to generate speech.")
+    prepared_audio_path = None
+    was_audio_converted = False
+    original_audio_format = None
     if voice_mode_selection == VOICE_MODE_CLONE:
         if not voice_clone_audio_file:
             raise gr.Error("Please upload an audio file for voice cloning.")
         if not get_huggingface_token():
             raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")
+        prepared_audio_path, audio_error, was_audio_converted, original_audio_format = validate_and_prepare_voice_clone_audio(voice_clone_audio_file)
+        if prepared_audio_path is None:
+            raise gr.Error(audio_error)
+        if was_audio_converted:
+            format_display_name = get_format_display_name(original_audio_format)
+            gr.Warning(f"Audio converted from {format_display_name} to WAV format for voice cloning.")
     with generation_state_lock:
         if global_state.is_currently_generating:
             raise gr.Error("A generation is already in progress. Please wait.")
                 return None
         if voice_mode_selection == VOICE_MODE_CLONE:
+            cloned_voice_state_tensor = text_to_speech_manager.get_voice_state_for_clone(
+                voice_clone_audio_file,
+                prepared_audio_path=prepared_audio_path
+            )
             voice_state = cloned_voice_state_tensor
         else:
             voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)
         raise gr.Error(str(runtime_error))
     except Exception as generation_error:
+        error_message = str(generation_error)
+        if "file does not start with RIFF id" in error_message:
+            raise gr.Error("The audio file format is not supported. Please upload a valid WAV file or a common audio format (MP3, FLAC, OGG, M4A).")
+        if "unknown format" in error_message.lower():
+            raise gr.Error("The audio file uses an unsupported encoding format. Please convert it to a standard format and try again.")
+        raise gr.Error(f"Speech generation failed: {error_message}")
     finally:
         with generation_state_lock:

src/tts/manager.py CHANGED Viewed

@@ -31,7 +31,6 @@ from ..core.memory import (
     trigger_background_cleanup_check,
     is_memory_usage_approaching_limit
 )
-from ..audio.converter import convert_audio_to_pcm_wav
 class TextToSpeechManager:
     def __init__(self):
@@ -178,15 +177,15 @@ class TextToSpeechManager:
         return self.voice_state_cache[validated_voice]
-    def get_voice_state_for_clone(self, audio_file_path):
         with self.model_lock:
             if self.loaded_model is None:
                 raise RuntimeError("TTS model is not loaded. Please try again.")
-        converted_audio_path = convert_audio_to_pcm_wav(audio_file_path)
         return self.loaded_model.get_state_for_audio_prompt(
-            audio_conditioning=converted_audio_path,
             truncate=False
         )

     trigger_background_cleanup_check,
     is_memory_usage_approaching_limit
 )
 class TextToSpeechManager:
     def __init__(self):
         return self.voice_state_cache[validated_voice]
+    def get_voice_state_for_clone(self, audio_file_path, prepared_audio_path=None):
+        """Return the loaded model's state for an audio prompt.
+
+        Passes ``prepared_audio_path`` as the audio conditioning when it
+        is given, otherwise ``audio_file_path``, with ``truncate=False``.
+
+        Raises:
+            RuntimeError: if no model is loaded.
+        """
+        # NOTE(review): the model is checked under model_lock but used
+        # after the lock is released; confirm this is intended.
         with self.model_lock:
             if self.loaded_model is None:
                 raise RuntimeError("TTS model is not loaded. Please try again.")
+        audio_path_to_use = prepared_audio_path if prepared_audio_path is not None else audio_file_path
         return self.loaded_model.get_state_for_audio_prompt(
+            audio_conditioning=audio_path_to_use,
             truncate=False
         )