# Source path: tts/src/audio/converter.py
# Last commit (hadadrjt, dae9fa5): [3/?] Pocket TTS: Handle multiple format extensions for voice cloning.
#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
import os
import time
import tempfile
import numpy as np
import scipy.io.wavfile
from ..core.state import temporary_files_registry, temporary_files_lock
from ..core.memory import trigger_background_cleanup_check
def convert_audio_data_to_pcm_int16(audio_data):
    """Convert a NumPy array of audio samples to 16-bit signed PCM.

    Args:
        audio_data: NumPy array of samples. Handling depends on its dtype:
            * any floating dtype -- treated as normalized audio: NaN is
              replaced by 0, values are clipped to [-1.0, 1.0] and scaled
              by 32767;
            * ``int32`` / ``int64`` -- the top 16 bits are kept
              (arithmetic right shift by 16 / 48);
            * ``uint8`` -- re-centred around zero (minus 128) and scaled
              by 256;
            * ``int16`` -- returned unchanged (same object, no copy);
            * anything else -- plain ``astype(np.int16)`` cast.

    Returns:
        A NumPy array with dtype ``int16``.
    """
    sample_dtype = audio_data.dtype

    if sample_dtype == np.int16:
        return audio_data

    if np.issubdtype(sample_dtype, np.floating):
        # Half precision cannot represent 32767 exactly (it rounds to 32768),
        # which would overflow int16 at full scale, so widen it first.
        if sample_dtype.itemsize < 4:
            audio_data = audio_data.astype(np.float32)
        # Casting NaN to an integer is undefined; map it to silence instead.
        # Infinities become huge finite values here and are clipped below.
        finite_samples = np.nan_to_num(audio_data, nan=0.0)
        clipped_samples = np.clip(finite_samples, -1.0, 1.0)
        return (clipped_samples * 32767).astype(np.int16)

    if sample_dtype == np.int32:
        return (audio_data >> 16).astype(np.int16)

    if sample_dtype == np.int64:
        return (audio_data >> 48).astype(np.int16)

    if sample_dtype == np.uint8:
        # Unsigned 8-bit audio is offset-binary: 128 is the zero level.
        return ((audio_data.astype(np.int16) - 128) * 256).astype(np.int16)

    return audio_data.astype(np.int16)
def convert_stereo_to_mono(audio_data):
    """Collapse a two-dimensional sample array to a single channel.

    Arrays that are not two-dimensional are returned untouched. For a 2-D
    array the shorter axis is treated as the channel axis: when there are
    more rows than columns the array is transposed first. Multiple channels
    are averaged and cast back to the input dtype; a single channel is
    returned as that one row.

    Args:
        audio_data: NumPy array of samples.

    Returns:
        The input itself (non-2-D) or a 1-D array of mono samples.
    """
    if audio_data.ndim != 2:
        return audio_data

    row_count, column_count = audio_data.shape
    # Put the channel axis first so both layouts share the code below.
    channels_first = audio_data.T if row_count > column_count else audio_data

    if channels_first.shape[0] <= 1:
        return channels_first[0]

    averaged = np.mean(channels_first, axis=0)
    return averaged.astype(channels_first.dtype)
def register_temporary_file(file_path):
    """Record *file_path* in the shared temporary-file registry.

    The path is stored together with the current ``time.time()`` value while
    ``temporary_files_lock`` is held, and the background cleanup check is
    triggered afterwards.

    Args:
        file_path: Path of the temporary file to track.
    """
    with temporary_files_lock:
        registered_at = time.time()
        temporary_files_registry[file_path] = registered_at

    # NOTE(review): the extracted source lost its indentation; this call is
    # assumed to run after the lock is released -- confirm against the repo.
    trigger_background_cleanup_check()
def convert_wav_file_to_pcm_format(input_path):
    """Rewrite a WAV file as mono 16-bit PCM in a new temporary file.

    The input is read with ``scipy.io.wavfile.read``, multi-dimensional data
    is passed through ``convert_stereo_to_mono``, samples are converted with
    ``convert_audio_data_to_pcm_int16`` and the result is written to a
    temporary ``*_pcm_converted.wav`` file that is then registered via
    ``register_temporary_file``.

    Args:
        input_path: Path of the WAV file to read.

    Returns:
        ``(output_path, None)`` on success, or ``(None, error_message)`` when
        any step raises.
    """
    try:
        sample_rate, audio_data = scipy.io.wavfile.read(input_path)
        if audio_data.ndim > 1:
            audio_data = convert_stereo_to_mono(audio_data)
        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)

        # Only the unique name is needed; close the handle right away so the
        # descriptor is not leaked and the path can be reopened for writing.
        with tempfile.NamedTemporaryFile(suffix="_pcm_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
        except Exception:
            # The file is not registered yet, so nothing else would remove it.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        register_temporary_file(output_path)
        return output_path, None
    except Exception as conversion_error:
        return None, f"Failed to convert WAV to PCM format: {str(conversion_error)}"
def convert_audio_using_pydub(input_path, target_sample_rate=None):
    """Convert an audio file to a mono, 16-bit WAV file using pydub.

    Args:
        input_path: Path of the audio file to load with
            ``AudioSegment.from_file``.
        target_sample_rate: Optional frame rate to resample to; when ``None``
            the frame rate is left as loaded.

    Returns:
        ``(output_path, None)`` on success. On failure ``(None, code)`` where
        ``code`` is ``"pydub_library_not_available"`` if pydub cannot be
        imported, ``"ffmpeg_not_available"`` if the error text mentions
        ffmpeg/ffprobe, or a descriptive failure message otherwise.
    """
    try:
        from pydub import AudioSegment

        audio_segment = AudioSegment.from_file(input_path)
        audio_segment = audio_segment.set_channels(1)
        audio_segment = audio_segment.set_sample_width(2)
        if target_sample_rate is not None:
            audio_segment = audio_segment.set_frame_rate(target_sample_rate)

        # Only the unique name is needed; close the handle right away so the
        # descriptor is not leaked and the path can be reopened for writing.
        with tempfile.NamedTemporaryFile(suffix="_pydub_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            exported_handle = audio_segment.export(output_path, format="wav")
        except Exception:
            # The file is not registered yet, so nothing else would remove it.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        # export() hands back the file object it wrote to; release it here
        # instead of waiting for garbage collection.
        if hasattr(exported_handle, "close"):
            exported_handle.close()

        register_temporary_file(output_path)
        return output_path, None
    except ImportError:
        return None, "pydub_library_not_available"
    except Exception as conversion_error:
        error_message = str(conversion_error)
        lowered_message = error_message.lower()
        if "ffmpeg" in lowered_message or "ffprobe" in lowered_message:
            return None, "ffmpeg_not_available"
        return None, f"Failed to convert audio using pydub: {error_message}"
def convert_audio_using_soundfile(input_path):
    """Convert an audio file to a mono, 16-bit PCM WAV file using soundfile.

    The file is read as ``float32``; multi-dimensional data is averaged along
    axis 1, converted with ``convert_audio_data_to_pcm_int16`` and written to
    a temporary ``*_soundfile_converted.wav`` file that is then registered via
    ``register_temporary_file``.

    Args:
        input_path: Path of the audio file to read.

    Returns:
        ``(output_path, None)`` on success,
        ``(None, "soundfile_library_not_available")`` if soundfile cannot be
        imported, or ``(None, error_message)`` for any other failure.
    """
    try:
        import soundfile

        audio_data, sample_rate = soundfile.read(input_path, dtype='float32')
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)

        # Only the unique name is needed; close the handle right away so the
        # descriptor is not leaked and the path can be reopened for writing.
        with tempfile.NamedTemporaryFile(suffix="_soundfile_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
        except Exception:
            # The file is not registered yet, so nothing else would remove it.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        register_temporary_file(output_path)
        return output_path, None
    except ImportError:
        return None, "soundfile_library_not_available"
    except Exception as conversion_error:
        return None, f"Failed to convert audio using soundfile: {str(conversion_error)}"
def convert_audio_using_librosa(input_path):
    """Convert an audio file to a mono, 16-bit PCM WAV file using librosa.

    The file is loaded with ``librosa.load(..., sr=None, mono=True)``,
    converted with ``convert_audio_data_to_pcm_int16`` and written to a
    temporary ``*_librosa_converted.wav`` file that is then registered via
    ``register_temporary_file``.

    Args:
        input_path: Path of the audio file to load.

    Returns:
        ``(output_path, None)`` on success,
        ``(None, "librosa_library_not_available")`` if librosa cannot be
        imported, or ``(None, error_message)`` for any other failure.
    """
    try:
        import librosa

        audio_data, sample_rate = librosa.load(input_path, sr=None, mono=True)
        audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)

        # Only the unique name is needed; close the handle right away so the
        # descriptor is not leaked and the path can be reopened for writing.
        with tempfile.NamedTemporaryFile(suffix="_librosa_converted.wav", delete=False) as output_file:
            output_path = output_file.name

        try:
            scipy.io.wavfile.write(output_path, sample_rate, audio_data_pcm)
        except Exception:
            # The file is not registered yet, so nothing else would remove it.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        register_temporary_file(output_path)
        return output_path, None
    except ImportError:
        return None, "librosa_library_not_available"
    except Exception as conversion_error:
        return None, f"Failed to convert audio using librosa: {str(conversion_error)}"
def convert_non_wav_audio_to_wav(input_path):
    """Try each conversion backend in turn until one yields a WAV file.

    Backends are attempted in the order pydub, soundfile, librosa; the first
    one that returns a path wins.

    Args:
        input_path: Path of the audio file to convert.

    Returns:
        ``(converted_path, None, backend_name)`` on success. On failure
        ``(None, message, None)``, where the message distinguishes "no
        backend is available", "available backends failed" (with their
        individual errors joined by ``" | "``) and a generic fallback.
    """
    # (label, converter, error codes that mean "backend not usable here")
    backends = (
        ("pydub", convert_audio_using_pydub, ("pydub_library_not_available", "ffmpeg_not_available")),
        ("soundfile", convert_audio_using_soundfile, ("soundfile_library_not_available",)),
        ("librosa", convert_audio_using_librosa, ("librosa_library_not_available",)),
    )

    unavailable_backends = 0
    all_errors = []
    for backend_name, converter, unavailable_codes in backends:
        converted_path, backend_error = converter(input_path)
        if converted_path is not None:
            return converted_path, None, backend_name
        if backend_error in unavailable_codes:
            unavailable_backends += 1
        elif backend_error:
            all_errors.append(f"{backend_name}: {backend_error}")

    if unavailable_backends == len(backends):
        return None, "No audio conversion library is available on the server. Please upload a WAV file directly.", None

    if not all_errors:
        return None, "Audio conversion failed. Please try uploading a different audio file or use WAV format.", None

    combined_error = " | ".join(all_errors)
    return None, f"Audio conversion failed with all available methods. {combined_error}", None
def prepare_audio_file_for_voice_cloning(input_path):
    """Validate an uploaded audio file and normalize it to PCM WAV.

    Args:
        input_path: Path of the audio file to prepare.

    Returns:
        A 4-tuple ``(path, error, was_converted, detected_format)``:
        * validation failure -> ``(None, validation_error, False, detected_format)``
        * WAV input          -> ``(pcm_path, None, False, 'wav')`` or
          ``(None, conversion_error, False, 'wav')``
        * other formats      -> ``(path, None, True, detected_format)`` or
          ``(None, conversion_error, True, detected_format)``. If the final
          PCM pass fails, the intermediate converted file is returned instead.
    """
    from .validator import perform_comprehensive_audio_validation, get_format_display_name

    validation_result = perform_comprehensive_audio_validation(input_path)
    is_valid, is_wav_format, detected_format, validation_error = validation_result
    if not is_valid:
        return None, validation_error, False, detected_format

    if is_wav_format:
        pcm_path, pcm_failure = convert_wav_file_to_pcm_format(input_path)
        if pcm_path is None:
            return None, pcm_failure, False, 'wav'
        return pcm_path, None, False, 'wav'

    # The display name is looked up exactly as before, although its value is
    # not used afterwards.
    _format_display_name = get_format_display_name(detected_format)

    intermediate_path, conversion_error, _conversion_method = convert_non_wav_audio_to_wav(input_path)
    if intermediate_path is None:
        return None, conversion_error, True, detected_format

    # Second pass through the WAV normalizer; fall back to the intermediate
    # file when that pass does not produce a path.
    normalized_path, _pcm_error = convert_wav_file_to_pcm_format(intermediate_path)
    chosen_path = intermediate_path if normalized_path is None else normalized_path
    return chosen_path, None, True, detected_format
def convert_audio_to_pcm_wav(input_path):
    """Return a PCM WAV path for *input_path*, falling back to the input.

    Delegates to ``prepare_audio_file_for_voice_cloning``. When that yields
    no converted path, a warning is printed (if an error message was given)
    and the original ``input_path`` is returned unchanged.

    Args:
        input_path: Path of the audio file to convert.

    Returns:
        The converted file path, or ``input_path`` when conversion failed.
    """
    converted_path, error, _was_converted, _detected_format = prepare_audio_file_for_voice_cloning(input_path)

    if converted_path is None:
        if error:
            print(f"Warning: Audio conversion failed - {error}")
        return input_path

    return converted_path