"""
This module defines the 'SubtitleToSpeech' class, which converts subtitle
files to speech audio files. It supports multiple Text-to-Speech (TTS)
services including Harpo, Balabolka, Edge, and ElevenLabs.

* Usage:
    To use this module, create an instance of the 'SubtitleToSpeech' class
    and call the 'generate_audio' method.

* Example usage:
    from subtitle_to_speech import SubtitleToSpeech

    # Create an instance of SubtitleToSpeech
    converter = SubtitleToSpeech(filename="example.srt")

    # Generate audio
    converter.generate_audio(settings)

* Example usage:
    if __name__ == '__main__':
        converter = SubtitleToSpeech(filename="example.srt")
        converter.generate_audio(settings)

* Example usage:
    if __name__ == '__main__':
        if 'TTS - *Głos* - ElevenLabs' in settings.tts:
            audio_generator = SubtitleToSpeech(filename="")
            audio_generator.srt_to_eac3_elevenlabs()  # For Alt Subs
"""

from asyncio import create_task, gather, run, sleep as async_sleep
from dataclasses import dataclass
from msvcrt import getch
from os import listdir, path, remove
from subprocess import call, Popen
from threading import Thread
from time import sleep
from typing import Dict, List, Optional
import wave

import pyttsx3
import pysrt
from edge_tts import Communicate
from pydub import AudioSegment
from pydub.utils import mediainfo

from constants import (WORKING_SPACE,
                       WORKING_SPACE_OUTPUT,
                       WORKING_SPACE_TEMP,
                       WORKING_SPACE_TEMP_MAIN_SUBS,
                       WORKING_SPACE_TEMP_ALT_SUBS,
                       BALABOLKA_PATH,
                       FFMPEG_PATH,
                       console)
from data.settings import Settings


@dataclass(slots=True)
class SubtitleToSpeech:
    """
    This class provides methods to convert subtitle files to speech audio files.

    Attributes:
        - filename (str): The name of the subtitle file to convert.
        - working_space (str): The path to the working directory.
        - working_space_output (str): The path to the output directory.
        - working_space_temp (str): The path to the temporary directory.
        - working_space_temp_main_subs (str): The path to the main subtitles directory.
        - working_space_temp_alt_subs (str): The path to the alternative subtitles directory.
        - balabolka_path (str): The path to the Balabolka executable.
        - ffmpeg_path (str): The path to the FFmpeg executable.

    Methods:
        - ansi_srt(self) -> None:
            Converts the encoding of the subtitle file to ANSI.

        - srt_to_wav_harpo(self, tts_speed: str, tts_volume: str) -> None:
            Converts the subtitle file to a WAV audio file using Harpo TTS.

        - srt_to_wav_balabolka(self, tts_speed: str, tts_volume: str) -> None:
            Converts the subtitle file to a WAV audio file using Balabolka TTS.

        - srt_to_wav_edge_online(self, tts: str, tts_speed: str, tts_volume: str) -> None:
            Converts the subtitle file to a WAV audio file using Edge TTS.

        - merge_tts_audio(self) -> None:
            Merges the generated TTS audio files.

        - generate_audio(self, settings: Settings) -> None:
            Generates the audio file from the subtitle file using the specified TTS settings.

        - srt_to_eac3_elevenlabs(self) -> None:
            Opens the main_subs folder for the user to add audio files generated by ElevenLabs.
    """

    filename: str
    working_space: str = WORKING_SPACE
    working_space_output: str = WORKING_SPACE_OUTPUT
    working_space_temp: str = WORKING_SPACE_TEMP
    working_space_temp_main_subs: str = WORKING_SPACE_TEMP_MAIN_SUBS
    working_space_temp_alt_subs: str = WORKING_SPACE_TEMP_ALT_SUBS
    balabolka_path: str = BALABOLKA_PATH
    ffmpeg_path: str = FFMPEG_PATH

    @staticmethod
    def _print_subtitle(index: int, subtitle: pysrt.SubRipItem) -> None:
        """
        Prints one subtitle cue to stdout in SRT-like form (index, time range, text).

        Args:
            - index (int): The number to print as the cue index.
            - subtitle (pysrt.SubRipItem): The cue whose timing and text are printed.
        """
        # '%f' yields microseconds; dropping the last three digits leaves milliseconds.
        start: str = subtitle.start.to_time().strftime('%H:%M:%S.%f')[:-3]
        end: str = subtitle.end.to_time().strftime('%H:%M:%S.%f')[:-3]
        print(f"{index}\n{start} --> {end}\n{subtitle.text}\n")

    def ansi_srt(self) -> None:
        """
        Converts the encoding of the subtitle file to ANSI, rewriting it in place.

        The file is first read as UTF-8; if that fails with UnicodeDecodeError
        it is read again as ANSI. Characters that cannot be encoded as ANSI
        are dropped on write (errors="ignore").
        """
        file_path: str = path.join(
            self.working_space_temp_main_subs, self.filename)

        try:
            with open(file_path, "r", encoding="utf-8") as source_file:
                content: str = source_file.read()
        except UnicodeDecodeError:
            # Not valid UTF-8 - fall back to reading the file as ANSI.
            with open(file_path, "r", encoding="ANSI") as source_file:
                content = source_file.read()

        with open(file_path, "w", encoding="ANSI", errors="ignore") as target_file:
            target_file.write(content)

        console.print("Zamieniono kodowanie na ANSI:",
                      style='green_bold', end=' ')
        console.print(self.filename)

    def srt_to_wav_harpo(self, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using Harpo TTS.

        The WAV file is written next to the subtitle file, with the same base
        name and a '.wav' extension.

        Args:
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()
        engine = self._init_engine(tts_speed, tts_volume)

        subtitle_path: str = path.join(
            self.working_space_temp_main_subs, self.filename)
        subtitles: pysrt.SubRipFile = pysrt.open(
            subtitle_path, encoding='ANSI')
        output_file: str = path.splitext(subtitle_path)[0] + '.wav'

        self._generate_wav_file(engine, subtitles, output_file)

        # temp.wav is only created once a cue has been synthesized, so it is
        # absent for an empty subtitle file; do not fail the run in that case.
        temp_wav: str = path.join(self.working_space_temp, "temp.wav")
        if path.exists(temp_wav):
            remove(temp_wav)

    def _init_engine(self, tts_speed: str, tts_volume: str) -> pyttsx3.Engine:
        """
        Initializes the TTS engine with the specified speed and volume.

        The 'Vocalizer Expressive Zosia Harpo 22kHz' voice is selected when it
        is installed; otherwise the engine keeps its current voice.

        Args:
            - tts_speed (str): The speed of the TTS voice (converted with int()).
            - tts_volume (str): The volume of the TTS voice (converted with float()).

        Returns:
            - pyttsx3.Engine: The initialized TTS engine.
        """
        engine: pyttsx3.Engine = pyttsx3.init()

        voices: List[pyttsx3.Voice] = engine.getProperty('voices')
        for voice in voices:
            if voice.name == 'Vocalizer Expressive Zosia Harpo 22kHz':
                engine.setProperty('voice', voice.id)

        engine.setProperty('rate', int(tts_speed))
        engine.setProperty('volume', float(tts_volume))
        return engine

    def _generate_wav_file(self, engine: pyttsx3.Engine, subtitles: pysrt.SubRipFile, output_file: str) -> None:
        """
        Generates a WAV audio file from the given subtitles using the specified TTS engine.

        Each cue is synthesized to a temporary WAV file, the output is padded
        with silence up to the cue's start time, and the synthesized frames
        are then appended.

        Args:
            - engine (pyttsx3.Engine): The TTS engine to use for speech synthesis.
            - subtitles (pysrt.SubRipFile): The subtitles to convert to speech.
            - output_file (str): The path to the output WAV file.
        """
        with wave.open(output_file, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            # NOTE(review): the usual "22 kHz" rate is 22050 Hz; confirm that
            # 22500 matches what the engine writes to temp.wav.
            wav_file.setframerate(22500)  # 22kHz

            for i, subtitle in enumerate(subtitles, start=1):
                self._print_subtitle(i, subtitle)
                # pysrt's 'ordinal' is the timestamp in milliseconds.
                start_time: float = subtitle.start.ordinal / 1000.0
                self._save_subtitle_to_wav(engine, subtitle.text)
                self._add_empty_frame_if_needed(wav_file, start_time)
                self._add_subtitle_to_wav(wav_file)

    def _save_subtitle_to_wav(self, engine: pyttsx3.Engine, text: str) -> None:
        """
        Saves a single subtitle to a temporary WAV file (temp.wav in the temp directory).

        Args:
            - engine (pyttsx3.Engine): The TTS engine to use for speech synthesis.
            - text (str): The text of the subtitle to convert to speech.
        """
        engine.save_to_file(text, path.join(
            self.working_space_temp, "temp.wav"))
        engine.runAndWait()

    def _add_empty_frame_if_needed(self, wav_file: wave.Wave_write, start_time: float) -> None:
        """
        Pads the WAV file with silence if the start time of the next subtitle
        is later than the current time in the audio.

        Nothing is written when the audio already reaches (or passes) the
        start time.

        Args:
            - wav_file (wave.Wave_write): The WAV file to add the silence to.
            - start_time (float): The start time of the next subtitle, in seconds.
        """
        framerate: int = wav_file.getframerate()
        current_time: float = wav_file.getnframes() / float(framerate)

        if start_time > current_time:
            silent_frames: int = int((start_time - current_time) * framerate)
            # One frame holds sample-width bytes for every channel.
            bytes_per_frame: int = wav_file.getsampwidth() * wav_file.getnchannels()
            wav_file.writeframes(b'\x00' * silent_frames * bytes_per_frame)

    def _add_subtitle_to_wav(self, wav_file: wave.Wave_write) -> None:
        """
        Appends the frames of the temporary WAV file (temp.wav) to the WAV file.

        Args:
            - wav_file (wave.Wave_write): The WAV file to add the subtitle audio to.
        """
        with wave.open(path.join(self.working_space_temp, "temp.wav"), 'rb') as temp_file:
            data: bytes = temp_file.readframes(temp_file.getnframes())
            wav_file.writeframes(data)

    def srt_to_wav_balabolka(self, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using Balabolka TTS.

        The Balabolka command runs in a background thread while the cues are
        printed to stdout; the method returns once the command has finished.

        Args:
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()

        balcon_path: str = self.balabolka_path
        file_path: str = path.join(
            self.working_space_temp_main_subs, self.filename)
        output_wav_path: str = path.join(
            self.working_space_temp_main_subs,
            path.splitext(self.filename)[0] + ".wav")

        command: List[str] = self._prepare_balabolka_command(
            balcon_path, file_path, output_wav_path, tts_speed, tts_volume)

        command_thread: Thread = Thread(target=call, args=(command,))
        command_thread.start()

        subtitles: pysrt.SubRipFile = pysrt.open(file_path, encoding='ANSI')
        for subtitle in subtitles:
            self.process_subtitle(subtitle)

        command_thread.join()

    def _prepare_balabolka_command(self, balcon_path: str, file_path: str, output_wav_path: str, tts_speed: str, tts_volume: str) -> List[str]:
        """
        Prepares the command to run Balabolka TTS.

        Args:
            - balcon_path (str): The path to the Balabolka executable.
            - file_path (str): The path to the subtitle file.
            - output_wav_path (str): The path to the output WAV file.
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.

        Returns:
            - List[str]: The prepared command.
        """
        return [
            balcon_path,
            "-fr", "48",
            "-f", file_path,
            "-w", output_wav_path,
            "-n", "IVONA 2 Agnieszka",
            "-s", tts_speed,
            "-v", tts_volume
        ]

    def process_subtitle(self, subtitle: pysrt.SubRipItem) -> None:
        """
        Processes a single subtitle: prints it to stdout, then pauses briefly.

        Args:
            - subtitle (pysrt.SubRipItem): The subtitle to process.
        """
        self._print_subtitle(subtitle.index, subtitle)
        sleep(0.02)

    async def generate_speech(self, subtitle: pysrt.SubRipItem, voice: str, output_file: str, rate: str, volume: str) -> None:
        """
        Generates speech from a single subtitle using the specified TTS voice.

        Args:
            - subtitle (pysrt.SubRipItem): The subtitle to convert to speech.
            - voice (str): The TTS voice to use.
            - output_file (str): The path to the output audio file.
            - rate (str): The speed of the TTS voice.
            - volume (str): The volume of the TTS voice.
        """
        communicate = Communicate(
            subtitle.text, voice, rate=rate, volume=volume)
        await communicate.save(output_file)

    async def generate_wav_files(self, subtitles: pysrt.SubRipFile, voice: str, rate: str, volume: str) -> List[str]:
        """
        Generates one audio file per subtitle using the specified TTS voice.

        Despite the method name, the files are written with an '.mp3'
        extension, named '<subtitle path without extension>_<n>.mp3'.
        Requests are awaited in batches of 50, with a 2-second pause after
        each full batch.

        Args:
            - subtitles (pysrt.SubRipFile): The subtitles to convert to speech.
            - voice (str): The TTS voice to use.
            - rate (str): The speed of the TTS voice.
            - volume (str): The volume of the TTS voice.

        Returns:
            - List[str]: The paths to the generated MP3 files, in subtitle order.
        """
        tasks = []
        mp3_files: List[str] = []
        file_name: str = path.splitext(subtitles.path)[0]

        for i, subtitle in enumerate(subtitles, start=1):
            output_file: str = f"{file_name}_{i}.mp3"
            mp3_files.append(output_file)
            tasks.append(create_task(self.generate_speech(
                subtitle, voice, output_file, rate, volume)))

            if i % 50 == 0:
                await gather(*tasks)
                tasks = []
                # Pause without blocking the event loop (time.sleep would).
                await async_sleep(2)

        await gather(*tasks)
        return mp3_files

    def merge_audio_files(self, mp3_files: List[str], subtitles: pysrt.SubRipFile, dir_path: str) -> None:
        """
        Merges the given MP3 audio files into a single WAV file.

        The WAV file is written next to the subtitle file with a '.wav'
        extension. Each MP3 file is deleted once it has been decoded; files
        that do not exist are skipped.

        Args:
            - mp3_files (List[str]): The paths to the MP3 files to merge, one per subtitle.
            - subtitles (pysrt.SubRipFile): The subtitles corresponding to the audio files.
            - dir_path (str): The directory where the audio files are located.
        """
        file_name: str = path.splitext(subtitles.path)[0]

        with wave.open(f"{file_name}.wav", 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(24000)

            for i, (mp3_file, subtitle) in enumerate(zip(mp3_files, subtitles), start=1):
                self._print_subtitle(i, subtitle)

                mp3_file_path: str = path.join(dir_path, mp3_file)
                if not path.isfile(mp3_file_path):
                    continue

                start_time: float = subtitle.start.ordinal / 1000.0
                sound: AudioSegment = AudioSegment.from_file(
                    mp3_file_path, format="mp3")
                remove(mp3_file_path)

                self._add_empty_frame_if_needed(wav_file, start_time)
                wav_file.writeframes(sound.raw_data)

    def srt_to_wav_edge_online(self, tts: str, tts_speed: str, tts_volume: str) -> None:
        """
        Converts the subtitle file to a WAV audio file using Edge TTS.

        Args:
            - tts (str): The TTS service to use; "TTS - Zofia - Edge" selects
              the Zofia voice, any other value selects the Marek voice.
            - tts_speed (str): The speed of the TTS voice.
            - tts_volume (str): The volume of the TTS voice.
        """
        self.ansi_srt()

        voice: str = ("pl-PL-ZofiaNeural" if tts == "TTS - Zofia - Edge"
                      else "pl-PL-MarekNeural")

        subtitles: pysrt.SubRipFile = pysrt.open(
            path.join(self.working_space_temp_main_subs, self.filename),
            encoding='ANSI')

        mp3_files: List[str] = run(self.generate_wav_files(
            subtitles, voice, tts_speed, tts_volume))

        self.merge_audio_files(mp3_files, subtitles,
                               self.working_space_temp_main_subs)

    def merge_tts_audio(self) -> None:
        """
        Merges the generated TTS audio files.

        For every non-subtitle file in the main_subs directory: if the temp
        directory holds a file with the same base name, the two are mixed
        into '<name>.eac3' in the output directory (the longer one goes
        first) and both sources are deleted. Otherwise the file is converted
        to EAC3 on its own. Remaining files with the same base name are then
        removed from main_subs.
        """
        main_subs_files_dict: Dict[str, str] = self._get_files_dict(
            self.working_space_temp_main_subs)
        tmp_files_dict: Dict[str, str] = self._get_files_dict(
            self.working_space_temp)

        for file_name, main_subs_file in main_subs_files_dict.items():
            main_subs_file_path: str = path.join(
                self.working_space_temp_main_subs, main_subs_file)
            output_file: str = path.join(
                self.working_space_output, file_name + ".eac3")

            if file_name in tmp_files_dict:
                tmp_file_path: str = path.join(
                    self.working_space_temp, tmp_files_dict[file_name])

                main_subs_file_duration: float = self._get_file_duration(
                    main_subs_file_path)
                tmp_file_duration: float = self._get_file_duration(
                    tmp_file_path)

                # The longer input goes first: the mix uses 'duration=first'.
                if main_subs_file_duration > tmp_file_duration:
                    input_file_1, input_file_2 = main_subs_file_path, tmp_file_path
                else:
                    input_file_1, input_file_2 = tmp_file_path, main_subs_file_path

                self._merge_files(input_file_1, input_file_2, output_file)
                remove(main_subs_file_path)
                remove(tmp_file_path)
            else:
                self._convert_to_eac3(main_subs_file_path, output_file)
                remove(main_subs_file_path)

            # NOTE(review): assumed to run for both branches (cleans up e.g.
            # the .srt with the same base name) - confirm the intended nesting.
            self._remove_same_name_files(
                self.working_space_temp_main_subs, file_name)

    def _get_files_dict(self, directory: str) -> Dict[str, str]:
        """
        Gets a dictionary of the files in the given directory, excluding
        subtitle files ('.srt' and '.ass', case-insensitive).

        Args:
            - directory (str): The directory to get the files from.

        Returns:
            - Dict[str, str]: Maps each file's base name (without extension)
              to its full file name.
        """
        excluded_extensions: List[str] = ["srt", "ass"]
        return {
            path.splitext(f)[0]: f
            for f in listdir(directory)
            if path.splitext(f)[1][1:].lower() not in excluded_extensions
        }

    def _get_file_duration(self, file_path: str) -> float:
        """
        Gets the duration of the file at the given path.

        Args:
            - file_path (str): The path to the file.

        Returns:
            - float: The 'duration' value reported by mediainfo, as a float.
        """
        return float(mediainfo(file_path)['duration'])

    def _merge_files(self, input_file_1: str, input_file_2: str, output_file: str):
        """
        Merges two audio files into a single file using FFmpeg.

        Whichever input's path contains 'main_subs' is boosted by 7 dB before
        mixing; the output lasts as long as the first input.

        Args:
            - input_file_1 (str): The path to the first input file.
            - input_file_2 (str): The path to the second input file.
            - output_file (str): The path to the output file.
        """
        # Only the filter graph differs, depending on which input gets the boost.
        if 'main_subs' in input_file_1:
            audio_filter: str = "[0:a]volume=7dB[a1];[a1][1:a]amix=inputs=2:duration=first"
        else:
            audio_filter = "[1:a]volume=7dB[a1];[0:a][a1]amix=inputs=2:duration=first"

        command: List[str] = [
            self.ffmpeg_path,
            "-i", input_file_1,
            "-i", input_file_2,
            "-filter_complex", audio_filter,
            output_file
        ]
        call(command)

    def _convert_to_eac3(self, input_file: str, output_file: str):
        """
        Converts an audio file to EAC3 format using FFmpeg.

        Args:
            - input_file (str): The path to the input file.
            - output_file (str): The path to the output file.
        """
        command: List[str] = [
            self.ffmpeg_path,
            "-i", input_file,
            "-c:a", "eac3",
            output_file
        ]
        call(command)

    def _remove_same_name_files(self, directory: str, file_name: str):
        """
        Removes every file in the directory whose base name (without
        extension) equals the given file name.

        Args:
            - directory (str): The directory to remove the files from.
            - file_name (str): The base name of the files to remove.
        """
        for file in listdir(directory):
            if path.splitext(file)[0] == file_name:
                remove(path.join(directory, file))

    def generate_audio(self, settings: Settings):
        """
        Generates the audio file from the subtitle file using the specified TTS settings.

        The TTS backend is chosen from settings.tts; an unrecognized value
        skips synthesis. The generated audio is then merged in all cases.

        Args:
            - settings (Settings): The TTS settings to use.
        """
        tts: Optional[str] = settings.tts
        tts_speed: Optional[str] = settings.tts_speed
        tts_volume: Optional[str] = settings.tts_volume

        console.print("Rozpoczynam generowanie pliku audio...",
                      style='green_bold', end=' ')
        console.print(self.filename, style='white_bold')

        if tts == "TTS - Zosia - Harpo":
            self.srt_to_wav_harpo(tts_speed, tts_volume)
        elif tts == "TTS - Agnieszka - Ivona":
            self.srt_to_wav_balabolka(tts_speed, tts_volume)
        elif tts in ["TTS - Zofia - Edge", "TTS - Marek - Edge"]:
            self.srt_to_wav_edge_online(tts, tts_speed, tts_volume)

        console.print(
            "Generowanie pliku audio zakończone.", style='green_bold')
        self.merge_tts_audio()

    def srt_to_eac3_elevenlabs(self) -> None:
        """
        Opens the main_subs folder for the user to add audio files generated
        by ElevenLabs, waits for a key press, then merges the audio.
        """
        Popen(['explorer', path.realpath(self.working_space_temp_main_subs)])

        console.print("\nWygeneruj pliki audio z plików .srt za pomocą 11Labs_TTS_Colab,\na następnie dodaj je do folderu main_subs.",
                      style='yellow_bold')
        console.print(
            "11Labs_TTS_Colab: https://github.com/MattyMroz/11Labs_TTS_Colab", style='yellow_bold')
        console.print(
            "\n[green_italic]Naciśnij dowolny klawisz, aby kontynuować...", end=' ')
        getch()  # Block until the user presses a key.
        console.print()

        self.merge_tts_audio()