Spaces:
Running
Running
| from .kokoro import normalize_text,phonemize,generate | |
| import re | |
| import librosa | |
| import os | |
| import uuid | |
| from pydub.silence import split_on_silence | |
| from pydub import AudioSegment | |
| import wave | |
| import numpy as np | |
| import torch | |
def create_audio_dir():
    """Create the 'kokoro_audio' directory in the root folder if it doesn't exist.

    The root folder is the parent of this file's directory.

    Returns:
        str: absolute path of the (possibly freshly created) audio directory.
    """
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    audio_dir = os.path.join(root_dir, "kokoro_audio")
    # Record prior existence only for the log message; the creation itself
    # uses exist_ok=True so there is no check-then-create race (TOCTOU).
    already_there = os.path.isdir(audio_dir)
    os.makedirs(audio_dir, exist_ok=True)
    if already_there:
        print(f"Directory already exists: {audio_dir}")
    else:
        print(f"Created directory: {audio_dir}")
    return audio_dir
# Module-wide output directory for generated wav files; created at import time.
temp_folder = create_audio_dir()
# When True, tts() prints the loaded voice pack, speed, trim and pad values.
debug = False
def resplit_strings(arr):
    """Split a list of words into two space-joined halves of near-equal length.

    Returns a (left, right) pair of strings such that the difference between
    the two joined lengths is minimal. Empty input yields ('', ''); a single
    word goes entirely to the left side.
    """
    if not arr:
        return '', ''
    if len(arr) == 1:
        return arr[0], ''
    word_lens = [len(word) for word in arr]
    # Total length of the fully joined string, spaces included.
    total = sum(word_lens) + len(arr) - 1
    best_index = 1
    best_diff = float('inf')
    left_len = 0
    for idx, wlen in enumerate(word_lens[:-1], start=1):
        # Grow the left side by the next word (plus a space after the first).
        left_len += wlen + (0 if idx == 1 else 1)
        # Right side is whatever remains, minus the separating space.
        right_len = total - left_len - 1
        gap = abs(left_len - right_len)
        if gap < best_diff:
            best_diff = gap
            best_index = idx
    return ' '.join(arr[:best_index]), ' '.join(arr[best_index:])
def recursive_split(text, voice):
    """Recursively split text until each piece phonemizes to < 511 tokens.

    Returns a list of (text, tokens, token_count) tuples. Splitting prefers
    sentence-ending punctuation, then weaker punctuation, then plain spaces;
    an unsplittable over-long piece is dropped.
    """
    if not text:
        return []
    tokens = phonemize(text, voice, norm=False)
    # Small enough to synthesize in one shot.
    if len(tokens) < 511:
        return [(text, tokens, len(tokens))] if tokens else []
    # No space left to split on: give up on this piece.
    if ' ' not in text:
        return []
    pieces = None
    # Strongest separators first; lookbehinds allow trailing quote marks.
    for punct in ['!.?…', ':;', ',—']:
        candidate = re.split(f'(?:(?<=[{punct}])|(?<=[{punct}]["\'»])|(?<=[{punct}]["\'»]["\'»])) ', text)
        if len(candidate) > 1:
            pieces = candidate
            break
    if pieces is None:
        pieces = text.split(' ')
    left, right = resplit_strings(pieces)
    return recursive_split(left, voice) + recursive_split(right, voice)
def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
    """Normalize text, chunk it, and phonemize each chunk via recursive_split.

    Square-bracketed spans are optionally dropped; runs of `newline_split`
    or more newlines delimit chunks (0 disables that pre-split). Returns a
    list of (index, text, tokens, token_count) tuples.
    """
    if skip_square_brackets:
        text = re.sub(r'\[.*?\]', '', text)
    normalized = normalize_text(text)
    if newline_split > 0:
        chunks = [piece.strip() for piece in re.split('\n{' + str(newline_split) + ',}', normalized)]
    else:
        chunks = [normalized]
    rows = []
    for chunk in chunks:
        rows.extend(recursive_split(chunk, voice))
    return [(idx, *row) for idx, row in enumerate(rows)]
def large_text(text, VOICE_NAME):
    """Segment long text for synthesis.

    Short text (<= 500 chars) is returned as a single segment whose third
    field is its character length; longer text is split with
    segment_and_tokenize, returning (index, text, token_count) per segment.
    """
    if len(text) <= 500:
        return [(0, text, len(text))]
    rows = segment_and_tokenize(text, VOICE_NAME[0])
    # Drop the raw token string, keeping (index, text, token_count).
    return [(row[0], row[1], row[3]) for row in rows]
def clamp_speed(speed):
    """Sanitize a playback-speed value.

    Non-numeric input falls back to 1; values above 2 are capped at 2.
    Values below 0.5 are deliberately passed through unchanged — the lower
    clamp existed in an earlier revision but was disabled.

    Args:
        speed: requested speed multiplier (any type).

    Returns:
        A numeric speed, at most 2.
    """
    if not isinstance(speed, (int, float)):
        return 1
    if speed > 2:
        return 2
    return speed
def clamp_trim(trim):
    """Sanitize a trim fraction to the range [0, 1].

    Non-numeric input falls back to 0.5; non-positive values become 0;
    values above 1 fall back to the 0.5 default (not to 1 — out-of-range
    input is treated as a mistake rather than clamped to the boundary).

    Args:
        trim: requested trim fraction (any type).

    Returns:
        A numeric trim fraction in [0, 1].
    """
    if not isinstance(trim, (int, float)):
        return 0.5
    if trim <= 0:
        return 0
    if trim > 1:
        return 0.5
    return trim
def trim_if_needed(out, trim):
    """Shrink leading/trailing silence of `out` proportionally to `trim`.

    A falsy `trim` returns the audio untouched. Otherwise librosa locates
    the non-silent span and the edges are moved `trim` of the way toward it.
    """
    if not trim:
        return out
    start, end = librosa.effects.trim(out, top_db=30)[1]
    new_start = int(start * trim)
    new_end = int(len(out) - (len(out) - end) * trim)
    return out[new_start:new_end]
| #Above code copied from https://huggingface.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py | |
def get_random_file_name(output_file=""):
    """Return a writable wav path.

    With no argument, a fresh random name under temp_folder is produced.
    A given path is returned as-is when it does not exist; an existing file
    is deleted first, and if deletion fails a random fallback name is used.
    """
    global temp_folder
    if output_file == "":
        return f"{temp_folder}/{str(uuid.uuid4())[:8]}.wav"
    if not os.path.exists(output_file):
        # Nothing in the way; the caller's path is safe to use.
        return output_file
    try:
        os.remove(output_file)
        return output_file
    except Exception:
        # Could not clear the old file (locked, permissions, ...); pick a
        # fresh random name instead.
        return f"{temp_folder}/{str(uuid.uuid4())[:8]}.wav"
def remove_silence_function(file_path, minimum_silence=50):
    """Strip long silences from a wav file and write a new one.

    The input is split on silence (>= 100 ms below -45 dBFS), keeping
    `minimum_silence` ms at each cut, and the chunks are re-joined into
    `<name>_no_silence.wav`. Returns the output path.
    """
    audio_format = "wav"
    output_path = file_path.replace(".wav", "_no_silence.wav")
    sound = AudioSegment.from_file(file_path, format=audio_format)
    chunks = split_on_silence(
        sound,
        min_silence_len=100,
        silence_thresh=-45,
        keep_silence=minimum_silence,
    )
    # Concatenate all non-silent chunks back together.
    combined = sum(chunks, AudioSegment.empty())
    combined.export(output_path, format=audio_format)
    return output_path
| # import simpleaudio as sa | |
| # def play_audio(filename): | |
| # wave_obj = sa.WaveObject.from_wave_file(filename) | |
| # play_obj = wave_obj.play() | |
| # play_obj.wait_done() | |
| import re | |
def clean_text(text):
    """Normalize raw text for TTS.

    Replaces dashes/colons/markdown markers, strips emoji code points, and
    collapses all whitespace runs into single spaces.
    """
    # Character/marker substitutions, applied in order ("**" before "*").
    replacements = {
        "–": " ",
        "-": " ",
        ":": ",",
        "**": " ",
        "*": " ",
        "#": " ",
    }
    for target, substitute in replacements.items():
        text = text.replace(target, substitute)
    # Strip emoji across the common pictographic Unicode ranges.
    emoji_pattern = re.compile(
        r'[\U0001F600-\U0001F64F]|'
        r'[\U0001F300-\U0001F5FF]|'
        r'[\U0001F680-\U0001F6FF]|'
        r'[\U0001F700-\U0001F77F]|'
        r'[\U0001F780-\U0001F7FF]|'
        r'[\U0001F800-\U0001F8FF]|'
        r'[\U0001F900-\U0001F9FF]|'
        r'[\U0001FA00-\U0001FA6F]|'
        r'[\U0001FA70-\U0001FAFF]|'
        r'[\U00002702-\U000027B0]|'
        r'[\U0001F1E0-\U0001F1FF]'
        r'', flags=re.UNICODE)
    without_emoji = emoji_pattern.sub(r'', text)
    # Collapse whitespace and trim the ends.
    return re.sub(r'\s+', ' ', without_emoji).strip()
| #copied from F5TTS 😁 | |
| import re | |
def parse_speechtypes_text(gen_text):
    """Parse '{voice} text' markup into voice-tagged text segments.

    Text before the first {marker} uses the default voice "af". Returns a
    list of {"voice_name": ..., "text": ...} dicts with cleaned text;
    empty text runs are skipped.
    """
    pattern = r"\{(.*?)\}"
    # re.split with a capture group alternates: text, style, text, style, ...
    pieces = re.split(pattern, gen_text)
    current_style = "af"
    segments = []
    for position, piece in enumerate(pieces):
        stripped = piece.strip()
        if position % 2:
            # Odd positions are the captured {style} names.
            current_style = stripped
        elif stripped:
            segments.append({"voice_name": current_style, "text": clean_text(stripped)})
    return segments
def podcast(MODEL, device, gen_text, speed=1.0, trim=0.5, pad_between_segments=0, remove_silence=True, minimum_silence=50):
    """Render multi-voice '{voice} text' markup into one 16-bit mono WAV.

    gen_text is split by parse_speechtypes_text() into (voice, text)
    segments; each is synthesized with its own voice pack and the results
    are concatenated with `pad_between_segments` seconds of silence between
    them.

    Args:
        MODEL: loaded Kokoro model, passed straight to generate().
        device: torch device the voice packs are moved to.
        gen_text: script containing {voice_name} style markers.
        speed: playback speed, sanitized by clamp_speed().
        trim: per-segment edge-silence trim fraction, sanitized by clamp_trim().
        pad_between_segments: inter-segment silence in seconds.
            NOTE(review): this is passed through clamp_trim(), so values > 1
            silently become 0.5 — confirm that cap is intended.
        remove_silence: if True, post-process via remove_silence_function().
        minimum_silence: keep_silence (ms) forwarded to silence removal.

    Returns:
        Path of the written WAV file.
    """
    segments = parse_speechtypes_text(gen_text)
    speed = clamp_speed(speed)
    trim = clamp_trim(trim)
    silence_duration = clamp_trim(pad_between_segments)
    # output_file = get_random_file_name(output_file)
    sample_rate = 24000  # Sample rate of the audio
    # Create a silent audio segment in float32
    silence = np.zeros(int(sample_rate * silence_duration), dtype=np.float32)
    # Name the output after the first segment's text when there is one.
    if len(segments) >= 1:
        first_line_text = segments[0]["text"]
        output_file = tts_file_name(first_line_text)
    else:
        output_file = get_random_file_name("")
    output_file = output_file.replace('\n', '').replace('\r', '')
    # Open a WAV file for writing
    with wave.open(output_file, 'wb') as wav_file:
        wav_file.setnchannels(1)  # Mono
        wav_file.setsampwidth(2)  # 16-bit audio
        wav_file.setframerate(sample_rate)
        for idx, segment in enumerate(segments):  # Added index `idx` to track position
            voice_name = segment["voice_name"]
            text = segment["text"]
            # NOTE(review): the voice pack is re-loaded from disk for every
            # segment, even when the voice repeats (tts() caches; this does not).
            voice_pack_path = f"./KOKORO/voices/{voice_name}.pt"
            VOICEPACK = torch.load(voice_pack_path, weights_only=True).to(device)
            # Generate audio for the segment; lang is the voice name's first letter.
            audio, out_ps = generate(MODEL, text, VOICEPACK, lang=voice_name[0], speed=speed)
            audio = trim_if_needed(audio, trim)
            # Scale audio from float32 to int16
            audio = (audio * 32767).astype(np.int16)
            # Write the audio segment to the WAV file
            wav_file.writeframes(audio.tobytes())
            # Add silence between segments, except after the last segment
            if idx != len(segments) - 1:
                wav_file.writeframes((silence * 32767).astype(np.int16).tobytes())
    # Optionally remove silence from the output file
    if remove_silence:
        output_file = remove_silence_function(output_file, minimum_silence=minimum_silence)
    return output_file
# Cache of the most recently loaded voice pack so consecutive tts() calls
# with the same voice skip the torch.load round-trip.
old_voice_pack_path = ""
old_VOICEPACK = None
def tts(MODEL, device, text, voice_name, speed=1.0, trim=0.5, pad_between_segments=0.5, output_file="", remove_silence=True, minimum_silence=50):
    """Synthesize `text` with a single voice into a 16-bit mono WAV.

    Long text is chunked by large_text(); chunks are synthesized in order
    and written to one file with `pad_between_segments` seconds of silence
    between them. The loaded voice pack is cached in module globals so a
    repeated voice skips the torch.load.

    Args:
        MODEL: loaded Kokoro model, passed straight to generate().
        device: torch device the voice pack is moved to.
        text: input text (cleaned via clean_text()).
        voice_name: voice id whose first letter selects the language, or a
            direct path to a .pt voice-pack file.
        speed: playback speed, sanitized by clamp_speed().
        trim: per-chunk edge-silence trim fraction, sanitized by clamp_trim().
        pad_between_segments: inter-chunk silence in seconds.
            NOTE(review): sanitized by clamp_trim(), so values > 1 become 0.5.
        output_file: target path; empty string means pick a random name.
        remove_silence: if True, post-process via remove_silence_function().
        minimum_silence: keep_silence (ms) forwarded to silence removal.

    Returns:
        Path of the written WAV file.
    """
    global old_voice_pack_path, old_VOICEPACK
    language = voice_name[0]
    voice_pack_path = f"./KOKORO/voices/{voice_name}.pt"
    # A raw .pt path may be passed instead of a voice name; assume language
    # code "a" in that case.
    if voice_name.endswith(".pt"):
        language = "a"
        voice_pack_path = voice_name
    text = clean_text(text)
    segments = large_text(text, language)
    # Reload when the voice changed; the merged-weights pack is always
    # reloaded because its contents can change between calls.
    if (old_voice_pack_path != voice_pack_path) or ("weighted_normalised_voices.pt" in voice_pack_path):
        VOICEPACK = torch.load(voice_pack_path, weights_only=True).to(device)
        old_voice_pack_path = voice_pack_path
        old_VOICEPACK = VOICEPACK
        # print("Loaded new voice pack")
    else:
        VOICEPACK = old_VOICEPACK
        # print("Using old voice pack")
    speed = clamp_speed(speed)
    trim = clamp_trim(trim)
    silence_duration = clamp_trim(pad_between_segments)
    output_file = get_random_file_name(output_file)
    if debug:
        print(f'Loaded voice: {voice_pack_path}')
        print(f"Speed: {speed}")
        print(f"Trim: {trim}")
        print(f"Silence duration: {silence_duration}")
    sample_rate = 24000  # Sample rate of the audio
    # Create a silent audio segment in float32
    silence = np.zeros(int(sample_rate * silence_duration), dtype=np.float32)
    # Open a WAV file for writing
    with wave.open(output_file, 'wb') as wav_file:
        wav_file.setnchannels(1)  # Mono
        wav_file.setsampwidth(2)  # 16-bit audio
        wav_file.setframerate(sample_rate)
        for i in segments:
            # Each segment row is (index, text, token/char count).
            id = i[0]  # NOTE(review): shadows the builtin `id`
            text = i[1]
            if debug:
                print(i)
            audio, out_ps = generate(MODEL, text, VOICEPACK, lang=language, speed=speed)
            audio = trim_if_needed(audio, trim)
            # Scale audio from float32 to int16
            audio = (audio * 32767).astype(np.int16)
            # Write the audio segment to the WAV file
            wav_file.writeframes(audio.tobytes())
            # Add silence between segments, except after the last segment
            if id != len(segments) - 1:
                wav_file.writeframes((silence * 32767).astype(np.int16).tobytes())
    if remove_silence:
        output_file = remove_silence_function(output_file, minimum_silence=minimum_silence)
    return output_file
def tts_file_name(text):
    """Build a unique wav path under temp_folder from the text's first words.

    The text is reduced to lowercase letters with underscores for spaces,
    capped at 25 characters ("empty" for blank input), and suffixed with a
    random hex tag for uniqueness.
    """
    global temp_folder
    # Keep letters and whitespace only, lowercase, underscore-join the words.
    slug = re.sub(r'[^a-zA-Z\s]', '', text)
    slug = slug.lower().strip().replace(" ", "_")
    # Cap at 25 characters; blank input falls back to "empty".
    slug = slug[:25] or "empty"
    unique_suffix = uuid.uuid4().hex[:8].upper()
    return f"{temp_folder}/{slug}_{unique_suffix}.wav"