Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
 from KOKORO.models import build_model
 from KOKORO.utils import tts,tts_file_name,podcast
 import sys
@@ -6,11 +7,25 @@ import os
 os.system("python download_model.py")
 import torch
 import gc
 print("Loading model...")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f'Using device: {device}')
-
-MODEL = build_model('./KOKORO/fp16/kokoro-v0_19-half.pth', device)
 print("Model loaded successfully.")

 def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):
@@ -22,7 +37,7 @@ def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_p
 model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
-current_model = model_list[

 def update_model(model_name):
     """
@@ -43,8 +58,21 @@ def update_model(model_name):
     return f"Model updated to {model_name}"

-
     """
     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
     """
@@ -54,6 +82,12 @@ def text_to_speech(text, model_name="kokoro-v0_19-half.pth", voice_name="af", sp
     minimum_silence = 0.05
     keep_silence = int(minimum_silence * 1000)
     save_at = tts_file_name(text)
     audio_path = tts_maker(
         text,
         voice_name,
@@ -96,7 +130,6 @@ def toggle_autoplay(autoplay):

 with gr.Blocks() as demo1:
     gr.Markdown("# Batched TTS")
-    gr.Markdown("Run on Your Local System [Kokoro-82M-WebUI](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(
@@ -115,16 +148,17 @@ with gr.Blocks() as demo1:
             with gr.Row():
                 generate_btn = gr.Button('Generate', variant='primary')
             with gr.Accordion('Audio Settings', open=False):
-                model_name=gr.Dropdown(model_list,label="Model",value=model_list[
                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                 minimum_silence = gr.Number(
                     label="Keep Silence Upto (In seconds)",
                     value=0.05
                 )
-
-                    minimum=0.25, maximum=2, value=1, step=0.1,
-                    label='⚡️Speed', info='Adjust the speaking speed'
-                )
                 # trim = gr.Slider(
                 #     minimum=0, maximum=1, value=0, step=0.1,
                 #     label='🔪 Trim', info='How much to cut from both ends of each segment'
@@ -134,6 +168,8 @@ with gr.Blocks() as demo1:
                     label='🔇 Pad Between', info='Silent Duration between segments [For Large Text]'
                 )

         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):
@@ -142,16 +178,16 @@ with gr.Blocks() as demo1:
     text.submit(
         text_to_speech,
-        inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence],
         outputs=[audio]
     )
     generate_btn.click(
         text_to_speech,
-        inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence],
         outputs=[audio]
     )

-def podcast_maker(text,remove_silence=False,minimum_silence=50,model_name="kokoro-v0_19
     global MODEL,device
     update_model(model_name)
     if not minimum_silence:
@@ -258,13 +294,13 @@ def your_tts(text,audio_path,actual_duration,speed=1.0):
     global srt_voice_name
     model_name="kokoro-v0_19.pth"
     tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed,trim=1.0)
-
     tts_audio = AudioSegment.from_file(tts_path)
     tts_duration = len(tts_audio)
     if tts_duration > actual_duration:
         speedup_factor = tts_duration / actual_duration
         tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor,trim=1.0)
-
     shutil.copy(tts_path,audio_path)

@@ -321,6 +357,77 @@ def clean_srt(input_path):

 class SRTDubbing:
@@ -343,14 +450,15 @@ class SRTDubbing:
         if tts_duration > actual_duration:
             speedup_factor = tts_duration / actual_duration
             speedup_filename = "./cache/speedup_temp.wav"
             # Use ffmpeg to change audio speed
-            subprocess.run([
-
-
-
-
-
-            ], check=True)

             # Replace the original TTS audio with the sped-up version
             shutil.move(speedup_filename, audio_path)
@@ -456,10 +564,27 @@ class SRTDubbing:
         with open("entries.json", "w") as file:
             json.dump(entries, file, indent=4)
         return entries
-srt_voice_name="
-
-
-
     srt_dubbing = SRTDubbing()
     dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
     srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
@@ -476,7 +601,7 @@ with gr.Blocks() as demo3:

     gr.Markdown(
         """
-    # Generate Audio File From Subtitle [

    To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)

@@ -495,7 +620,12 @@ with gr.Blocks() as demo3:
             )
             with gr.Row():
                 generate_btn_ = gr.Button('Generate', variant='primary')
-
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):
@@ -509,24 +639,292 @@ with gr.Blocks() as demo3:
     # )
     generate_btn_.click(
         srt_process,
-        inputs=[srt_file,voice],
         outputs=[audio]
     )

 display_text = " \n".join(voice_list)

-with gr.Blocks() as
-    gr.Markdown("Run on Your Local System [Kokoro-82M-WebUI](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
     gr.Markdown(f"# Voice Names \n{display_text}")

-
-
-
-
-def main(debug
-    demo = gr.TabbedInterface([demo1, demo2,demo3,demo4], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Available Voice Names"],title="Kokoro TTS"
     demo.queue().launch(debug=debug, share=share)
     #Run on local network
@@ -559,4 +957,4 @@ if __name__ == "__main__":

     # save_at=f"./temp_audio/{os.path.basename(result)}"
     # shutil.move(result, save_at)
-    # print(f"Saved at {save_at}")
+
 from KOKORO.models import build_model
 from KOKORO.utils import tts,tts_file_name,podcast
 import sys
 os.system("python download_model.py")
 import torch
 import gc
+import platform
+import shutil
+base_path=os.getcwd()
+def clean_folder_before_start():
+    global base_path
+    folder_list=["dummy","TTS_DUB","kokoro_audio"]
+    for folder in folder_list:
+        if os.path.exists(f"{base_path}/{folder}"):
+            try:
+                shutil.rmtree(f"{base_path}/{folder}")
+            except:
+                pass
+        os.makedirs(f"{base_path}/{folder}", exist_ok=True)
+clean_folder_before_start()
+
 print("Loading model...")
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f'Using device: {device}')
+MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
 print("Model loaded successfully.")

 def tts_maker(text,voice_name="af_bella",speed = 0.8,trim=0,pad_between=0,save_path="temp.wav",remove_silence=False,minimum_silence=50):

 model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
+current_model = model_list[0]

 def update_model(model_name):
     """

     return f"Model updated to {model_name}"

+def manage_files(file_path):
+    if os.path.exists(file_path):
+        file_extension = os.path.splitext(file_path)[1]  # Get file extension
+        file_size = os.path.getsize(file_path)  # Get file size in bytes
+        # Check if file is a valid .pt file and its size is ≤ 5 MB
+        if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
+            return True  # File is valid and kept
+        else:
+            os.remove(file_path)  # Delete invalid or oversized file
+            return False
+    return False  # File does not exist

+
+def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20,custom_voicepack=None,trim=0.0):
     """
     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
     """
     minimum_silence = 0.05
     keep_silence = int(minimum_silence * 1000)
     save_at = tts_file_name(text)
+    # print(voice_name,custom_voicepack)
+    if custom_voicepack:
+        if manage_files(custom_voicepack):
+            voice_name = custom_voicepack
+        else:
+            gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
     audio_path = tts_maker(
         text,
         voice_name,
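The new `custom_voicepack` argument is only honoured when `manage_files` accepts the uploaded file (a `.pt` under 5 MB); otherwise the selected voice name is kept and a warning is shown. A minimal sketch of exercising the updated function directly, not part of the commit — the sample text and the `./my_voice.pt` path are hypothetical:

# Illustrative only: call text_to_speech with and without a custom voicepack.
sample = "Kokoro reads this sentence aloud."
default_wav = text_to_speech(sample, model_name="kokoro-v0_19.pth", voice_name="af_bella", speed=1.0)
custom_wav = text_to_speech(sample, custom_voicepack="./my_voice.pt")  # must be a .pt file under 5 MB
print(default_wav, custom_wav)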

 with gr.Blocks() as demo1:
     gr.Markdown("# Batched TTS")
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(
             with gr.Row():
                 generate_btn = gr.Button('Generate', variant='primary')
             with gr.Accordion('Audio Settings', open=False):
+                model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
+                speed = gr.Slider(
+                    minimum=0.25, maximum=2, value=1, step=0.1,
+                    label='⚡️Speed', info='Adjust the speaking speed'
+                )
                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                 minimum_silence = gr.Number(
                     label="Keep Silence Upto (In seconds)",
                     value=0.05
                 )
+
                 # trim = gr.Slider(
                 #     minimum=0, maximum=1, value=0, step=0.1,
                 #     label='🔪 Trim', info='How much to cut from both ends of each segment'
                     label='🔇 Pad Between', info='Silent Duration between segments [For Large Text]'
                 )

+                custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')
+
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):

     text.submit(
         text_to_speech,
+        inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
         outputs=[audio]
     )
     generate_btn.click(
         text_to_speech,
+        inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
         outputs=[audio]
     )

+def podcast_maker(text,remove_silence=False,minimum_silence=50,model_name="kokoro-v0_19.pth"):
     global MODEL,device
     update_model(model_name)
     if not minimum_silence:

     global srt_voice_name
     model_name="kokoro-v0_19.pth"
     tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed,trim=1.0)
+    # print(tts_path)
     tts_audio = AudioSegment.from_file(tts_path)
     tts_duration = len(tts_audio)
     if tts_duration > actual_duration:
         speedup_factor = tts_duration / actual_duration
         tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor,trim=1.0)
+        # print(tts_path)
     shutil.copy(tts_path,audio_path)

+import librosa
+import soundfile as sf
+import subprocess
+
+def speedup_audio_librosa(input_file, output_file, speedup_factor):
+    try:
+        # Load the audio file
+        y, sr = librosa.load(input_file, sr=None)
+
+        # Use time stretching to speed up audio without changing pitch
+        y_stretched = librosa.effects.time_stretch(y, rate=speedup_factor)
+
+        # Save the output with the original sample rate
+        sf.write(output_file, y_stretched, sr)
+        # print(f"Speed up by {speedup_factor} completed successfully: {output_file}")
+
+    except Exception as e:
+        gr.Warning(f"Error during speedup with Librosa: {e}")
+        shutil.copy(input_file, output_file)
+
+
+def is_ffmpeg_installed():
+    if platform.system() == "Windows":
+        local_ffmpeg_path = os.path.join("./ffmpeg", "ffmpeg.exe")
+    else:
+        local_ffmpeg_path = "ffmpeg"
+    try:
+        subprocess.run([local_ffmpeg_path, "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
+        # print("FFmpeg is installed")
+        return True,local_ffmpeg_path
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        # print("FFmpeg is not installed. Using 'librosa' for speedup audio in SRT dubbing")
+        gr.Warning("FFmpeg is not installed. Using 'librosa' for speedup audio in SRT dubbing",duration= 20)
+        return False,local_ffmpeg_path
+
+
+# ffmpeg -i test.wav -filter:a "atempo=2.0" ffmpeg.wav -y
+def change_speed(input_file, output_file, speedup_factor):
+    global use_ffmpeg,local_ffmpeg_path
+    if use_ffmpeg:
+        # print("Using FFmpeg for speedup")
+        try:
+            # subprocess.run([
+            #     local_ffmpeg_path,
+            #     "-i", input_file,
+            #     "-filter:a", f"atempo={speedup_factor}",
+            #     output_file,
+            #     "-y"
+            # ], check=True)
+            subprocess.run([
+                local_ffmpeg_path,
+                "-i", input_file,
+                "-filter:a", f"atempo={speedup_factor}",
+                output_file,
+                "-y"
+            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        except Exception as e:
+            gr.Error(f"Error during speedup with FFmpeg: {e}")
+            speedup_audio_librosa(input_file, output_file, speedup_factor)
+    else:
+        # print("Using Librosa for speedup")
+        speedup_audio_librosa(input_file, output_file, speedup_factor)
+

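`change_speed` prefers FFmpeg's `atempo` filter and falls back to librosa time stretching when FFmpeg is missing or errors out; both approaches change tempo without changing pitch. A small sketch of using the new helpers standalone, not part of the commit — the file names are placeholders:

# Illustrative only: speed a clip up 1.5x with whichever backend is available.
use_ffmpeg, local_ffmpeg_path = is_ffmpeg_installed()
change_speed("input.wav", "fast.wav", 1.5)  # FFmpeg atempo=1.5, or librosa time_stretch(rate=1.5)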
 class SRTDubbing:
         if tts_duration > actual_duration:
             speedup_factor = tts_duration / actual_duration
             speedup_filename = "./cache/speedup_temp.wav"
+            change_speed(tts_filename, speedup_filename, speedup_factor)
             # Use ffmpeg to change audio speed
+            # subprocess.run([
+            #     "ffmpeg",
+            #     "-i", tts_filename,
+            #     "-filter:a", f"atempo={speedup_factor}",
+            #     speedup_filename,
+            #     "-y"
+            # ], check=True)

             # Replace the original TTS audio with the sped-up version
             shutil.move(speedup_filename, audio_path)

         with open("entries.json", "w") as file:
             json.dump(entries, file, indent=4)
         return entries
+srt_voice_name="af"
+use_ffmpeg,local_ffmpeg_path = is_ffmpeg_installed()
+# use_ffmpeg=False
+
+def srt_process(srt_file_path,voice_name,custom_voicepack=None,dest_language="en"):
+    global srt_voice_name,use_ffmpeg
+
+    if not srt_file_path.endswith(".srt"):
+        gr.Error("Please upload a valid .srt file",duration=5)
+        return None
+    if use_ffmpeg:
+        gr.Success("Using FFmpeg for audio speedup to sync with subtitle")
+    else:
+        gr.Warning("Install FFmpeg to ensure high-quality audio when speeding up the audio to sync with subtitle. Default Using 'librosa' for speedup",duration= 20)
+
+    if custom_voicepack:
+        if manage_files(custom_voicepack):
+            srt_voice_name = custom_voicepack
+        else:
+            srt_voice_name=voice_name
+            gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
     srt_dubbing = SRTDubbing()
     dub_save_path=get_subtitle_Dub_path(srt_file_path,dest_language)
     srt_dubbing.srt_to_dub(srt_file_path,dub_save_path,dest_language)
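`srt_process` checks the `.srt` extension, optionally swaps in a validated custom voicepack, and then hands the file to `SRTDubbing`. A hypothetical call from Python, not part of the commit — in the UI these arguments come from the `srt_file`, `voice` and `custom_voicepack` components, and the subtitle path below is made up:

# Illustrative only: dub a subtitle file with the default voice.
dub_path = srt_process("movie_subtitles.srt", "af")
print(dub_path)  # path handed to the Gradio audio output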

     gr.Markdown(
         """
+    # Generate Audio File From Subtitle [Upload Only .srt file]

     To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)

             )
             with gr.Row():
                 generate_btn_ = gr.Button('Generate', variant='primary')
+
+            with gr.Accordion('Audio Settings', open=False):
+                custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')
+
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Enable Autoplay', open=False):

     # )
     generate_btn_.click(
         srt_process,
+        inputs=[srt_file,voice,custom_voicepack],
         outputs=[audio]
     )

+
+#### Voice mixing
+# modified from here
+# https://huggingface.co/spaces/ysharma/Make_Custom_Voices_With_KokoroTTS
+def get_voices():
+    voices = {}
+    for i in os.listdir("./KOKORO/voices"):
+        if i.endswith(".pt"):
+            voice_name = i.replace(".pt", "")
+            voices[voice_name] = torch.load(f"./KOKORO/voices/{i}", weights_only=True).to(device)
+
+    slider_configs = {}
+
+    # Iterate through the predefined list of voices
+    for i in voices:
+        # Handle the default case for "af"
+        if i == "af":
+            slider_configs["af"]= "Default 👩🇺🇸"
+            continue
+        if i == "af_nicole":
+            slider_configs["af_nicole"]="Nicole 😏🇺🇸"
+            continue
+        if i == "af_bella":
+            slider_configs["af_bella"]="Bella 🤗🇺🇸"
+            continue
+
+        # Determine the country emoji
+        country = "🇺🇸" if i.startswith("a") else "🇬🇧"
+
+        # Determine the gender emoji and name
+        if "f_" in i:
+            display_name = f"{i.split('_')[-1].capitalize()} 👩{country}"
+        elif "m_" in i:
+            display_name = f"{i.split('_')[-1].capitalize()} 👨{country}"
+        else:
+            display_name = f"{i.capitalize()} 😐"
+
+        # Append the voice tuple to the list
+        slider_configs[i]= display_name
+
+    return voices, slider_configs
+
+voices, slider_configs = get_voices()
+
+
+def parse_voice_formula(formula):
+    global voices
+    """Parse the voice formula string and return the combined voice tensor."""
+    if not formula.strip():
+        raise ValueError("Empty voice formula")
+
+    # Initialize the weighted sum
+    weighted_sum = None
+
+    # Split the formula into terms
+    terms = formula.split('+')
+    weights=0
+    for term in terms:
+        # Parse each term (format: "voice_name * 0.333")
+        parts = term.strip().split('*')
+        if len(parts) != 2:
+            raise ValueError(f"Invalid term format: {term.strip()}. Should be 'voice_name * weight'")
+
+        voice_name = parts[0].strip()
+        weight = float(parts[1].strip())
+        weights+=weight
+        # print(voice_name)
+        # print(weight)
+        # Get the voice tensor
+        if voice_name not in voices:
+            raise ValueError(f"Unknown voice: {voice_name}")
+
+        voice_tensor = voices[voice_name]
+
+        # Add to weighted sum
+        if weighted_sum is None:
+            weighted_sum = weight * voice_tensor
+        else:
+            weighted_sum += weight * voice_tensor
+    return weighted_sum/weights
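`parse_voice_formula` expects a `+`-separated list of `voice_name * weight` terms and divides the weighted sum by the total weight, so unnormalised weights still produce a normalised blend. A hypothetical blend of two packs that ship with the Space, not part of the commit:

# Illustrative only: average two voicepacks and save the result as a new .pt file.
formula = "af_bella * 0.7 + af_nicole * 0.3"
mixed_voice = parse_voice_formula(formula)       # weight-normalised voice tensor
torch.save(mixed_voice, "./my_mixed_voice.pt")   # same idea as get_new_voice() below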
+
+
+def get_new_voice(formula):
+    # print(formula)
+    try:
+        # Parse the formula and get the combined voice tensor
+        weighted_voices = parse_voice_formula(formula)
+        voice_pack_name = "./weighted_normalised_voices.pt"
+        # Save and load the combined voice
+        torch.save(weighted_voices, voice_pack_name)
+        # print(f"Voice pack saved at: {voice_pack_name}")
+        return voice_pack_name
+    except Exception as e:
+        raise gr.Error(f"Failed to create voice: {str(e)}")
+
+
+def generate_voice_formula(*values):
+    """
+    Generate a formatted string showing the normalized voice combination.
+    Returns: String like "0.6 * voice1" or "0.4 * voice1 + 0.6 * voice2"
+    """
+    n = len(values) // 2
+    checkbox_values = values[:n]
+    slider_values = list(values[n:])
+    global slider_configs
+    # Get active sliders and their names
+    active_pairs = [(slider_values[i], slider_configs[i][0])
+                    for i in range(len(slider_configs))
+                    if checkbox_values[i]]
+
+    if not active_pairs:
+        return ""
+
+    # If only one voice is selected, use its actual value
+    if len(active_pairs) == 1:
+        value, name = active_pairs[0]
+        return f"{value:.3f} * {name}"
+
+    # Calculate sum for normalization of multiple voices
+    total_sum = sum(value for value, _ in active_pairs)
+
+    if total_sum == 0:
+        return ""
+
+    # Generate normalized formula for multiple voices
+    terms = []
+    for value, name in active_pairs:
+        normalized_value = value / total_sum
+        terms.append(f"{normalized_value:.3f} * {name}")
+
+    return " + ".join(terms)

+
+
+def create_voice_mix_ui():
+    with gr.Blocks() as demo:
+        gr.Markdown(
+            """
+            # Kokoro Voice Mixer
+            Select voices and adjust their weights to create a mixed voice.
+            """
+        )
+
+        voice_components = {}
+        voice_names = list(voices.keys())
+        female_voices = [name for name in voice_names if "f_" in name]
+        male_voices = [name for name in voice_names if "b_" in name]
+        neutral_voices = [name for name in voice_names if "f_" not in name and "b_" not in name]
+
+        # Define how many columns you want
+        num_columns = 3
+
+        # Function to generate UI
+        def generate_ui_row(voice_list):
+            num_voices = len(voice_list)
+            num_rows = (num_voices + num_columns - 1) // num_columns
+            for i in range(num_rows):
+                with gr.Row():
+                    for j in range(num_columns):
+                        index = i * num_columns + j
+                        if index < num_voices:
+                            voice_name = voice_list[index]
+                            with gr.Column():
+                                checkbox = gr.Checkbox(label=slider_configs[voice_name])
+                                weight_slider = gr.Slider(
+                                    minimum=0,
+                                    maximum=1,
+                                    value=1.0,
+                                    step=0.01,
+                                    interactive=False
+                                )
+                                voice_components[voice_name] = (checkbox, weight_slider)
+                                checkbox.change(
+                                    lambda x, slider=weight_slider: gr.update(interactive=x),
+                                    inputs=[checkbox],
+                                    outputs=[weight_slider]
+                                )
+
+        generate_ui_row(female_voices)
+        generate_ui_row(male_voices)
+        generate_ui_row(neutral_voices)
+
+        formula_inputs = []
+        for i in voice_components:
+            checkbox, slider = voice_components[i]
+            formula_inputs.append(checkbox)
+            formula_inputs.append(slider)
+
+        with gr.Row():
+            voice_formula = gr.Textbox(label="Voice Formula", interactive=False)
+
+        # Function to dynamically update the voice formula
+        def update_voice_formula(*args):
+            formula_parts = []
+            for i, (checkbox, slider) in enumerate(voice_components.values()):
+                if args[i * 2]:  # If checkbox is selected
+                    formula_parts.append(f"{list(voice_components.keys())[i]} * {args[i * 2 + 1]:.3f}")
+            return " + ".join(formula_parts)
+
+
+        # Update formula whenever any checkbox or slider changes
+        for checkbox, slider in voice_components.values():
+            checkbox.change(
+                update_voice_formula,
+                inputs=formula_inputs,
+                outputs=[voice_formula]
+            )
+            slider.change(
+                update_voice_formula,
+                inputs=formula_inputs,
+                outputs=[voice_formula]
+            )
+
+        with gr.Row():
+            voice_text = gr.Textbox(
+                label='Enter Text',
+                lines=3,
+                placeholder="Type your text here to preview the custom voice..."
+            )
+            voice_generator = gr.Button('Generate', variant='primary')
+            with gr.Accordion('Audio Settings', open=False):
+                model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
+                speed = gr.Slider(
+                    minimum=0.25, maximum=2, value=1, step=0.1,
+                    label='⚡️Speed', info='Adjust the speaking speed'
+                )
+                remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+        with gr.Row():
+            voice_audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+        with gr.Row():
+            mix_voice_download = gr.File(label="Download VoicePack")
+        with gr.Accordion('Enable Autoplay', open=False):
+            autoplay = gr.Checkbox(value=True, label='Autoplay')
+            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[voice_audio])
+        def generate_custom_audio(text_input, formula_text, model_name, speed, remove_silence):
+            try:
+                new_voice_pack = get_new_voice(formula_text)
+                audio_output_path =text_to_speech(text=text_input, model_name=model_name, voice_name="af", speed=speed, pad_between_segments=0, remove_silence=remove_silence, minimum_silence=0.05,custom_voicepack=new_voice_pack,trim=0.0)
+                # audio_output_path = text_to_speech(text=text_input, model_name=model_name,voice_name="af", speed=1.0, custom_voicepack=new_voice_pack)
+                return audio_output_path,new_voice_pack
+            except Exception as e:
+                raise gr.Error(f"Failed to generate audio: {e}")
+
+
+        voice_generator.click(
+            generate_custom_audio,
+            inputs=[voice_text, voice_formula,model_name,speed,remove_silence],
+            outputs=[voice_audio,mix_voice_download]
+        )
+    return demo
+
+demo4 = create_voice_mix_ui()
+
+
 display_text = " \n".join(voice_list)

+with gr.Blocks() as demo5:
     gr.Markdown(f"# Voice Names \n{display_text}")
+

+import click
+@click.command()
+@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+def main(debug, share):
+    demo = gr.TabbedInterface([demo1, demo2,demo3,demo4,demo5], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Voice Mix","Available Voice Names"],title="Kokoro TTS")

     demo.queue().launch(debug=debug, share=share)
     #Run on local network
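With the click wrapper in place, the two flags map straight onto Gradio's launch call via the `if __name__ == "__main__":` guard shown in the final hunk. A sketch of how the app would typically be started from a shell, not part of the commit:

# Illustrative only: both flags are optional.
#   python app.py                   -> run the interface locally
#   python app.py --debug --share   -> verbose Gradio logs plus a public share link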

     # save_at=f"./temp_audio/{os.path.basename(result)}"
     # shutil.move(result, save_at)
+    # print(f"Saved at {save_at}")