OpenVoice

Sleeping

App Files Files Community

Chuatury committed on Jun 5, 2025

Commit

416031b

unverified ·

1 Parent(s): ddc2731

remove tts

Browse files

Files changed (17) hide show

README.md +3 -3
app.py +76 -230
app_locally.py +0 -222
checkpoints/base_speakers/EN/checkpoint.pth +0 -3
checkpoints/base_speakers/EN/config.json +0 -145
checkpoints/base_speakers/EN/en_default_se.pth +0 -3
checkpoints/base_speakers/ZH/checkpoint.pth +0 -3
checkpoints/base_speakers/ZH/config.json +0 -137
checkpoints/base_speakers/ZH/zh_default_se.pth +0 -3
openvoice/api.py +0 -62
openvoice/openvoice_app.py +0 -275
openvoice/text/__init__.py +0 -79
openvoice/text/cleaners.py +0 -16
openvoice/text/english.py +0 -188
openvoice/text/mandarin.py +0 -326
openvoice/text/symbols.py +0 -88
requirements.txt +2 -10

README.md CHANGED Viewed

@@ -5,11 +5,11 @@ colorFrom: blue
 colorTo: red
 sdk: gradio
 sdk_version: 3.48.0
-app_file: app_locally.py
 pinned: false
 license: mit
 models:
-- myshell-ai/OpenVoice
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorTo: red
 sdk: gradio
 sdk_version: 3.48.0
+app_file: app.py
 pinned: false
 license: mit
 models:
+  - myshell-ai/OpenVoice
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,268 +1,114 @@
 import os
 import gradio as gr
-import requests
-import langid
-import base64
-import json
-import time
-API_URL = os.environ.get("API_URL")
-TOKEN = os.environ.get("TOKEN")
-supported_languages = ['zh', 'en']
-output_dir = 'outputs'
-os.makedirs(output_dir, exist_ok=True)
-def audio_to_base64(audio_file):
-    with open(audio_file, "rb") as audio_file:
-        audio_data = audio_file.read()
-        base64_data = base64.b64encode(audio_data).decode("utf-8")
-    return base64_data
-def predict(prompt, style, audio_file_pth, agree):
-    # initialize a empty info
-    text_hint = ''
-    # agree with the terms
-    if agree == False:
-        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
-        gr.Warning("Please accept the Terms & Condition!")
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    # first detect the input language
-    language_predicted = langid.classify(prompt)[0].strip()
-    print(f"Detected language:{language_predicted}")
-    if language_predicted not in supported_languages:
-        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
-        gr.Warning(
-            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
-        )
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    if language_predicted == "en":
-        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
-            text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
-            gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
-            return (
-                text_hint,
-                None,
-                None,
-            )
-        style = 'en_' + style
-        prompt_length = len(prompt.split(' '))
-    else:
-        if style not in ['default']:
-            text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
-            gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
-            return (
-                text_hint,
-                None,
-                None,
-            )
-        style = 'cn_' + style
-        prompt_length = len(prompt)
-    speaker_wav = audio_file_pth
-    if prompt_length < 2:
-        text_hint += f"[ERROR] Please give a longer prompt text \n"
-        gr.Warning("Please give a longer prompt text")
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    if prompt_length > 50:
-        text_hint += f"[ERROR] Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
-        gr.Warning(
-            "Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo for your usage"
         )
         return (
             text_hint,
             None,
             None,
         )
-    save_path = f'{output_dir}/output.wav'
-    speaker_audio_base64 = audio_to_base64(speaker_wav)
-    data = {
-        "text": prompt,
-        "reference_speaker": speaker_audio_base64,
-        "emotion": style
-    }
-    start = time.time()
-    # Send the data as a POST request
-    headers = {
-        "Authorization": f"Bearer {TOKEN}"
-    }
-    response = requests.post(API_URL, json=data, headers=headers, timeout=60)
-    print(f'Get response successfully within {time.time() - start}')
-    task_id = response.json()['task_id']
-    while True:
-        response = requests.post(API_URL.replace('run', 'get_result'), json={'task_id': task_id}, headers=headers)
-        json_data = response.json()
-        status = json_data['status']
-        if status in ["CREATED", "RUNNING"]:
-            time.sleep(1)
-            continue
-        if status == 'FAILED':
-            text_hint += f"[HTTP ERROR] {json_data['error']} \n"
-            gr.Warning(
-                f"[HTTP ERROR] {json_data['error']} \n"
-            )
-            return (
-                text_hint,
-                None,
-                None,
-            )
-        else:
-            decoded_bytes = base64.b64decode(json_data['result']['base64'].encode('utf-8'))
-            with open(save_path, 'wb') as f:
-                f.write(decoded_bytes)
-            text_hint += f'''Get response successfully \n'''
-            return (
-                text_hint,
-                save_path,
-                speaker_wav,
-            )
-title = "MyShell OpenVoice"
-description = """
-We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
-"""
-markdown_table = """
-<div align="center" style="margin-bottom: 10px;">
-|               |               |               |
-| :-----------: | :-----------: | :-----------: |
-| **OpenSource Repo** | **Project Page** | **Join the Community** |
-| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
-</div>
-"""
-markdown_table_v2 = """
-<div align="center" style="margin-bottom: 2px;">
-|               |               |               |              |
-| :-----------: | :-----------: | :-----------: | :-----------: |
-| **Github Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> |  **Project Page** |  [OpenVoice](https://research.myshell.ai/open-voice) |
-| | |
-| :-----------: | :-----------: |
-**Join the Community** |   [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
-</div>
-"""
-content = """
-<div>
-  <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
-  This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
-</div>
-"""
-wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
-examples = [
-    [
-        "今天天气真好，我们一起出去吃饭吧。",
-        'default',
-        "examples/speaker1.mp3",
-        True,
-    ],[
-        "This audio is generated by open voice with a half-performance model.",
-        'whispering',
-        "examples/speaker2.mp3",
-        True,
-    ],
-    [
-        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
-        'sad',
-        "examples/speaker0.mp3",
-        True,
-    ],
-]
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
         with gr.Column():
-            with gr.Row():
-                gr.Markdown(
-                    """
-                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
-                    """
-                )
-            with gr.Row():
-                gr.Markdown(markdown_table_v2)
-            with gr.Row():
-                gr.Markdown(description)
-        with gr.Column():
-            gr.Video('./open_voice.mp4', autoplay=True)
-    with gr.Row():
-        gr.HTML(wrapped_markdown_content)
-    with gr.Row():
-        with gr.Column():
-            input_text_gr = gr.Textbox(
-                label="Text Prompt",
-                info="One or two sentences at a time is better. Up to 200 text characters.",
-                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
-            )
-            style_gr = gr.Dropdown(
-                label="Style",
-                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
-                choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
-                max_choices=1,
-                value="default",
-            )
             ref_gr = gr.Audio(
                 label="Reference Audio",
                 info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
-                value="examples/speaker2.mp3",
             )
-            tos_gr = gr.Checkbox(
-                label="Agree",
-                value=False,
-                info="I agree to the terms of the MIT license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
             )
             tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
         with gr.Column():
             out_text_gr = gr.Text(label="Info")
-            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
             ref_audio_gr = gr.Audio(label="Reference Audio Used")
-            gr.Examples(examples,
-                        label="Examples",
-                        inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
-                        outputs=[out_text_gr, audio_gr, ref_audio_gr],
-                        fn=predict,
-                        cache_examples=False,)
-            tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
-demo.queue(concurrency_count=6)
-demo.launch(debug=True, show_api=True)

 import os
+import torch
+import argparse
 import gradio as gr
+parser = argparse.ArgumentParser()
+# parser.add_argument(
+#     "--online_checkpoint_url",
+#     default="https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip",
+# )
+parser.add_argument(
+    "--share", action="store_true", default=False, help="make link public"
+)
+args = parser.parse_args()
+# first download the checkpoints from server
+# if not os.path.exists("checkpoints/"):
+#     print("Downloading OpenVoice checkpoint ...")
+#     os.system(f"wget {args.online_checkpoint_url} -O ckpt.zip")
+#     print("Extracting OpenVoice checkpoint ...")
+#     ZipFile("ckpt.zip").extractall()
+print("Starting OpenVoice")
+from openvoice import se_extractor
+from openvoice.api import ToneColorConverter
+ckpt_converter = "checkpoints/converter"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+output_dir = "outputs"
+os.makedirs(output_dir, exist_ok=True)
+tone_color_converter = ToneColorConverter(
+    f"{ckpt_converter}/config.json", device=device
+)
+tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")
+def predict(speaker_wav, transform_wav):
+    # initialize a empty info
+    text_hint = ""
+    # extract source_se
+    source_se, _ = se_extractor.get_se(
+        transform_wav,
+        tone_color_converter,
+        vad=True,
+    )
+    # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
+    try:
+        target_se, _ = se_extractor.get_se(
+            speaker_wav,
+            tone_color_converter,
+            vad=True,
         )
+    except Exception as e:
+        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
+        gr.Warning("[ERROR] Get target tone color error {str(e)} \n")
         return (
             text_hint,
             None,
             None,
         )
+    save_path = f"{output_dir}/output.wav"
+    # Run the tone color converter
+    tone_color_converter.convert(
+        audio_src_path=transform_wav,
+        src_se=source_se,
+        tgt_se=target_se,
+        output_path=save_path,
+    )
+    text_hint += f"""Get response successfully \n"""
+    return (
+        text_hint,
+        save_path,
+        speaker_wav,
+    )
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
         with gr.Column():
             ref_gr = gr.Audio(
                 label="Reference Audio",
                 info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
+                value="examples/speaker0.mp3",
             )
+            tra_gr = gr.Audio(
+                label="Transform Audio",
+                info="Click on the ✎ button to upload your own target transform audio",
+                type="filepath",
+                value=None,
             )
             tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
         with gr.Column():
             out_text_gr = gr.Text(label="Info")
+            audio_gr = gr.Audio(label="Synthesized Audio", autoplay=True)
             ref_audio_gr = gr.Audio(label="Reference Audio Used")
+            tts_button.click(
+                predict,
+                [ref_gr, tra_gr],
+                outputs=[out_text_gr, audio_gr, ref_audio_gr],
+            )
+demo.queue()
+demo.launch(debug=True, show_api=True, share=args.share)

app_locally.py DELETED Viewed

@@ -1,222 +0,0 @@
-import os
-import torch
-import argparse
-import gradio as gr
-# from zipfile import ZipFile
-import langid
-parser = argparse.ArgumentParser()
-# parser.add_argument(
-#     "--online_checkpoint_url",
-#     default="https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip",
-# )
-parser.add_argument(
-    "--share", action="store_true", default=False, help="make link public"
-)
-args = parser.parse_args()
-# first download the checkpoints from server
-# if not os.path.exists("checkpoints/"):
-#     print("Downloading OpenVoice checkpoint ...")
-#     os.system(f"wget {args.online_checkpoint_url} -O ckpt.zip")
-#     print("Extracting OpenVoice checkpoint ...")
-#     ZipFile("ckpt.zip").extractall()
-print("Starting OpenVoice")
-# Init EN/ZH baseTTS and ToneConvertor
-from openvoice import se_extractor
-from openvoice.api import BaseSpeakerTTS, ToneColorConverter
-en_ckpt_base = "checkpoints/base_speakers/EN"
-zh_ckpt_base = "checkpoints/base_speakers/ZH"
-ckpt_converter = "checkpoints/converter"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-output_dir = "outputs"
-os.makedirs(output_dir, exist_ok=True)
-en_base_speaker_tts = BaseSpeakerTTS(f"{en_ckpt_base}/config.json", device=device)
-en_base_speaker_tts.load_ckpt(f"{en_ckpt_base}/checkpoint.pth")
-zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_ckpt_base}/config.json", device=device)
-zh_base_speaker_tts.load_ckpt(f"{zh_ckpt_base}/checkpoint.pth")
-tone_color_converter = ToneColorConverter(
-    f"{ckpt_converter}/config.json", device=device
-)
-tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")
-en_source_default_se = torch.load(f"{en_ckpt_base}/en_default_se.pth").to(device)
-zh_source_se = torch.load(f"{zh_ckpt_base}/zh_default_se.pth").to(device)
-supported_languages = ["zh", "en"]
-def predict(prompt, speaker_wav, transform_wav):
-    # initialize a empty info
-    text_hint = ""
-    if transform_wav is not None:
-        # if transform_wav is provided, use it as the source audio
-        src_path = transform_wav
-        text_hint += f"Using transform audio {src_path} as source audio \n"
-        # extract source_se
-        source_se, _ = se_extractor.get_se(
-            transform_wav,
-            tone_color_converter,
-            vad=True,
-        )
-    else:
-        # first detect the input language
-        language_predicted = langid.classify(prompt)[0].strip()
-        print(f"Detected language:{language_predicted}")
-        if language_predicted not in supported_languages:
-            text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
-            gr.Warning(
-                f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
-            )
-            return (
-                text_hint,
-                None,
-                None,
-            )
-        if language_predicted == "zh":
-            tts_model = zh_base_speaker_tts
-            source_se = zh_source_se
-            language = "Chinese"
-        else:
-            tts_model = en_base_speaker_tts
-            source_se = en_source_default_se
-            language = "English"
-        text_hint += f"Using TTS to generate source audio from the prompt text \n"
-        src_path = f"{output_dir}/tmp.wav"
-        tts_model.tts(prompt, src_path, speaker="default", language=language)
-    # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
-    try:
-        target_se, wavs_folder = se_extractor.get_se(
-            speaker_wav,
-            tone_color_converter,
-            vad=True,
-        )
-        # os.system(f'rm -rf {wavs_folder}')
-    except Exception as e:
-        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
-        gr.Warning("[ERROR] Get target tone color error {str(e)} \n")
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    save_path = f"{output_dir}/output.wav"
-    # Run the tone color converter
-    tone_color_converter.convert(
-        audio_src_path=src_path,
-        src_se=source_se,
-        tgt_se=target_se,
-        output_path=save_path,
-    )
-    text_hint += f"""Get response successfully \n"""
-    return (
-        text_hint,
-        save_path,
-        speaker_wav,
-        src_path,
-    )
-title = "MyShell OpenVoice"
-content = """
-<div>
-  <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
-  This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
-</div>
-"""
-wrapped_markdown_content = (
-    f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
-)
-examples = [
-    [
-        "今天天气真好，我们一起出去吃饭吧。",
-        "examples/speaker0.mp3",
-        None,
-    ],
-    [
-        "This audio is generated by open voice with a half-performance model.",
-        "examples/speaker1.mp3",
-        None,
-    ],
-    [
-        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
-        "examples/speaker2.mp3",
-        None,
-    ],
-]
-with gr.Blocks(analytics_enabled=False) as demo:
-    with gr.Row():
-        with gr.Column():
-            with gr.Row():
-                gr.Markdown(
-                    """
-                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
-                    """
-                )
-    with gr.Row():
-        gr.HTML(wrapped_markdown_content)
-    with gr.Row():
-        with gr.Column():
-            input_text_gr = gr.Textbox(
-                label="Text Prompt",
-                info="One or two sentences at a time is better. Up to 200 text characters.",
-                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
-            )
-            ref_gr = gr.Audio(
-                label="Reference Audio",
-                info="Click on the ✎ button to upload your own target speaker audio",
-                type="filepath",
-                value="examples/speaker0.mp3",
-            )
-            tra_gr = gr.Audio(
-                label="Transform Audio",
-                info="Click on the ✎ button to upload your own target transform audio",
-                type="filepath",
-                value=None,
-            )
-            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
-        with gr.Column():
-            out_text_gr = gr.Text(label="Info")
-            audio_gr = gr.Audio(label="Synthesized Audio", autoplay=True)
-            ref_audio_gr = gr.Audio(label="Reference Audio Used")
-            tts_audio_gr = gr.Audio(label="TTS Generated Audio")
-            gr.Examples(
-                examples,
-                label="Examples",
-                inputs=[input_text_gr, ref_gr, tra_gr],
-                outputs=[out_text_gr, audio_gr, ref_audio_gr, tts_audio_gr],
-                fn=predict,
-                cache_examples=False,
-            )
-            tts_button.click(
-                predict,
-                [input_text_gr, ref_gr, tra_gr],
-                outputs=[out_text_gr, audio_gr, ref_audio_gr, tts_audio_gr],
-            )
-demo.queue()
-demo.launch(debug=True, show_api=True, share=args.share)

checkpoints/base_speakers/EN/checkpoint.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
-size 160467309

checkpoints/base_speakers/EN/config.json DELETED Viewed

@@ -1,145 +0,0 @@
-{
-  "data": {
-    "text_cleaners": [
-      "cjke_cleaners2"
-    ],
-    "sampling_rate": 22050,
-    "filter_length": 1024,
-    "hop_length": 256,
-    "win_length": 1024,
-    "n_mel_channels": 80,
-    "add_blank": true,
-    "cleaned_text": true,
-    "n_speakers": 10
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 192,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "n_layers_trans_flow": 3,
-    "kernel_size": 3,
-    "p_dropout": 0.1,
-    "resblock": "1",
-    "resblock_kernel_sizes": [
-      3,
-      7,
-      11
-    ],
-    "resblock_dilation_sizes": [
-      [
-        1,
-        3,
-        5
-      ],
-      [
-        1,
-        3,
-        5
-      ],
-      [
-        1,
-        3,
-        5
-      ]
-    ],
-    "upsample_rates": [
-      8,
-      8,
-      2,
-      2
-    ],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [
-      16,
-      16,
-      4,
-      4
-    ],
-    "n_layers_q": 3,
-    "use_spectral_norm": false,
-    "gin_channels": 256
-  },
-  "symbols": [
-    "_",
-    ",",
-    ".",
-    "!",
-    "?",
-    "-",
-    "~",
-    "\u2026",
-    "N",
-    "Q",
-    "a",
-    "b",
-    "d",
-    "e",
-    "f",
-    "g",
-    "h",
-    "i",
-    "j",
-    "k",
-    "l",
-    "m",
-    "n",
-    "o",
-    "p",
-    "s",
-    "t",
-    "u",
-    "v",
-    "w",
-    "x",
-    "y",
-    "z",
-    "\u0251",
-    "\u00e6",
-    "\u0283",
-    "\u0291",
-    "\u00e7",
-    "\u026f",
-    "\u026a",
-    "\u0254",
-    "\u025b",
-    "\u0279",
-    "\u00f0",
-    "\u0259",
-    "\u026b",
-    "\u0265",
-    "\u0278",
-    "\u028a",
-    "\u027e",
-    "\u0292",
-    "\u03b8",
-    "\u03b2",
-    "\u014b",
-    "\u0266",
-    "\u207c",
-    "\u02b0",
-    "`",
-    "^",
-    "#",
-    "*",
-    "=",
-    "\u02c8",
-    "\u02cc",
-    "\u2192",
-    "\u2193",
-    "\u2191",
-    " "
-  ],
-  "speakers": {
-    "default": 1,
-    "whispering": 2,
-    "shouting": 3,
-    "excited": 4,
-    "cheerful": 5,
-    "terrified": 6,
-    "angry": 7,
-    "sad": 8,
-    "friendly": 9
-  }
-}

checkpoints/base_speakers/EN/en_default_se.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
-size 1789

checkpoints/base_speakers/ZH/checkpoint.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
-size 160467309

checkpoints/base_speakers/ZH/config.json DELETED Viewed

@@ -1,137 +0,0 @@
-{
-  "data": {
-    "text_cleaners": [
-      "cjke_cleaners2"
-    ],
-    "sampling_rate": 22050,
-    "filter_length": 1024,
-    "hop_length": 256,
-    "win_length": 1024,
-    "n_mel_channels": 80,
-    "add_blank": true,
-    "cleaned_text": true,
-    "n_speakers": 10
-  },
-  "model": {
-    "inter_channels": 192,
-    "hidden_channels": 192,
-    "filter_channels": 768,
-    "n_heads": 2,
-    "n_layers": 6,
-    "n_layers_trans_flow": 3,
-    "kernel_size": 3,
-    "p_dropout": 0.1,
-    "resblock": "1",
-    "resblock_kernel_sizes": [
-      3,
-      7,
-      11
-    ],
-    "resblock_dilation_sizes": [
-      [
-        1,
-        3,
-        5
-      ],
-      [
-        1,
-        3,
-        5
-      ],
-      [
-        1,
-        3,
-        5
-      ]
-    ],
-    "upsample_rates": [
-      8,
-      8,
-      2,
-      2
-    ],
-    "upsample_initial_channel": 512,
-    "upsample_kernel_sizes": [
-      16,
-      16,
-      4,
-      4
-    ],
-    "n_layers_q": 3,
-    "use_spectral_norm": false,
-    "gin_channels": 256
-  },
-  "symbols": [
-    "_",
-    ",",
-    ".",
-    "!",
-    "?",
-    "-",
-    "~",
-    "\u2026",
-    "N",
-    "Q",
-    "a",
-    "b",
-    "d",
-    "e",
-    "f",
-    "g",
-    "h",
-    "i",
-    "j",
-    "k",
-    "l",
-    "m",
-    "n",
-    "o",
-    "p",
-    "s",
-    "t",
-    "u",
-    "v",
-    "w",
-    "x",
-    "y",
-    "z",
-    "\u0251",
-    "\u00e6",
-    "\u0283",
-    "\u0291",
-    "\u00e7",
-    "\u026f",
-    "\u026a",
-    "\u0254",
-    "\u025b",
-    "\u0279",
-    "\u00f0",
-    "\u0259",
-    "\u026b",
-    "\u0265",
-    "\u0278",
-    "\u028a",
-    "\u027e",
-    "\u0292",
-    "\u03b8",
-    "\u03b2",
-    "\u014b",
-    "\u0266",
-    "\u207c",
-    "\u02b0",
-    "`",
-    "^",
-    "#",
-    "*",
-    "=",
-    "\u02c8",
-    "\u02cc",
-    "\u2192",
-    "\u2193",
-    "\u2191",
-    " "
-  ],
-  "speakers": {
-    "default": 0
-  }
-}

checkpoints/base_speakers/ZH/zh_default_se.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
-size 1789

openvoice/api.py CHANGED Viewed

@@ -1,12 +1,9 @@
 import torch
-import numpy as np
-import re
 import soundfile
 from openvoice import utils
 from openvoice import commons
 import os
 import librosa
-from openvoice.text import text_to_sequence
 from openvoice.mel_processing import spectrogram_torch
 from openvoice.models import SynthesizerTrn
@@ -39,65 +36,6 @@ class OpenVoiceBaseClass(object):
         print('missing/unexpected keys:', a, b)
-class BaseSpeakerTTS(OpenVoiceBaseClass):
-    language_marks = {
-        "english": "EN",
-        "chinese": "ZH",
-    }
-    @staticmethod
-    def get_text(text, hps, is_symbol):
-        text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
-        if hps.data.add_blank:
-            text_norm = commons.intersperse(text_norm, 0)
-        text_norm = torch.LongTensor(text_norm)
-        return text_norm
-    @staticmethod
-    def audio_numpy_concat(segment_data_list, sr, speed=1.):
-        audio_segments = []
-        for segment_data in segment_data_list:
-            audio_segments += segment_data.reshape(-1).tolist()
-            audio_segments += [0] * int((sr * 0.05)/speed)
-        audio_segments = np.array(audio_segments).astype(np.float32)
-        return audio_segments
-    @staticmethod
-    def split_sentences_into_pieces(text, language_str):
-        texts = utils.split_sentence(text, language_str=language_str)
-        print(" > Text splitted to sentences.")
-        print('\n'.join(texts))
-        print(" > ===========================")
-        return texts
-    def tts(self, text, output_path, speaker, language='English', speed=1.0):
-        mark = self.language_marks.get(language.lower(), None)
-        assert mark is not None, f"language {language} is not supported"
-        texts = self.split_sentences_into_pieces(text, mark)
-        audio_list = []
-        for t in texts:
-            t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
-            t = f'[{mark}]{t}[{mark}]'
-            stn_tst = self.get_text(t, self.hps, False)
-            device = self.device
-            speaker_id = self.hps.speakers[speaker]
-            with torch.no_grad():
-                x_tst = stn_tst.unsqueeze(0).to(device)
-                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
-                sid = torch.LongTensor([speaker_id]).to(device)
-                audio = self.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6,
-                                    length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
-            audio_list.append(audio)
-        audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)
-        if output_path is None:
-            return audio
-        else:
-            soundfile.write(output_path, audio, self.hps.data.sampling_rate)
 class ToneColorConverter(OpenVoiceBaseClass):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

 import torch
 import soundfile
 from openvoice import utils
 from openvoice import commons
 import os
 import librosa
 from openvoice.mel_processing import spectrogram_torch
 from openvoice.models import SynthesizerTrn
         print('missing/unexpected keys:', a, b)
 class ToneColorConverter(OpenVoiceBaseClass):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

openvoice/openvoice_app.py DELETED Viewed

@@ -1,275 +0,0 @@
-import os
-import torch
-import argparse
-import gradio as gr
-from zipfile import ZipFile
-import langid
-from openvoice import se_extractor
-from openvoice.api import BaseSpeakerTTS, ToneColorConverter
-parser = argparse.ArgumentParser()
-parser.add_argument("--share", action='store_true', default=False, help="make link public")
-args = parser.parse_args()
-en_ckpt_base = 'checkpoints/base_speakers/EN'
-zh_ckpt_base = 'checkpoints/base_speakers/ZH'
-ckpt_converter = 'checkpoints/converter'
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-output_dir = 'outputs'
-os.makedirs(output_dir, exist_ok=True)
-# load models
-en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
-en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
-zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
-zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
-tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
-tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
-# load speaker embeddings
-en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
-en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
-zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
-# This online demo mainly supports English and Chinese
-supported_languages = ['zh', 'en']
-def predict(prompt, style, audio_file_pth, agree):
-    # initialize a empty info
-    text_hint = ''
-    # agree with the terms
-    if agree == False:
-        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
-        gr.Warning("Please accept the Terms & Condition!")
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    # first detect the input language
-    language_predicted = langid.classify(prompt)[0].strip()
-    print(f"Detected language:{language_predicted}")
-    if language_predicted not in supported_languages:
-        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
-        gr.Warning(
-            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
-        )
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    if language_predicted == "zh":
-        tts_model = zh_base_speaker_tts
-        source_se = zh_source_se
-        language = 'Chinese'
-        if style not in ['default']:
-            text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
-            gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
-            return (
-                text_hint,
-                None,
-                None,
-            )
-    else:
-        tts_model = en_base_speaker_tts
-        if style == 'default':
-            source_se = en_source_default_se
-        else:
-            source_se = en_source_style_se
-        language = 'English'
-        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
-            text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
-            gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
-            return (
-                text_hint,
-                None,
-                None,
-            )
-    speaker_wav = audio_file_pth
-    if len(prompt) < 2:
-        text_hint += f"[ERROR] Please give a longer prompt text \n"
-        gr.Warning("Please give a longer prompt text")
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    if len(prompt) > 200:
-        text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
-        gr.Warning(
-            "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
-        )
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
-    try:
-        target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
-    except Exception as e:
-        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
-        gr.Warning(
-            "[ERROR] Get target tone color error {str(e)} \n"
-        )
-        return (
-            text_hint,
-            None,
-            None,
-        )
-    src_path = f'{output_dir}/tmp.wav'
-    tts_model.tts(prompt, src_path, speaker=style, language=language)
-    save_path = f'{output_dir}/output.wav'
-    # Run the tone color converter
-    encode_message = "@MyShell"
-    tone_color_converter.convert(
-        audio_src_path=src_path,
-        src_se=source_se,
-        tgt_se=target_se,
-        output_path=save_path,
-        message=encode_message)
-    text_hint += f'''Get response successfully \n'''
-    return (
-        text_hint,
-        save_path,
-        speaker_wav,
-    )
-title = "MyShell OpenVoice"
-description = """
-We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
-"""
-markdown_table = """
-<div align="center" style="margin-bottom: 10px;">
-|               |               |               |
-| :-----------: | :-----------: | :-----------: |
-| **OpenSource Repo** | **Project Page** | **Join the Community** |
-| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
-</div>
-"""
-markdown_table_v2 = """
-<div align="center" style="margin-bottom: 2px;">
-|               |               |               |              |
-| :-----------: | :-----------: | :-----------: | :-----------: |
-| **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> |  **Project Page** |  [OpenVoice](https://research.myshell.ai/open-voice) |
-| | |
-| :-----------: | :-----------: |
-**Join the Community** |   [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
-</div>
-"""
-content = """
-<div>
-  <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
-  This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
-</div>
-"""
-wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
-examples = [
-    [
-        "今天天气真好，我们一起出去吃饭吧。",
-        'default',
-        "resources/demo_speaker1.mp3",
-        True,
-    ],[
-        "This audio is generated by open voice with a half-performance model.",
-        'whispering',
-        "resources/demo_speaker2.mp3",
-        True,
-    ],
-    [
-        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
-        'sad',
-        "resources/demo_speaker0.mp3",
-        True,
-    ],
-]
-with gr.Blocks(analytics_enabled=False) as demo:
-    with gr.Row():
-        with gr.Column():
-            with gr.Row():
-                gr.Markdown(
-                    """
-                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
-                    """
-                )
-            with gr.Row():
-                gr.Markdown(markdown_table_v2)
-            with gr.Row():
-                gr.Markdown(description)
-        with gr.Column():
-            gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
-    with gr.Row():
-        gr.HTML(wrapped_markdown_content)
-    with gr.Row():
-        with gr.Column():
-            input_text_gr = gr.Textbox(
-                label="Text Prompt",
-                info="One or two sentences at a time is better. Up to 200 text characters.",
-                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
-            )
-            style_gr = gr.Dropdown(
-                label="Style",
-                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
-                choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
-                max_choices=1,
-                value="default",
-            )
-            ref_gr = gr.Audio(
-                label="Reference Audio",
-                info="Click on the ✎ button to upload your own target speaker audio",
-                type="filepath",
-                value="resources/demo_speaker2.mp3",
-            )
-            tos_gr = gr.Checkbox(
-                label="Agree",
-                value=False,
-                info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
-            )
-            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
-        with gr.Column():
-            out_text_gr = gr.Text(label="Info")
-            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
-            ref_audio_gr = gr.Audio(label="Reference Audio Used")
-            gr.Examples(examples,
-                        label="Examples",
-                        inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
-                        outputs=[out_text_gr, audio_gr, ref_audio_gr],
-                        fn=predict,
-                        cache_examples=False,)
-            tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
-demo.queue()
-demo.launch(debug=True, show_api=True, share=args.share)

openvoice/text/__init__.py DELETED Viewed

@@ -1,79 +0,0 @@
-""" from https://github.com/keithito/tacotron """
-from openvoice.text import cleaners
-from openvoice.text.symbols import symbols
-# Mappings from symbol to numeric ID and vice versa:
-_symbol_to_id = {s: i for i, s in enumerate(symbols)}
-_id_to_symbol = {i: s for i, s in enumerate(symbols)}
-def text_to_sequence(text, symbols, cleaner_names):
-  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-    Args:
-      text: string to convert to a sequence
-      cleaner_names: names of the cleaner functions to run the text through
-    Returns:
-      List of integers corresponding to the symbols in the text
-  '''
-  sequence = []
-  symbol_to_id = {s: i for i, s in enumerate(symbols)}
-  clean_text = _clean_text(text, cleaner_names)
-  print(clean_text)
-  print(f" length:{len(clean_text)}")
-  for symbol in clean_text:
-    if symbol not in symbol_to_id.keys():
-      continue
-    symbol_id = symbol_to_id[symbol]
-    sequence += [symbol_id]
-  print(f" length:{len(sequence)}")
-  return sequence
-def cleaned_text_to_sequence(cleaned_text, symbols):
-  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-    Args:
-      text: string to convert to a sequence
-    Returns:
-      List of integers corresponding to the symbols in the text
-  '''
-  symbol_to_id = {s: i for i, s in enumerate(symbols)}
-  sequence = [symbol_to_id[symbol] for symbol in cleaned_text if symbol in symbol_to_id.keys()]
-  return sequence
-from openvoice.text.symbols import language_tone_start_map
-def cleaned_text_to_sequence_vits2(cleaned_text, tones, language, symbols, languages):
-    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-    Args:
-      text: string to convert to a sequence
-    Returns:
-      List of integers corresponding to the symbols in the text
-    """
-    symbol_to_id = {s: i for i, s in enumerate(symbols)}
-    language_id_map = {s: i for i, s in enumerate(languages)}
-    phones = [symbol_to_id[symbol] for symbol in cleaned_text]
-    tone_start = language_tone_start_map[language]
-    tones = [i + tone_start for i in tones]
-    lang_id = language_id_map[language]
-    lang_ids = [lang_id for i in phones]
-    return phones, tones, lang_ids
-def sequence_to_text(sequence):
-  '''Converts a sequence of IDs back to a string'''
-  result = ''
-  for symbol_id in sequence:
-    s = _id_to_symbol[symbol_id]
-    result += s
-  return result
-def _clean_text(text, cleaner_names):
-  for name in cleaner_names:
-    cleaner = getattr(cleaners, name)
-    if not cleaner:
-      raise Exception('Unknown cleaner: %s' % name)
-    text = cleaner(text)
-  return text

openvoice/text/cleaners.py DELETED Viewed

@@ -1,16 +0,0 @@
-import re
-from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
-from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
-def cjke_cleaners2(text):
-    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
-    text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
-    text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_ipa(x.group(1))+' ', text)
-    text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_ipa2(x.group(1))+' ', text)
-    text = re.sub(r'\s+$', '', text)
-    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
-    return text

openvoice/text/english.py DELETED Viewed

@@ -1,188 +0,0 @@
-""" from https://github.com/keithito/tacotron """
-'''
-Cleaners are transformations that run over the input text at both training and eval time.
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-     the symbols in symbols.py to match your data).
-'''
-# Regular expression matching whitespace:
-import re
-import inflect
-from unidecode import unidecode
-import eng_to_ipa as ipa
-_inflect = inflect.engine()
-_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
-_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
-_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
-_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
-_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
-_number_re = re.compile(r'[0-9]+')
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
-    ('mrs', 'misess'),
-    ('mr', 'mister'),
-    ('dr', 'doctor'),
-    ('st', 'saint'),
-    ('co', 'company'),
-    ('jr', 'junior'),
-    ('maj', 'major'),
-    ('gen', 'general'),
-    ('drs', 'doctors'),
-    ('rev', 'reverend'),
-    ('lt', 'lieutenant'),
-    ('hon', 'honorable'),
-    ('sgt', 'sergeant'),
-    ('capt', 'captain'),
-    ('esq', 'esquire'),
-    ('ltd', 'limited'),
-    ('col', 'colonel'),
-    ('ft', 'fort'),
-]]
-# List of (ipa, lazy ipa) pairs:
-_lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
-    ('r', 'ɹ'),
-    ('æ', 'e'),
-    ('ɑ', 'a'),
-    ('ɔ', 'o'),
-    ('ð', 'z'),
-    ('θ', 's'),
-    ('ɛ', 'e'),
-    ('ɪ', 'i'),
-    ('ʊ', 'u'),
-    ('ʒ', 'ʥ'),
-    ('ʤ', 'ʥ'),
-    ('ˈ', '↓'),
-]]
-# List of (ipa, lazy ipa2) pairs:
-_lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
-    ('r', 'ɹ'),
-    ('ð', 'z'),
-    ('θ', 's'),
-    ('ʒ', 'ʑ'),
-    ('ʤ', 'dʑ'),
-    ('ˈ', '↓'),
-]]
-# List of (ipa, ipa2) pairs
-_ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
-    ('r', 'ɹ'),
-    ('ʤ', 'dʒ'),
-    ('ʧ', 'tʃ')
-]]
-def expand_abbreviations(text):
-    for regex, replacement in _abbreviations:
-        text = re.sub(regex, replacement, text)
-    return text
-def collapse_whitespace(text):
-    return re.sub(r'\s+', ' ', text)
-def _remove_commas(m):
-    return m.group(1).replace(',', '')
-def _expand_decimal_point(m):
-    return m.group(1).replace('.', ' point ')
-def _expand_dollars(m):
-    match = m.group(1)
-    parts = match.split('.')
-    if len(parts) > 2:
-        return match + ' dollars'  # Unexpected format
-    dollars = int(parts[0]) if parts[0] else 0
-    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
-    if dollars and cents:
-        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-        cent_unit = 'cent' if cents == 1 else 'cents'
-        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
-    elif dollars:
-        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-        return '%s %s' % (dollars, dollar_unit)
-    elif cents:
-        cent_unit = 'cent' if cents == 1 else 'cents'
-        return '%s %s' % (cents, cent_unit)
-    else:
-        return 'zero dollars'
-def _expand_ordinal(m):
-    return _inflect.number_to_words(m.group(0))
-def _expand_number(m):
-    num = int(m.group(0))
-    if num > 1000 and num < 3000:
-        if num == 2000:
-            return 'two thousand'
-        elif num > 2000 and num < 2010:
-            return 'two thousand ' + _inflect.number_to_words(num % 100)
-        elif num % 100 == 0:
-            return _inflect.number_to_words(num // 100) + ' hundred'
-        else:
-            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
-    else:
-        return _inflect.number_to_words(num, andword='')
-def normalize_numbers(text):
-    text = re.sub(_comma_number_re, _remove_commas, text)
-    text = re.sub(_pounds_re, r'\1 pounds', text)
-    text = re.sub(_dollars_re, _expand_dollars, text)
-    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
-    text = re.sub(_ordinal_re, _expand_ordinal, text)
-    text = re.sub(_number_re, _expand_number, text)
-    return text
-def mark_dark_l(text):
-    return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)
-def english_to_ipa(text):
-    text = unidecode(text).lower()
-    text = expand_abbreviations(text)
-    text = normalize_numbers(text)
-    phonemes = ipa.convert(text)
-    phonemes = collapse_whitespace(phonemes)
-    return phonemes
-def english_to_lazy_ipa(text):
-    text = english_to_ipa(text)
-    for regex, replacement in _lazy_ipa:
-        text = re.sub(regex, replacement, text)
-    return text
-def english_to_ipa2(text):
-    text = english_to_ipa(text)
-    text = mark_dark_l(text)
-    for regex, replacement in _ipa_to_ipa2:
-        text = re.sub(regex, replacement, text)
-    return text.replace('...', '…')
-def english_to_lazy_ipa2(text):
-    text = english_to_ipa(text)
-    for regex, replacement in _lazy_ipa2:
-        text = re.sub(regex, replacement, text)
-    return text

openvoice/text/mandarin.py DELETED Viewed

@@ -1,326 +0,0 @@
-import os
-import sys
-import re
-from pypinyin import lazy_pinyin, BOPOMOFO
-import jieba
-import cn2an
-import logging
-# List of (Latin alphabet, bopomofo) pairs:
-_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
-    ('a', 'ㄟˉ'),
-    ('b', 'ㄅㄧˋ'),
-    ('c', 'ㄙㄧˉ'),
-    ('d', 'ㄉㄧˋ'),
-    ('e', 'ㄧˋ'),
-    ('f', 'ㄝˊㄈㄨˋ'),
-    ('g', 'ㄐㄧˋ'),
-    ('h', 'ㄝˇㄑㄩˋ'),
-    ('i', 'ㄞˋ'),
-    ('j', 'ㄐㄟˋ'),
-    ('k', 'ㄎㄟˋ'),
-    ('l', 'ㄝˊㄛˋ'),
-    ('m', 'ㄝˊㄇㄨˋ'),
-    ('n', 'ㄣˉ'),
-    ('o', 'ㄡˉ'),
-    ('p', 'ㄆㄧˉ'),
-    ('q', 'ㄎㄧㄡˉ'),
-    ('r', 'ㄚˋ'),
-    ('s', 'ㄝˊㄙˋ'),
-    ('t', 'ㄊㄧˋ'),
-    ('u', 'ㄧㄡˉ'),
-    ('v', 'ㄨㄧˉ'),
-    ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
-    ('x', 'ㄝˉㄎㄨˋㄙˋ'),
-    ('y', 'ㄨㄞˋ'),
-    ('z', 'ㄗㄟˋ')
-]]
-# List of (bopomofo, romaji) pairs:
-_bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
-    ('ㄅㄛ', 'p⁼wo'),
-    ('ㄆㄛ', 'pʰwo'),
-    ('ㄇㄛ', 'mwo'),
-    ('ㄈㄛ', 'fwo'),
-    ('ㄅ', 'p⁼'),
-    ('ㄆ', 'pʰ'),
-    ('ㄇ', 'm'),
-    ('ㄈ', 'f'),
-    ('ㄉ', 't⁼'),
-    ('ㄊ', 'tʰ'),
-    ('ㄋ', 'n'),
-    ('ㄌ', 'l'),
-    ('ㄍ', 'k⁼'),
-    ('ㄎ', 'kʰ'),
-    ('ㄏ', 'h'),
-    ('ㄐ', 'ʧ⁼'),
-    ('ㄑ', 'ʧʰ'),
-    ('ㄒ', 'ʃ'),
-    ('ㄓ', 'ʦ`⁼'),
-    ('ㄔ', 'ʦ`ʰ'),
-    ('ㄕ', 's`'),
-    ('ㄖ', 'ɹ`'),
-    ('ㄗ', 'ʦ⁼'),
-    ('ㄘ', 'ʦʰ'),
-    ('ㄙ', 's'),
-    ('ㄚ', 'a'),
-    ('ㄛ', 'o'),
-    ('ㄜ', 'ə'),
-    ('ㄝ', 'e'),
-    ('ㄞ', 'ai'),
-    ('ㄟ', 'ei'),
-    ('ㄠ', 'au'),
-    ('ㄡ', 'ou'),
-    ('ㄧㄢ', 'yeNN'),
-    ('ㄢ', 'aNN'),
-    ('ㄧㄣ', 'iNN'),
-    ('ㄣ', 'əNN'),
-    ('ㄤ', 'aNg'),
-    ('ㄧㄥ', 'iNg'),
-    ('ㄨㄥ', 'uNg'),
-    ('ㄩㄥ', 'yuNg'),
-    ('ㄥ', 'əNg'),
-    ('ㄦ', 'əɻ'),
-    ('ㄧ', 'i'),
-    ('ㄨ', 'u'),
-    ('ㄩ', 'ɥ'),
-    ('ˉ', '→'),
-    ('ˊ', '↑'),
-    ('ˇ', '↓↑'),
-    ('ˋ', '↓'),
-    ('˙', ''),
-    ('，', ','),
-    ('。', '.'),
-    ('！', '!'),
-    ('？', '?'),
-    ('—', '-')
-]]
-# List of (romaji, ipa) pairs:
-_romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
-    ('ʃy', 'ʃ'),
-    ('ʧʰy', 'ʧʰ'),
-    ('ʧ⁼y', 'ʧ⁼'),
-    ('NN', 'n'),
-    ('Ng', 'ŋ'),
-    ('y', 'j'),
-    ('h', 'x')
-]]
-# List of (bopomofo, ipa) pairs:
-_bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
-    ('ㄅㄛ', 'p⁼wo'),
-    ('ㄆㄛ', 'pʰwo'),
-    ('ㄇㄛ', 'mwo'),
-    ('ㄈㄛ', 'fwo'),
-    ('ㄅ', 'p⁼'),
-    ('ㄆ', 'pʰ'),
-    ('ㄇ', 'm'),
-    ('ㄈ', 'f'),
-    ('ㄉ', 't⁼'),
-    ('ㄊ', 'tʰ'),
-    ('ㄋ', 'n'),
-    ('ㄌ', 'l'),
-    ('ㄍ', 'k⁼'),
-    ('ㄎ', 'kʰ'),
-    ('ㄏ', 'x'),
-    ('ㄐ', 'tʃ⁼'),
-    ('ㄑ', 'tʃʰ'),
-    ('ㄒ', 'ʃ'),
-    ('ㄓ', 'ts`⁼'),
-    ('ㄔ', 'ts`ʰ'),
-    ('ㄕ', 's`'),
-    ('ㄖ', 'ɹ`'),
-    ('ㄗ', 'ts⁼'),
-    ('ㄘ', 'tsʰ'),
-    ('ㄙ', 's'),
-    ('ㄚ', 'a'),
-    ('ㄛ', 'o'),
-    ('ㄜ', 'ə'),
-    ('ㄝ', 'ɛ'),
-    ('ㄞ', 'aɪ'),
-    ('ㄟ', 'eɪ'),
-    ('ㄠ', 'ɑʊ'),
-    ('ㄡ', 'oʊ'),
-    ('ㄧㄢ', 'jɛn'),
-    ('ㄩㄢ', 'ɥæn'),
-    ('ㄢ', 'an'),
-    ('ㄧㄣ', 'in'),
-    ('ㄩㄣ', 'ɥn'),
-    ('ㄣ', 'ən'),
-    ('ㄤ', 'ɑŋ'),
-    ('ㄧㄥ', 'iŋ'),
-    ('ㄨㄥ', 'ʊŋ'),
-    ('ㄩㄥ', 'jʊŋ'),
-    ('ㄥ', 'əŋ'),
-    ('ㄦ', 'əɻ'),
-    ('ㄧ', 'i'),
-    ('ㄨ', 'u'),
-    ('ㄩ', 'ɥ'),
-    ('ˉ', '→'),
-    ('ˊ', '↑'),
-    ('ˇ', '↓↑'),
-    ('ˋ', '↓'),
-    ('˙', ''),
-    ('，', ','),
-    ('。', '.'),
-    ('！', '!'),
-    ('？', '?'),
-    ('—', '-')
-]]
-# List of (bopomofo, ipa2) pairs:
-_bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
-    ('ㄅㄛ', 'pwo'),
-    ('ㄆㄛ', 'pʰwo'),
-    ('ㄇㄛ', 'mwo'),
-    ('ㄈㄛ', 'fwo'),
-    ('ㄅ', 'p'),
-    ('ㄆ', 'pʰ'),
-    ('ㄇ', 'm'),
-    ('ㄈ', 'f'),
-    ('ㄉ', 't'),
-    ('ㄊ', 'tʰ'),
-    ('ㄋ', 'n'),
-    ('ㄌ', 'l'),
-    ('ㄍ', 'k'),
-    ('ㄎ', 'kʰ'),
-    ('ㄏ', 'h'),
-    ('ㄐ', 'tɕ'),
-    ('ㄑ', 'tɕʰ'),
-    ('ㄒ', 'ɕ'),
-    ('ㄓ', 'tʂ'),
-    ('ㄔ', 'tʂʰ'),
-    ('ㄕ', 'ʂ'),
-    ('ㄖ', 'ɻ'),
-    ('ㄗ', 'ts'),
-    ('ㄘ', 'tsʰ'),
-    ('ㄙ', 's'),
-    ('ㄚ', 'a'),
-    ('ㄛ', 'o'),
-    ('ㄜ', 'ɤ'),
-    ('ㄝ', 'ɛ'),
-    ('ㄞ', 'aɪ'),
-    ('ㄟ', 'eɪ'),
-    ('ㄠ', 'ɑʊ'),
-    ('ㄡ', 'oʊ'),
-    ('ㄧㄢ', 'jɛn'),
-    ('ㄩㄢ', 'yæn'),
-    ('ㄢ', 'an'),
-    ('ㄧㄣ', 'in'),
-    ('ㄩㄣ', 'yn'),
-    ('ㄣ', 'ən'),
-    ('ㄤ', 'ɑŋ'),
-    ('ㄧㄥ', 'iŋ'),
-    ('ㄨㄥ', 'ʊŋ'),
-    ('ㄩㄥ', 'jʊŋ'),
-    ('ㄥ', 'ɤŋ'),
-    ('ㄦ', 'əɻ'),
-    ('ㄧ', 'i'),
-    ('ㄨ', 'u'),
-    ('ㄩ', 'y'),
-    ('ˉ', '˥'),
-    ('ˊ', '˧˥'),
-    ('ˇ', '˨˩˦'),
-    ('ˋ', '˥˩'),
-    ('˙', ''),
-    ('，', ','),
-    ('。', '.'),
-    ('！', '!'),
-    ('？', '?'),
-    ('—', '-')
-]]
-def number_to_chinese(text):
-    numbers = re.findall(r'\d+(?:\.?\d+)?', text)
-    for number in numbers:
-        text = text.replace(number, cn2an.an2cn(number), 1)
-    return text
-def chinese_to_bopomofo(text):
-    text = text.replace('、', '，').replace('；', '，').replace('：', '，')
-    words = jieba.lcut(text, cut_all=False)
-    text = ''
-    for word in words:
-        bopomofos = lazy_pinyin(word, BOPOMOFO)
-        if not re.search('[\u4e00-\u9fff]', word):
-            text += word
-            continue
-        for i in range(len(bopomofos)):
-            bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
-        if text != '':
-            text += ' '
-        text += ''.join(bopomofos)
-    return text
-def latin_to_bopomofo(text):
-    for regex, replacement in _latin_to_bopomofo:
-        text = re.sub(regex, replacement, text)
-    return text
-def bopomofo_to_romaji(text):
-    for regex, replacement in _bopomofo_to_romaji:
-        text = re.sub(regex, replacement, text)
-    return text
-def bopomofo_to_ipa(text):
-    for regex, replacement in _bopomofo_to_ipa:
-        text = re.sub(regex, replacement, text)
-    return text
-def bopomofo_to_ipa2(text):
-    for regex, replacement in _bopomofo_to_ipa2:
-        text = re.sub(regex, replacement, text)
-    return text
-def chinese_to_romaji(text):
-    text = number_to_chinese(text)
-    text = chinese_to_bopomofo(text)
-    text = latin_to_bopomofo(text)
-    text = bopomofo_to_romaji(text)
-    text = re.sub('i([aoe])', r'y\1', text)
-    text = re.sub('u([aoəe])', r'w\1', text)
-    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
-                  r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
-    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
-    return text
-def chinese_to_lazy_ipa(text):
-    text = chinese_to_romaji(text)
-    for regex, replacement in _romaji_to_ipa:
-        text = re.sub(regex, replacement, text)
-    return text
-def chinese_to_ipa(text):
-    text = number_to_chinese(text)
-    text = chinese_to_bopomofo(text)
-    text = latin_to_bopomofo(text)
-    text = bopomofo_to_ipa(text)
-    text = re.sub('i([aoe])', r'j\1', text)
-    text = re.sub('u([aoəe])', r'w\1', text)
-    text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
-                  r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
-    text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
-    return text
-def chinese_to_ipa2(text):
-    text = number_to_chinese(text)
-    text = chinese_to_bopomofo(text)
-    text = latin_to_bopomofo(text)
-    text = bopomofo_to_ipa2(text)
-    text = re.sub(r'i([aoe])', r'j\1', text)
-    text = re.sub(r'u([aoəe])', r'w\1', text)
-    text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
-    text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
-    return text

openvoice/text/symbols.py DELETED Viewed

@@ -1,88 +0,0 @@
-'''
-Defines the set of symbols used in text input to the model.
-'''
-# japanese_cleaners
-# _pad        = '_'
-# _punctuation = ',.!?-'
-# _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
-'''# japanese_cleaners2
-_pad        = '_'
-_punctuation = ',.!?-~…'
-_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
-'''
-'''# korean_cleaners
-_pad        = '_'
-_punctuation = ',.!?…~'
-_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
-'''
-'''# chinese_cleaners
-_pad        = '_'
-_punctuation = '，。！？—…'
-_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
-'''
-# # zh_ja_mixture_cleaners
-# _pad        = '_'
-# _punctuation = ',.!?-~…'
-# _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
-'''# sanskrit_cleaners
-_pad        = '_'
-_punctuation = '।'
-_letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
-'''
-'''# cjks_cleaners
-_pad        = '_'
-_punctuation = ',.!?-~…'
-_letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
-'''
-'''# thai_cleaners
-_pad        = '_'
-_punctuation = '.!? '
-_letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
-'''
-# # cjke_cleaners2
-_pad        = '_'
-_punctuation = ',.!?-~…'
-_letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '
-'''# shanghainese_cleaners
-_pad        = '_'
-_punctuation = ',.!?…'
-_letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
-'''
-'''# chinese_dialect_cleaners
-_pad        = '_'
-_punctuation = ',.!?~…─'
-_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
-'''
-# Export all symbols:
-symbols = [_pad] + list(_punctuation) + list(_letters)
-# Special symbol ids
-SPACE_ID = symbols.index(" ")
-num_ja_tones = 1
-num_kr_tones = 1
-num_zh_tones = 6
-num_en_tones = 4
-language_tone_start_map = {
-    "ZH": 0,
-    "JP": num_zh_tones,
-    "EN": num_zh_tones + num_ja_tones,
-    'KR': num_zh_tones + num_ja_tones + num_en_tones,
-}

requirements.txt CHANGED Viewed

@@ -1,16 +1,8 @@
-langid
 librosa==0.9.1
 faster-whisper==0.9.0
 pydub==0.25.1
-wavmark==0.0.2
 numpy==1.22.0
-eng_to_ipa==0.0.2
-inflect==7.0.0
-unidecode==1.3.7
 whisper-timestamped==1.14.2
 openai
-python-dotenv
-pypinyin==0.50.0
-cn2an==0.5.22
-jieba==0.42.1
-torch

 librosa==0.9.1
 faster-whisper==0.9.0
 pydub==0.25.1
 numpy==1.22.0
 whisper-timestamped==1.14.2
 openai
+torch
+torchaudio