Update app.py
app.py CHANGED
@@ -1,166 +1,56 @@
-#
-#
-# # # from TTS.api import TTS
-# # # import os
-
-# # # # Agree to the Coqui license terms
-# # # os.environ["COQUI_TOS_AGREED"] = "1"
-
-# # # # Initialize device for GPU or CPU
-# # # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# # # # Initialize TTS model
-# # # tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
-
-# # # # Define clone function for Gradio interface
-# # # def clone(text, audio):
-# # #     # Use the reference audio and synthesize speech
-# # #     output_path = "./output.wav"
-# # #     tts.tts_to_file(text=text, speaker_wav=audio, language="en", file_path=output_path)
-# # #     return output_path
-
-# # # # Set up the Gradio Interface
-# # # iface = gr.Interface(
-# # #     fn=clone,
-# # #     inputs=[
-# # #         gr.Textbox(label='Text'),
-# # #         gr.Audio(type='filepath', label='Voice reference audio file')
-# # #     ],
-# # #     outputs=gr.Audio(type='filepath'),
-# # #     title='Voice Clone Trial',
-# # #     theme=gr.themes.Base(primary_hue="red", secondary_hue="orange", neutral_hue="black"),
-# # # )
-
-# # # # Launch the interface
-# # # iface.launch()
-
-
-# # import whisper
-# # from gtts import gTTS
-# # import gradio as gr
-# # import os
-
-# # # Load Whisper model
-# # model = whisper.load_model("base")
-
-# # # Function to transcribe and convert text to speech
-# # def audio_to_audio(input_audio):
-# #     # Check if the audio path exists (Gradio will pass a file path as input_audio)
-# #     if input_audio is None:
-# #         return "No audio file provided", None
-
-# #     # Transcribe the audio using Whisper
-# #     result = model.transcribe(input_audio)
-# #     transcription = result["text"]
-
-# #     # Convert transcription to speech using gTTS
-# #     tts = gTTS(transcription)
-
-# #     # Save the output in a temporary directory
-# #     output_audio_path = "/tmp/output_audio.mp3"
-# #     tts.save(output_audio_path)
-
-# #     # Return the transcribed text and the path to the generated speech
-# #     return transcription, output_audio_path
-
-# # # Create Gradio interface
-# # interface = gr.Interface(
-# #     fn=audio_to_audio,
-# #     inputs=gr.Audio(type="filepath"),
-# #     outputs=[gr.Textbox(label="Transcription"), gr.Audio(label="Generated Audio")]
-# # )
-
-# # # Launch the Gradio app
-# # interface.launch()
-
-
-# import gradio as gr
-# from transformers import AutoModelForCausalLM, AutoTokenizer
-# import torch
-# import soundfile as sf
-
-# # Load the Coqui XTTS model and tokenizer
-# model_name = "coqui/XTTS-v2"
-# model = AutoModelForCausalLM.from_pretrained(model_name)
-# tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# # Function to generate speech
-# def generate_speech(text):
-#     # Tokenize the input text
-#     inputs = tokenizer(text, return_tensors="pt")
-
-#     # Generate speech using the model
-#     with torch.no_grad():
-#         speech = model.generate(**inputs)
-
-#     # Convert the output speech into a wav format
-#     speech_wav = speech[0].cpu().numpy()  # assuming the output is in a NumPy array
-#     output_path = "/tmp/output.wav"  # save it in the temporary directory
-
-#     # Save audio as .wav file
-#     sf.write(output_path, speech_wav, samplerate=16000)
-
-#     return output_path
-
-# # Create the Gradio interface
-# interface = gr.Interface(
-#     fn=generate_speech,
-#     inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
-#     outputs=gr.Audio(label="Generated Speech"),
-#     title="Text to Speech with Coqui XTTS-v2",
-#     description="Generate speech from text using the Coqui XTTS-v2 model."
-# )
-
-# # Launch the Gradio app
-# interface.launch()
-
-
+# WebUI by mrfakename
+# Demo also available on HF Spaces: https://huggingface.co/spaces/mrfakename/MeloTTS
 import gradio as gr
-import torch
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-)
-
-
-
-
+import os, torch, io
+os.system('python -m unidic download')
+# print("Make sure you've downloaded unidic (python -m unidic download) for this WebUI to work.")
+from melo.api import TTS
+speed = 1.0
+import tempfile
+import nltk
+nltk.download('averaged_perceptron_tagger_eng')
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+models = {
+    'EN': TTS(language='EN', device=device),
+    'ES': TTS(language='ES', device=device),
+    'FR': TTS(language='FR', device=device),
+    'ZH': TTS(language='ZH', device=device),
+    'JP': TTS(language='JP', device=device),
+    'KR': TTS(language='KR', device=device),
+}
+speaker_ids = models['EN'].hps.data.spk2id
+
+default_text_dict = {
+    'EN': 'The field of text-to-speech has seen rapid development recently.',
+    'ES': 'El campo de la conversión de texto a voz ha experimentado un rápido desarrollo recientemente.',
+    'FR': 'Le domaine de la synthèse vocale a connu un développement rapide récemment',
+    'ZH': 'text-to-speech 领域近年来发展迅速',
+    'JP': 'テキスト読み上げの分野は最近急速な発展を遂げています',
+    'KR': '최근 텍스트 음성 변환 분야가 급속도로 발전하고 있습니다.',
+}
+
+def synthesize(text, speaker, speed, language, progress=gr.Progress()):
+    bio = io.BytesIO()
+    models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, pbar=progress.tqdm, format='wav')
+    return bio.getvalue()
+def load_speakers(language, text):
+    if text in list(default_text_dict.values()):
+        newtext = default_text_dict[language]
+    else:
+        newtext = text
+    return gr.update(value=list(models[language].hps.data.spk2id.keys())[0], choices=list(models[language].hps.data.spk2id.keys())), newtext
+with gr.Blocks() as demo:
+    gr.Markdown('# MeloTTS Demo\n\nAn unofficial demo for [MeloTTS](https://github.com/myshell-ai/MeloTTS). **Make sure to try out several speakers, for example EN-Default!**')
+    with gr.Group():
+        speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-US', label='Speaker')
+        language = gr.Radio(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN')
+        speed = gr.Slider(label='Speed', minimum=0.1, maximum=10.0, value=1.0, interactive=True, step=0.1)
+        text = gr.Textbox(label="Text to speak", value=default_text_dict['EN'])
+    language.input(load_speakers, inputs=[language, text], outputs=[speaker, text])
+    btn = gr.Button('Synthesize', variant='primary')
+    aud = gr.Audio(interactive=False)
+    btn.click(synthesize, inputs=[text, speaker, speed, language], outputs=[aud])
+    gr.Markdown('Demo by [mrfakename](https://twitter.com/realmrfakename).')
+
+
+demo.queue(api_open=True, default_concurrency_limit=10).launch(show_api=True)
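
For reference, the new app.py comes down to three melo.api calls: construct TTS(language=..., device=...), look up speaker ids via hps.data.spk2id, and call tts_to_file. Below is a minimal sketch of the same calls outside Gradio, assuming MeloTTS is installed per https://github.com/myshell-ai/MeloTTS, and writing to a file path rather than the io.BytesIO buffer the app streams back:

    # Minimal sketch of the melo.api usage in the new app.py, run outside Gradio.
    # Assumes MeloTTS and its unidic/nltk data are installed as the app's setup lines do;
    # 'EN-US' is one of the English speaker keys the app exposes in its dropdown.
    import torch
    from melo.api import TTS

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = TTS(language='EN', device=device)   # same constructor the app uses
    speaker_ids = model.hps.data.spk2id         # mapping: speaker name -> id
    model.tts_to_file(
        'The field of text-to-speech has seen rapid development recently.',
        speaker_ids['EN-US'],                   # pass the id, not the name string
        'output.wav',                           # file path instead of io.BytesIO
        speed=1.0,
    )

Passing an io.BytesIO with format='wav', as the app's synthesize function does, is simply the in-memory variant of the same call, avoiding a temp file when returning raw audio bytes to gr.Audio.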
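Because the final line launches with api_open=True and show_api=True, the synthesize endpoint also stays callable over HTTP. A hedged sketch using gradio_client follows; the endpoint name '/synthesize' is an assumption based on Gradio's default of naming endpoints after the wired function, so confirm it on the Space's "Use via API" page:

    # Sketch: calling the deployed Space's API with gradio_client (pip install gradio_client).
    # The Space id comes from the URL in the new file's header comment.
    from gradio_client import Client

    client = Client('mrfakename/MeloTTS')
    result = client.predict(
        'Hello from the API.',   # text
        'EN-US',                 # speaker
        1.0,                     # speed
        'EN',                    # language
        api_name='/synthesize',  # assumed default endpoint name
    )
    print(result)                # expected: local path to the downloaded wav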