Update app.py
app.py CHANGED
@@ -1,166 +1,56 @@
-#
-#
-# # # from TTS.api import TTS
-# # # import os
-
-# # # # Agree to the Coqui license terms
-# # # os.environ["COQUI_TOS_AGREED"] = "1"
-
-# # # # Initialize device for GPU or CPU
-# # # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# # # # Initialize TTS model
-# # # tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
-
-# # # # Define clone function for Gradio interface
-# # # def clone(text, audio):
-# # #     # Use the reference audio and synthesize speech
-# # #     output_path = "./output.wav"
-# # #     tts.tts_to_file(text=text, speaker_wav=audio, language="en", file_path=output_path)
-# # #     return output_path
-
-# # # # Set up the Gradio Interface
-# # # iface = gr.Interface(
-# # #     fn=clone,
-# # #     inputs=[
-# # #         gr.Textbox(label='Text'),
-# # #         gr.Audio(type='filepath', label='Voice reference audio file')
-# # #     ],
-# # #     outputs=gr.Audio(type='filepath'),
-# # #     title='Voice Clone Trial',
-# # #     theme=gr.themes.Base(primary_hue="red", secondary_hue="orange", neutral_hue="black"),
-# # # )
-
-# # # # Launch the interface
-# # # iface.launch()
-
-
-# # import whisper
-# # from gtts import gTTS
-# # import gradio as gr
-# # import os
-
-# # # Load Whisper model
-# # model = whisper.load_model("base")
-
-# # # Function to transcribe and convert text to speech
-# # def audio_to_audio(input_audio):
-# #     # Check if the audio path exists (Gradio will pass a file path as input_audio)
-# #     if input_audio is None:
-# #         return "No audio file provided", None
-
-# #     # Transcribe the audio using Whisper
-# #     result = model.transcribe(input_audio)
-# #     transcription = result["text"]
-
-# #     # Convert transcription to speech using gTTS
-# #     tts = gTTS(transcription)
-
-# #     # Save the output in a temporary directory
-# #     output_audio_path = "/tmp/output_audio.mp3"
-# #     tts.save(output_audio_path)
-
-# #     # Return the transcribed text and the path to the generated speech
-# #     return transcription, output_audio_path
-
-# # # Create Gradio interface
-# # interface = gr.Interface(
-# #     fn=audio_to_audio,
-# #     inputs=gr.Audio(type="filepath"),
-# #     outputs=[gr.Textbox(label="Transcription"), gr.Audio(label="Generated Audio")]
-# # )
-
-# # # Launch the Gradio app
-# # interface.launch()
-
-
-# import gradio as gr
-# from transformers import AutoModelForCausalLM, AutoTokenizer
-# import torch
-# import soundfile as sf
-
-# # Load the Coqui XTTS model and tokenizer
-# model_name = "coqui/XTTS-v2"
-# model = AutoModelForCausalLM.from_pretrained(model_name)
-# tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# # Function to generate speech
-# def generate_speech(text):
-#     # Tokenize the input text
-#     inputs = tokenizer(text, return_tensors="pt")
-
-#     # Generate speech using the model
-#     with torch.no_grad():
-#         speech = model.generate(**inputs)
-
-#     # Convert the output speech into a wav format
-#     speech_wav = speech[0].cpu().numpy()  # assuming the output is in a NumPy array
-#     output_path = "/tmp/output.wav"  # save it in the temporary directory
-
-#     # Save audio as .wav file
-#     sf.write(output_path, speech_wav, samplerate=16000)
-
-#     return output_path
-
-# # Create the Gradio interface
-# interface = gr.Interface(
-#     fn=generate_speech,
-#     inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
-#     outputs=gr.Audio(label="Generated Speech"),
-#     title="Text to Speech with Coqui XTTS-v2",
-#     description="Generate speech from text using the Coqui XTTS-v2 model."
-# )
-
-# # Launch the Gradio app
-# interface.launch()
-
-
+# WebUI by mrfakename
+# Demo also available on HF Spaces: https://huggingface.co/spaces/mrfakename/MeloTTS
 import gradio as gr
-import torch
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-)
-
-
-
-
+import os, torch, io
+os.system('python -m unidic download')
+# print("Make sure you've downloaded unidic (python -m unidic download) for this WebUI to work.")
+from melo.api import TTS
+speed = 1.0
+import tempfile
+import nltk
+nltk.download('averaged_perceptron_tagger_eng')
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+models = {
+    'EN': TTS(language='EN', device=device),
+    'ES': TTS(language='ES', device=device),
+    'FR': TTS(language='FR', device=device),
+    'ZH': TTS(language='ZH', device=device),
+    'JP': TTS(language='JP', device=device),
+    'KR': TTS(language='KR', device=device),
+}
+speaker_ids = models['EN'].hps.data.spk2id
+
+default_text_dict = {
+    'EN': 'The field of text-to-speech has seen rapid development recently.',
+    'ES': 'El campo de la conversión de texto a voz ha experimentado un rápido desarrollo recientemente.',
+    'FR': 'Le domaine de la synthèse vocale a connu un développement rapide récemment',
+    'ZH': 'text-to-speech 领域近年来发展迅速',
+    'JP': 'テキスト読み上げの分野は最近急速な発展を遂げています',
+    'KR': '최근 텍스트 음성 변환 분야가 급속도로 발전하고 있습니다.',
+}
+
+def synthesize(text, speaker, speed, language, progress=gr.Progress()):
+    bio = io.BytesIO()
+    models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, pbar=progress.tqdm, format='wav')
+    return bio.getvalue()
+def load_speakers(language, text):
+    if text in list(default_text_dict.values()):
+        newtext = default_text_dict[language]
+    else:
+        newtext = text
+    return gr.update(value=list(models[language].hps.data.spk2id.keys())[0], choices=list(models[language].hps.data.spk2id.keys())), newtext
+with gr.Blocks() as demo:
+    gr.Markdown('# MeloTTS Demo\n\nAn unofficial demo for [MeloTTS](https://github.com/myshell-ai/MeloTTS). **Make sure to try out several speakers, for example EN-Default!**')
+    with gr.Group():
+        speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-US', label='Speaker')
+        language = gr.Radio(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN')
+        speed = gr.Slider(label='Speed', minimum=0.1, maximum=10.0, value=1.0, interactive=True, step=0.1)
+        text = gr.Textbox(label="Text to speak", value=default_text_dict['EN'])
+    language.input(load_speakers, inputs=[language, text], outputs=[speaker, text])
+    btn = gr.Button('Synthesize', variant='primary')
+    aud = gr.Audio(interactive=False)
+    btn.click(synthesize, inputs=[text, speaker, speed, language], outputs=[aud])
+    gr.Markdown('Demo by [mrfakename](https://twitter.com/realmrfakename).')
+
+
+demo.queue(api_open=True, default_concurrency_limit=10).launch(show_api=True)
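
For reference, the new app.py comes down to three melo.api calls: construct TTS(language=..., device=...), look up speaker ids via hps.data.spk2id, and call tts_to_file. Below is a minimal sketch of the same calls outside Gradio, assuming MeloTTS is installed per https://github.com/myshell-ai/MeloTTS, and writing to a file path rather than the io.BytesIO buffer the app streams back:

    # Minimal sketch of the melo.api usage in the new app.py, run outside Gradio.
    # Assumes MeloTTS and its unidic/nltk data are installed as the app's setup lines do;
    # 'EN-US' is one of the English speaker keys the app exposes in its dropdown.
    import torch
    from melo.api import TTS

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = TTS(language='EN', device=device)   # same constructor the app uses
    speaker_ids = model.hps.data.spk2id         # mapping: speaker name -> id
    model.tts_to_file(
        'The field of text-to-speech has seen rapid development recently.',
        speaker_ids['EN-US'],                   # pass the id, not the name string
        'output.wav',                           # file path instead of io.BytesIO
        speed=1.0,
    )

Passing an io.BytesIO with format='wav', as the app's synthesize function does, is simply the in-memory variant of the same call, avoiding a temp file when returning raw audio bytes to gr.Audio.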
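Because the final line launches with api_open=True and show_api=True, the synthesize endpoint also stays callable over HTTP. A hedged sketch using gradio_client follows; the endpoint name '/synthesize' is an assumption based on Gradio's default of naming endpoints after the wired function, so confirm it on the Space's "Use via API" page:

    # Sketch: calling the deployed Space's API with gradio_client (pip install gradio_client).
    # The Space id comes from the URL in the new file's header comment.
    from gradio_client import Client

    client = Client('mrfakename/MeloTTS')
    result = client.predict(
        'Hello from the API.',   # text
        'EN-US',                 # speaker
        1.0,                     # speed
        'EN',                    # language
        api_name='/synthesize',  # assumed default endpoint name
    )
    print(result)                # expected: local path to the downloaded wav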