HabibiBear committed
Commit 1f4f8a5 · verified · 1 Parent(s): 09ff742

Update app.py

Files changed (1)
  1. app.py +55 -165
app.py CHANGED
@@ -1,166 +1,56 @@
- # # # import gradio as gr
- # # # import torch
- # # # from TTS.api import TTS
- # # # import os
-
- # # # # Agree to the Coqui license terms
- # # # os.environ["COQUI_TOS_AGREED"] = "1"
-
- # # # # Initialize device for GPU or CPU
- # # # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # # # # Initialize TTS model
- # # # tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
-
- # # # # Define clone function for Gradio interface
- # # # def clone(text, audio):
- # # #     # Use the reference audio and synthesize speech
- # # #     output_path = "./output.wav"
- # # #     tts.tts_to_file(text=text, speaker_wav=audio, language="en", file_path=output_path)
- # # #     return output_path
-
- # # # # Set up the Gradio Interface
- # # # iface = gr.Interface(
- # # #     fn=clone,
- # # #     inputs=[
- # # #         gr.Textbox(label='Text'),
- # # #         gr.Audio(type='filepath', label='Voice reference audio file')
- # # #     ],
- # # #     outputs=gr.Audio(type='filepath'),
- # # #     title='Voice Clone Trial',
- # # #     theme=gr.themes.Base(primary_hue="red", secondary_hue="orange", neutral_hue="black"),
- # # # )
-
- # # # # Launch the interface
- # # # iface.launch()
-
-
- # # import whisper
- # # from gtts import gTTS
- # # import gradio as gr
- # # import os
-
- # # # Load Whisper model
- # # model = whisper.load_model("base")
-
- # # # Function to transcribe and convert text to speech
- # # def audio_to_audio(input_audio):
- # #     # Check if the audio path exists (Gradio will pass a file path as input_audio)
- # #     if input_audio is None:
- # #         return "No audio file provided", None
-
- # #     # Transcribe the audio using Whisper
- # #     result = model.transcribe(input_audio)
- # #     transcription = result["text"]
-
- # #     # Convert transcription to speech using gTTS
- # #     tts = gTTS(transcription)
-
- # #     # Save the output in a temporary directory
- # #     output_audio_path = "/tmp/output_audio.mp3"
- # #     tts.save(output_audio_path)
-
- # #     # Return the transcribed text and the path to the generated speech
- # #     return transcription, output_audio_path
-
- # # # Create Gradio interface
- # # interface = gr.Interface(
- # #     fn=audio_to_audio,
- # #     inputs=gr.Audio(type="filepath"),
- # #     outputs=[gr.Textbox(label="Transcription"), gr.Audio(label="Generated Audio")]
- # # )
-
- # # # Launch the Gradio app
- # # interface.launch()
-
-
- # import gradio as gr
- # from transformers import AutoModelForCausalLM, AutoTokenizer
- # import torch
- # import soundfile as sf
-
- # # Load the Coqui XTTS model and tokenizer
- # model_name = "coqui/XTTS-v2"
- # model = AutoModelForCausalLM.from_pretrained(model_name)
- # tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- # # Function to generate speech
- # def generate_speech(text):
- #     # Tokenize the input text
- #     inputs = tokenizer(text, return_tensors="pt")
-
- #     # Generate speech using the model
- #     with torch.no_grad():
- #         speech = model.generate(**inputs)
-
- #     # Convert the output speech into a wav format
- #     speech_wav = speech[0].cpu().numpy()  # assuming the output is in a NumPy array
- #     output_path = "/tmp/output.wav"  # save it in the temporary directory
-
- #     # Save audio as .wav file
- #     sf.write(output_path, speech_wav, samplerate=16000)
-
- #     return output_path
-
- # # Create the Gradio interface
- # interface = gr.Interface(
- #     fn=generate_speech,
- #     inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
- #     outputs=gr.Audio(label="Generated Speech"),
- #     title="Text to Speech with Coqui XTTS-v2",
- #     description="Generate speech from text using the Coqui XTTS-v2 model."
- # )
-
- # # Launch the Gradio app
- # interface.launch()
-
-
+ # WebUI by mrfakename
+ # Demo also available on HF Spaces: https://huggingface.co/spaces/mrfakename/MeloTTS
  import gradio as gr
- import torch
- from TTS.tts.configs.xtts_config import XttsConfig
- from TTS.tts.models.xtts import Xtts
- import soundfile as sf
-
- # Load the XTTS model and configuration
- def load_model():
-     config = XttsConfig()
-     config_path = "/path/to/xtts/config.json"  # Make sure this is the correct path
-     config.load_json(config_path)
-
-     model = Xtts.init_from_config(config)
-     checkpoint_dir = "/path/to/xtts/"  # Make sure this is the correct path
-     model.load_checkpoint(config, checkpoint_dir=checkpoint_dir, eval=True)
-     model.cuda()
-     return model, config
-
- # Synthesize text to audio
- def text_to_speech(text, speaker_wav_path="/data/TTS-public/_refclips/3.wav"):
-     model, config = load_model()
-
-     # Generate the output audio
-     outputs = model.synthesize(
-         text,
-         config,
-         speaker_wav=speaker_wav_path,
-         gpt_cond_len=3,
-         language="en",
-     )
-
-     # Save the synthesized audio as a .wav file
-     output_path = "/tmp/output_audio.wav"  # Save in /tmp for Hugging Face compatibility
-     sf.write(output_path, outputs[0], samplerate=config.audio.sample_rate)
-
-     return output_path
-
- # Create the Gradio interface
- interface = gr.Interface(
-     fn=text_to_speech,
-     inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
-     outputs=gr.Audio(label="Generated Speech"),
-     title="Text to Speech using Coqui XTTS",
-     description="Input text and generate speech using the XTTS model."
- )
-
- # Launch the Gradio app
- interface.launch()
-
+ import os, torch, io
+ os.system('python -m unidic download')
+ # print("Make sure you've downloaded unidic (python -m unidic download) for this WebUI to work.")
+ from melo.api import TTS
+ speed = 1.0
+ import tempfile
+ import nltk
+ nltk.download('averaged_perceptron_tagger_eng')
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
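+ # Preload one MeloTTS model per supported language so requests only pay inference cost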
+ models = {
+     'EN': TTS(language='EN', device=device),
+     'ES': TTS(language='ES', device=device),
+     'FR': TTS(language='FR', device=device),
+     'ZH': TTS(language='ZH', device=device),
+     'JP': TTS(language='JP', device=device),
+     'KR': TTS(language='KR', device=device),
+ }
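+ # spk2id maps speaker display names such as 'EN-US' to the model's internal speaker IDs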
+ speaker_ids = models['EN'].hps.data.spk2id
+
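+ # One example sentence per language, used to prefill the textbox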
+ default_text_dict = {
+     'EN': 'The field of text-to-speech has seen rapid development recently.',
+     'ES': 'El campo de la conversión de texto a voz ha experimentado un rápido desarrollo recientemente.',
+     'FR': 'Le domaine de la synthèse vocale a connu un développement rapide récemment',
+     'ZH': 'text-to-speech 领域近年来发展迅速',
+     'JP': 'テキスト読み上げの分野は最近急速な発展を遂げています',
+     'KR': '최근 텍스트 음성 변환 분야가 급속도로 발전하고 있습니다.',
+ }
+
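+ # Render the text to an in-memory WAV and hand the raw bytes to the gr.Audio output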
+ def synthesize(text, speaker, speed, language, progress=gr.Progress()):
+     bio = io.BytesIO()
+     models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, pbar=progress.tqdm, format='wav')
+     return bio.getvalue()
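+ # On a language switch, repopulate the speaker dropdown and swap in that language's example text (user-edited text is left alone)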
+ def load_speakers(language, text):
+     if text in list(default_text_dict.values()):
+         newtext = default_text_dict[language]
+     else:
+         newtext = text
+     return gr.update(value=list(models[language].hps.data.spk2id.keys())[0], choices=list(models[language].hps.data.spk2id.keys())), newtext
+ with gr.Blocks() as demo:
+     gr.Markdown('# MeloTTS Demo\n\nAn unofficial demo for [MeloTTS](https://github.com/myshell-ai/MeloTTS). **Make sure to try out several speakers, for example EN-Default!**')
+     with gr.Group():
+         speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-US', label='Speaker')
+         language = gr.Radio(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN')
+         speed = gr.Slider(label='Speed', minimum=0.1, maximum=10.0, value=1.0, interactive=True, step=0.1)
+         text = gr.Textbox(label="Text to speak", value=default_text_dict['EN'])
+         language.input(load_speakers, inputs=[language, text], outputs=[speaker, text])
+         btn = gr.Button('Synthesize', variant='primary')
+         aud = gr.Audio(interactive=False)
+         btn.click(synthesize, inputs=[text, speaker, speed, language], outputs=[aud])
+     gr.Markdown('Demo by [mrfakename](https://twitter.com/realmrfakename).')
+
+
+ demo.queue(api_open=True, default_concurrency_limit=10).launch(show_api=True)
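
For reference, the melo.api surface this WebUI wraps can also be used standalone to synthesize straight to disk. A minimal sketch reusing only the calls visible in the diff above; the output filename and the 'EN-US' speaker key are illustrative (the actual keys come from hps.data.spk2id):

import torch
from melo.api import TTS

# Load the English MeloTTS model on whatever device is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = TTS(language='EN', device=device)

# Pick one of the speakers the model exposes ('EN-US' is assumed here)
speaker_id = model.hps.data.spk2id['EN-US']

# Write the synthesized speech to a WAV file
model.tts_to_file('The field of text-to-speech has seen rapid development recently.', speaker_id, 'output.wav', speed=1.0)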