Replaced Encodec with Vocos
Browse files
app.py
CHANGED
|
@@ -44,8 +44,8 @@ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
|
|
| 44 |
text_collater = get_text_token_collater()
|
| 45 |
|
| 46 |
device = torch.device("cpu")
|
| 47 |
-
if torch.cuda.is_available():
|
| 48 |
-
|
| 49 |
|
| 50 |
# VALL-E-X model
|
| 51 |
model = VALLE(
|
|
@@ -141,17 +141,18 @@ def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
|
|
| 141 |
|
| 142 |
if transcript_content == "":
|
| 143 |
lang_pr, text_pr = transcribe_one(wav_pr, sr)
|
|
|
|
|
|
|
| 144 |
else:
|
| 145 |
lang_pr = langid.classify(str(transcript_content))[0]
|
| 146 |
lang_token = lang2token[lang_pr]
|
|
|
|
| 147 |
text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
|
| 148 |
# tokenize audio
|
| 149 |
encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
|
| 150 |
audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
|
| 151 |
|
| 152 |
# tokenize text
|
| 153 |
-
lang_token = lang2token[lang_pr]
|
| 154 |
-
text_pr = lang_token + text_pr + lang_token
|
| 155 |
phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
|
| 156 |
text_tokens, enroll_x_lens = text_collater(
|
| 157 |
[
|
|
@@ -193,16 +194,20 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
|
|
| 193 |
|
| 194 |
if transcript_content == "":
|
| 195 |
lang_pr, text_pr = transcribe_one(wav_pr, sr)
|
|
|
|
|
|
|
| 196 |
else:
|
| 197 |
lang_pr = langid.classify(str(transcript_content))[0]
|
|
|
|
| 198 |
lang_token = lang2token[lang_pr]
|
| 199 |
-
text_pr =
|
| 200 |
|
| 201 |
if language == 'auto-detect':
|
| 202 |
lang_token = lang2token[langid.classify(text)[0]]
|
| 203 |
else:
|
| 204 |
lang_token = langdropdown2token[language]
|
| 205 |
lang = token2lang[lang_token]
|
|
|
|
| 206 |
text = lang_token + text + lang_token
|
| 207 |
|
| 208 |
if lang_pr not in ['ja', 'zh', 'en']:
|
|
@@ -223,8 +228,6 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
|
|
| 223 |
|
| 224 |
enroll_x_lens = None
|
| 225 |
if text_pr:
|
| 226 |
-
lang_token = lang2token[lang_pr]
|
| 227 |
-
text_pr = lang_token + text_pr + lang_token
|
| 228 |
text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
|
| 229 |
text_prompts, enroll_x_lens = text_collater(
|
| 230 |
[
|
|
@@ -266,6 +269,7 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
|
|
| 266 |
else:
|
| 267 |
lang_token = langdropdown2token[language]
|
| 268 |
lang = token2lang[lang_token]
|
|
|
|
| 269 |
text = lang_token + text + lang_token
|
| 270 |
|
| 271 |
# load prompt
|
|
|
|
| 44 |
text_collater = get_text_token_collater()
|
| 45 |
|
| 46 |
device = torch.device("cpu")
|
| 47 |
+
# if torch.cuda.is_available():
|
| 48 |
+
# device = torch.device("cuda", 0)
|
| 49 |
|
| 50 |
# VALL-E-X model
|
| 51 |
model = VALLE(
|
|
|
|
| 141 |
|
| 142 |
if transcript_content == "":
|
| 143 |
lang_pr, text_pr = transcribe_one(wav_pr, sr)
|
| 144 |
+
lang_token = lang2token[lang_pr]
|
| 145 |
+
text_pr = lang_token + text_pr + lang_token
|
| 146 |
else:
|
| 147 |
lang_pr = langid.classify(str(transcript_content))[0]
|
| 148 |
lang_token = lang2token[lang_pr]
|
| 149 |
+
transcript_content = transcript_content.replace("\n", "")
|
| 150 |
text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
|
| 151 |
# tokenize audio
|
| 152 |
encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
|
| 153 |
audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
|
| 154 |
|
| 155 |
# tokenize text
|
|
|
|
|
|
|
| 156 |
phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
|
| 157 |
text_tokens, enroll_x_lens = text_collater(
|
| 158 |
[
|
|
|
|
| 194 |
|
| 195 |
if transcript_content == "":
|
| 196 |
lang_pr, text_pr = transcribe_one(wav_pr, sr)
|
| 197 |
+
lang_token = lang2token[lang_pr]
|
| 198 |
+
text_pr = lang_token + text_pr + lang_token
|
| 199 |
else:
|
| 200 |
lang_pr = langid.classify(str(transcript_content))[0]
|
| 201 |
+
text_pr = transcript_content.replace("\n", "")
|
| 202 |
lang_token = lang2token[lang_pr]
|
| 203 |
+
text_pr = lang_token + text_pr + lang_token
|
| 204 |
|
| 205 |
if language == 'auto-detect':
|
| 206 |
lang_token = lang2token[langid.classify(text)[0]]
|
| 207 |
else:
|
| 208 |
lang_token = langdropdown2token[language]
|
| 209 |
lang = token2lang[lang_token]
|
| 210 |
+
text = text.replace("\n", "")
|
| 211 |
text = lang_token + text + lang_token
|
| 212 |
|
| 213 |
if lang_pr not in ['ja', 'zh', 'en']:
|
|
|
|
| 228 |
|
| 229 |
enroll_x_lens = None
|
| 230 |
if text_pr:
|
|
|
|
|
|
|
| 231 |
text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
|
| 232 |
text_prompts, enroll_x_lens = text_collater(
|
| 233 |
[
|
|
|
|
| 269 |
else:
|
| 270 |
lang_token = langdropdown2token[language]
|
| 271 |
lang = token2lang[lang_token]
|
| 272 |
+
text = text.replace("\n", "")
|
| 273 |
text = lang_token + text + lang_token
|
| 274 |
|
| 275 |
# load prompt
|