| import logging |
| import os |
| import pathlib |
| import time |
| import tempfile |
| import platform |
| import gc |
# Alias the pathlib flavour that is foreign to the host OS onto the native one.
# NOTE(review): presumably this lets objects that reference the other OS's
# path class be loaded on this machine -- confirm against the model loader.
_PATH_ALIASES = {
    'windows': ('PosixPath', 'WindowsPath'),
    'linux': ('WindowsPath', 'PosixPath'),
}
_alias = _PATH_ALIASES.get(platform.system().lower())
if _alias is not None:
    _foreign, _native = _alias
    # `temp` keeps a handle on the class that is about to be replaced.
    temp = getattr(pathlib, _foreign)
    setattr(pathlib, _foreign, getattr(pathlib, _native))
# Set ahead of the third-party imports that follow in this file.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
|
import langid
# Limit langid's candidate languages to the three listed here.
langid.set_languages(['en', 'zh', 'ja'])
|
|
| import torch |
| import torchaudio |
|
|
| import numpy as np |
|
|
| from data.tokenizer import ( |
| AudioTokenizer, |
| tokenize_audio, |
| ) |
| from data.collation import get_text_token_collater |
| from models.vallex import VALLE |
| from utils.g2p import PhonemeBpeTokenizer |
| from descriptions import * |
| from macros import * |
| from examples import * |
|
|
| import gradio as gr |
| from vocos import Vocos |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration |
|
|
|
|
| |
# Language tag placed on both sides of the prompt text for each language.
lang2token = {"en": "<en>", "ja": "<ja>", "zh": "<zh>"}
# Integer language code stored in the saved prompt file for each language.
lang2code = {"en": 0, "ja": 1, "zh": 2}
# Do not rebind `langid` here: make_npz_prompt calls langid.classify() on the
# module imported at the top of this file.
|
|
| |
def clear_prompts():
    """Delete stale ``.npz`` files from the system temp directory.

    A file is stale when its modification time is more than 60 seconds in
    the past. The sweep is best-effort: filesystem errors are logged and
    never propagate to the caller.

    Returns:
        None.
    """
    prompt_dir = tempfile.gettempdir()
    try:
        entries = os.listdir(prompt_dir)
    except OSError as err:
        logging.warning("Could not list %s: %s", prompt_dir, err)
        return

    cutoff = time.time() - 60
    for entry in entries:
        if not entry.endswith(".npz"):
            continue
        filename = os.path.join(prompt_dir, entry)
        # Handle errors per file so one unreadable or already-removed entry
        # does not stop the rest of the sweep.
        try:
            if os.path.isfile(filename) and os.stat(filename).st_mtime < cutoff:
                os.remove(filename)
        except OSError as err:
            logging.warning("Could not remove stale prompt %s: %s", filename, err)
    gc.collect()
def transcribe_one(wav, sr):
    """Transcribe one audio clip and report the language tag Whisper emitted.

    Relies on the module-level names ``whisper_processor``, ``whisper`` and
    ``device``, which must be defined before this function is called.

    Args:
        wav: Audio tensor. Dimension 0 is squeezed before feature extraction
            (assumes a ``(1, samples)`` layout -- TODO confirm with callers).
        sr: Sample rate of ``wav``. Any value other than 16000 causes the
            audio to be resampled to 16000 first.

    Returns:
        A ``(lang, text)`` tuple. ``lang`` is the decoded token at index 1 of
        the generated ids with leading/trailing ``<``, ``|`` and ``>``
        characters stripped. ``text`` is the decoded transcript, with ``"."``
        appended when it does not already end in a punctuation mark.
    """
    if sr != 16000:
        wav4trans = torchaudio.transforms.Resample(sr, 16000)(wav)
    else:
        wav4trans = wav

    input_features = whisper_processor(
        wav4trans.squeeze(0), sampling_rate=16000, return_tensors="pt"
    ).input_features

    predicted_ids = whisper.generate(input_features.to(device))
    lang = whisper_processor.batch_decode(predicted_ids[:, 1])[0].strip("<|>")
    text_pr = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    print(text_pr)

    # An empty transcript must not be indexed: "".strip(" ")[-1] raises
    # IndexError. Treat it like any other text lacking final punctuation.
    trimmed = text_pr.strip(" ")
    if not trimmed or trimmed[-1] not in "?!.,。,?!。、":
        text_pr += "."

    # Drop the large intermediates before collecting.
    del wav4trans, input_features, predicted_ids
    gc.collect()
    return lang, text_pr
| |
| from data.tokenizer import ( |
| AudioTokenizer, |
| tokenize_audio, |
| ) |
|
|
def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
    """Build a voice-prompt ``.npz`` file from an audio clip and its transcript.

    Args:
        name: Base name of the output file. Only its final path component is
            used, and ``<name>.npz`` is written to the system temp directory.
        uploaded_audio: ``(sample_rate, samples)`` pair, or ``None``.
        recorded_audio: Fallback ``(sample_rate, samples)`` pair used when
            ``uploaded_audio`` is ``None``.
        transcript_content: Transcript of the clip. When empty, the clip is
            transcribed with ``transcribe_one`` instead.

    Returns:
        A ``(message, file_path)`` tuple. ``file_path`` is ``None`` when the
        request is rejected (no audio, clip longer than 15 seconds,
        unexpected audio shape, or unsupported language).
    """
    clear_prompts()
    audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
    # Either source may be absent, or may be a non-audio placeholder value
    # such as an empty string; unpacking that would raise.
    if not isinstance(audio_prompt, (tuple, list)) or len(audio_prompt) != 2:
        return "Rejected, no audio prompt provided", None
    sr, wav_pr = audio_prompt
    if len(wav_pr) == 0:
        return "Rejected, no audio prompt provided", None
    if len(wav_pr) / sr > 15:
        return "Rejected, Audio too long (should be less than 15 seconds)", None

    if not isinstance(wav_pr, torch.FloatTensor):
        wav_pr = torch.FloatTensor(wav_pr)
    # Scale down to [-1, 1] when the samples exceed that range.
    peak = wav_pr.abs().max()
    if peak > 1:
        wav_pr /= peak
    # Two entries on the last axis: keep only the first of them.
    if wav_pr.size(-1) == 2:
        wav_pr = wav_pr[:, 0]
    if wav_pr.ndim == 1:
        wav_pr = wav_pr.unsqueeze(0)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if wav_pr.size(0) != 1:
        return "Rejected, unsupported audio shape", None

    if not transcript_content:
        lang_pr, text_pr = transcribe_one(wav_pr, sr)
    else:
        transcript_content = str(transcript_content)
        lang_pr = langid.classify(transcript_content)[0]
        text_pr = transcript_content.replace("\n", "")

    # Validate before the lang2token lookup; otherwise an unsupported
    # language raises KeyError and this message is never returned.
    if lang_pr not in lang2token:
        return f"Prompt can only made with one of model-supported languages, got {lang_pr} instead", None
    lang_token = lang2token[lang_pr]
    text_pr = f"{lang_token}{text_pr}{lang_token}"

    # NOTE(review): tokenize_audio is given None as its first argument and the
    # text tokens below are random integers, not derived from text_pr. Both
    # look like placeholders -- confirm before relying on the saved file.
    encoded_frames = tokenize_audio(None, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
    text_tokens = np.random.randint(0, 100, (1, 50))

    message = f"Detected language: {lang_pr}\n Detected text: {text_pr}\n"

    # `name` comes from the caller; keep only the final path component so the
    # file cannot be written outside the temp directory.
    safe_name = os.path.basename(str(name))
    file_path = os.path.join(tempfile.gettempdir(), f"{safe_name}.npz")
    np.savez(file_path, audio_tokens=audio_tokens, text_tokens=text_tokens, lang_code=lang2code[lang_pr])

    # Drop the large intermediates before collecting.
    del audio_tokens, text_tokens, encoded_frames, wav_pr
    gc.collect()
    return message, file_path
|
|
def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
    """Return a status message and an audio pair for ``text``.

    Text longer than 150 characters is rejected with ``None`` in place of the
    audio. Otherwise the audio is ``(24000, zeros)``, where ``zeros`` is a
    24000-element array of zeros. ``language``, ``accent``, ``preset_prompt``
    and ``prompt_file`` are accepted but not used by this implementation.
    """
    max_chars = 150
    if len(text) > max_chars:
        rejection = "Rejected, Text too long (should be less than 150 characters)"
        return rejection, None

    sample_rate = 24000
    waveform = np.zeros(sample_rate)
    status = f"Synthesized text: {text}"
    return status, (sample_rate, waveform)
|
|
def get_available_npz_files():
    """List the names of entries ending in ``.npz`` in the system temp directory."""
    prompt_names = []
    for entry in os.listdir(tempfile.gettempdir()):
        if entry.endswith(".npz"):
            prompt_names.append(entry)
    return prompt_names
|
|
| |
# Build the two-tab Gradio interface, then start serving it.
with gr.Blocks() as app:
    with gr.Tabs():

        # Tab 1: create a .npz prompt file from an audio clip and a transcript.
        with gr.Tab("NPZファイルを作成"):
            gr.Markdown("### 音声とテキストから .npz ファイルを作成")
            name = gr.Textbox(label="ファイル名", placeholder="保存する .npz ファイル名を入力")
            uploaded_audio = gr.Audio(label="アップロード音声", type="numpy")
            transcript_content = gr.Textbox(label="テキスト内容", placeholder="音声に対応する文字起こしを入力")
            result_message = gr.Textbox(label="結果", interactive=False)
            npz_output = gr.File(label=".npz ファイル")
            save_button = gr.Button("変換して保存")
            # Hidden textbox passed in the `recorded_audio` position below.
            # NOTE(review): it supplies a textbox value, not audio, so
            # make_npz_prompt never receives a real recording -- confirm intent.
            dummy_input = gr.Textbox(visible=False)

            # Argument order matches make_npz_prompt(name, uploaded_audio,
            # recorded_audio, transcript_content).
            save_button.click(
                make_npz_prompt,
                inputs=[name, uploaded_audio, dummy_input, transcript_content],
                outputs=[result_message, npz_output],
            )

        # Tab 2: generate audio using a previously saved .npz prompt file.
        with gr.Tab("NPZファイルで生成"):
            gr.Markdown("### 保存した .npz ファイルから音声を生成")
            # The choices come from a single get_available_npz_files() call
            # made while this block runs; nothing here refreshes them later.
            npz_files_dropdown = gr.Dropdown(
                label="利用可能な .npz ファイル", choices=get_available_npz_files(), interactive=True
            )
            text_input = gr.Textbox(label="生成するテキスト", placeholder="150文字以内のテキストを入力")
            language = gr.Radio(
                label="言語選択",
                choices=["auto-detect", "en", "ja", "zh"],
                value="auto-detect"
            )
            accent = gr.Radio(
                label="アクセント選択",
                choices=["no-accent", "en-accent", "ja-accent", "zh-accent"],
                value="no-accent"
            )
            preset_prompt = gr.Textbox(label="プロンプト名", placeholder="既存のプロンプトを選択")
            synthesis_message = gr.Textbox(label="結果", interactive=False)
            audio_output = gr.Audio(label="生成音声", type="numpy")
            generate_button = gr.Button("生成開始")

            # Argument order matches infer_from_prompt(text, language, accent,
            # preset_prompt, prompt_file).
            generate_button.click(
                infer_from_prompt,
                inputs=[text_input, language, accent, preset_prompt, npz_files_dropdown],
                outputs=[synthesis_message, audio_output],
            )


# Runs at module load; there is no `if __name__ == "__main__"` guard.
app.launch()
|
|