# NOTE: The lines above/below originally contained Hugging Face Spaces UI
# status residue ("Spaces: Sleeping") captured with the page — not app code.
# `spaces` is imported first, matching the original order — presumably so the
# ZeroGPU hooks are installed before any CUDA-touching module loads (confirm).
import spaces

# --- Standard library ---
import os
import sys
import json
import shutil
import tempfile

# --- Third-party ---
import requests
import soundfile as sf
import gradio as gr

# Make the repository root importable before pulling in the local RVC modules.
now_dir = os.getcwd()
sys.path.append(now_dir)

# --- Local (RVC project) ---
from configs.config import Config
from infer.modules.vc.modules import VC

# nijiVoice API key, supplied through the `niji_api` environment variable.
niij_api = os.getenv("niji_api")
# ====== Generate speech (mp3) from text via the nijiVoice API and save it ======
def test_voice(text: str, speed_str: str = "1.0") -> str:
    """Synthesize *text* to mp3 audio with the nijiVoice API.

    The audio is saved as "infer.mp3" in the current working directory and
    the absolute path of that file is returned.

    Args:
        text: Script to synthesize.
        speed_str: Playback speed as a string (e.g. "1.5"); converted to
            float, falling back to 1.0 when it is not a valid number.

    Returns:
        Absolute path of the saved "infer.mp3".

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        requests.Timeout: If a request exceeds the 60 s timeout.
        KeyError: If the API response lacks the expected fields.
    """
    # Parse the speed, tolerating bad user input.
    try:
        speed_val = float(speed_str)
    except ValueError:
        speed_val = 1.0  # default speed on unparseable input

    url = "https://api.nijivoice.com/api/platform/v1/voice-actors/5f1e8106-5e5a-422f-b269-9f2e53c18146/generate-voice"
    payload = {
        "format": "mp3",
        "speed": str(speed_val),
        "script": text,
    }
    headers = {
        "accept": "application/json",
        "x-api-key": niij_api,
        "content-type": "application/json",
    }
    # Bounded timeout + explicit status check: fail fast with a clear HTTP
    # error instead of hanging forever or crashing later on a missing key.
    response = requests.post(url, json=payload, headers=headers, timeout=60)
    response.raise_for_status()
    response_json = response.json()
    audio_file_url = response_json["generatedVoice"]["audioFileUrl"]

    # Download the generated mp3 and save it locally.
    audio_file = requests.get(audio_file_url, timeout=60)
    audio_file.raise_for_status()
    with open("infer.mp3", "wb") as f:
        f.write(audio_file.content)
    return os.path.abspath("infer.mp3")
# ====== Run voice conversion (VC) on an audio file ======
def promete(audio_file_path, vc_transform0):
    """Convert the voice in *audio_file_path* with the RVC model.

    Args:
        audio_file_path: Path (str) to the input audio received from Gradio.
        vc_transform0: Pitch shift parameter in semitones (int).

    Returns:
        Path to the converted wav in a temporary directory, or None when
        no input path was given.
    """
    # Guard FIRST so empty input does not pay for model construction
    # (the original built Config/VC before checking the path).
    if not audio_file_path:
        return None

    config = Config()
    vc = VC(config)

    # Fixed inference parameters for this demo.
    model_name = "ebaytest2.pth"  # inference voice model
    spk_item = 0                  # speaker ID
    f0method0 = "rmvpe"           # pitch extraction algorithm
    filter_radius0 = 3            # median filter radius
    file_index1 = ""
    file_index2 = None
    index_rate1 = 0.88
    resample_sr0 = 0
    rms_mix_rate0 = 1
    protect0 = 0.33
    f0_file = None

    # Stage the input in a fresh temp dir. NOTE: the dir is deliberately not
    # removed here, because the returned output file must outlive this call.
    temp_dir = tempfile.mkdtemp()
    input_path = os.path.join(temp_dir, "input.wav")
    shutil.copy(audio_file_path, input_path)

    # Call vc_single directly — the original wrapped it in a nested
    # pass-through function that added nothing.
    vc_output1, vc_output2 = vc.vc_single(
        model_name,
        spk_item,
        input_path,
        vc_transform0,
        f0_file,
        f0method0,
        file_index1,
        file_index2,
        index_rate1,
        filter_radius0,
        resample_sr0,
        rms_mix_rate0,
        protect0,
    )

    # vc_output2 is (sample_rate, numpy array).
    output_path = os.path.join(temp_dir, "output.wav")
    sf.write(output_path, vc_output2[1], vc_output2[0])
    return output_path
# 1) Inference entry point for microphone recordings
def inference_mic(mic_file_path, vc_transform_val):
    """Voice-convert a microphone recording; None input yields None."""
    if mic_file_path is None:
        return None
    return promete(mic_file_path, vc_transform_val)
# 2) Inference entry point for uploaded audio files
def inference_file(upload_file_path, vc_transform_val):
    """Voice-convert an uploaded audio file; None input yields None."""
    if upload_file_path is None:
        return None
    return promete(upload_file_path, vc_transform_val)
# 3) Inference entry point for text input (TTS)
def inference_tts(text_input, vc_transform_val, speed_str):
    """Synthesize *text_input* to mp3 via TTS, then voice-convert the result.

    *speed_str* is accepted as a string and parsed downstream.
    """
    if not text_input:
        return None
    # Produce infer.mp3 at the requested speed, then run it through VC.
    tts_audio_path = test_voice(text_input, speed_str)
    return promete(tts_audio_path, vc_transform_val)
# ====== Gradio interface ======
with gr.Blocks() as demo:
    gr.Markdown("## 音声変換デモ")
    gr.Markdown("### 音高パラメータ(半音単位)と TTS の速度を自由に変更して、音声を変換します。")

    # Shared controls: pitch shift (semitones) and TTS speed (entered as a string).
    pitch_slider = gr.Slider(
        minimum=-12,
        maximum=12,
        value=-2,
        step=1,
        label="音高変換(半音単位)",
    )
    tts_speed_str = gr.Textbox(
        lines=1,
        value="1.0",
        label="TTS速度 (strで入力してください 例: 1.5)",
    )

    # --- Tab 1: record from the microphone ---
    with gr.Tab("マイクで録音"):
        audio_input_mic = gr.Audio(
            sources=["microphone"], type="filepath", label="録音した音声"
        )
        audio_output_mic = gr.Audio(label="変換後の音声", type="filepath")
        gr.Button("音声変換(マイク)").click(
            fn=inference_mic,
            inputs=[audio_input_mic, pitch_slider],
            outputs=[audio_output_mic],
        )

    # --- Tab 2: upload an audio file ---
    with gr.Tab("音声ファイルをアップロード"):
        audio_input_file = gr.Audio(
            sources=["upload"], type="filepath", label="アップロード音声"
        )
        audio_output_file = gr.Audio(label="変換後の音声", type="filepath")
        gr.Button("音声変換(アップロード)").click(
            fn=inference_file,
            inputs=[audio_input_file, pitch_slider],
            outputs=[audio_output_file],
        )

    # --- Tab 3: text-to-speech then conversion ---
    with gr.Tab("テキスト読み上げ (TTS)"):
        text_input = gr.Textbox(
            lines=3,
            placeholder="読み上げたいテキストを入力",
            label="テキスト入力",
        )
        audio_output_tts = gr.Audio(label="TTS & 変換後の音声", type="filepath")
        gr.Button("音声変換(TTS)").click(
            fn=inference_tts,
            inputs=[text_input, pitch_slider, tts_speed_str],
            outputs=[audio_output_tts],
        )

demo.launch()