"""Kansai-accent TTS demo.

Synthesises speech with the OpenAI TTS API, replaces its pitch (F0) contour
with a low -> high -> low shape using the WORLD vocoder (pyworld), and serves
the result through a small Gradio UI.
"""

import io

import gradio as gr
import librosa
import numpy as np
import pyworld as pw
import soundfile as sf
from openai import OpenAI

client = OpenAI()


# ==========================
# 1) Standard TTS (OpenAI)
# ==========================
def tts_standard(text):
    """Synthesise *text* with the OpenAI TTS API.

    Args:
        text: The text to speak.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D
        float64 array.
    """
    response = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input=text,
        # The API defaults to MP3; request WAV explicitly so the bytes can be
        # decoded as WAV below regardless of the installed libsndfile.
        response_format="wav",
    )
    audio_bytes = response.read()

    # Decode the WAV payload from memory.
    audio, sr = sf.read(io.BytesIO(audio_bytes))

    # soundfile returns (frames, channels) for multi-channel audio; the
    # vocoder stage downstream needs a single channel.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    return sr, audio.astype(np.float64)


# ==========================
# 2) Kansai HL F0 curve generation
# ==========================
def kansai_hl_curve(length, f0_src, *, rise_end=0.25, fall_start=0.70,
                    low_pct=20, high_pct=85):
    """Build a rise / plateau / fall F0 contour of ``length`` frames.

    The contour rises linearly from a low to a high pitch, holds the high
    pitch, then falls back to the low pitch. The low and high pitches are
    percentiles of the voiced (non-zero) frames of ``f0_src``.

    Args:
        length: Number of frames in the generated contour.
        f0_src: Source F0 track; frames with value <= 0 count as unvoiced.
        rise_end: Fraction of ``length`` at which the rise ends.
        fall_start: Fraction of ``length`` at which the fall starts.
        low_pct: Percentile of the voiced F0 values used as the low pitch.
        high_pct: Percentile of the voiced F0 values used as the high pitch.

    Returns:
        A float64 array of ``length`` frames. Frames that are unvoiced in
        ``f0_src`` stay 0. If ``f0_src`` has fewer than two voiced frames it
        is returned unchanged.
    """
    f0_src = np.asarray(f0_src, dtype=np.float64)
    voiced = f0_src > 0
    if np.count_nonzero(voiced) < 2:
        return f0_src

    low = np.percentile(f0_src[voiced], low_pct)
    high = np.percentile(f0_src[voiced], high_pct)

    p1 = int(length * rise_end)
    p2 = int(length * fall_start)

    # The three segment lengths sum to exactly `length`, so no padding or
    # trimming is needed afterwards.
    contour = np.concatenate([
        np.linspace(low, high, p1),
        np.full(p2 - p1, high),
        np.linspace(high, low, length - p2),
    ])

    # Map the voicing decision onto the output grid when the requested
    # length differs from the source track (nearest-lower source frame).
    if length != voiced.size:
        idx = np.arange(length) * voiced.size // max(length, 1)
        voiced = voiced[idx]

    # Keep unvoiced frames at 0 so pauses and voiceless sounds are not
    # given a pitch by the contour.
    return np.where(voiced, contour, 0.0)


# ==========================
# 3) Standard -> Kansai intonation conversion
# ==========================
def convert_to_kansai_pitch(audio, sr):
    """Re-synthesise *audio* with the Kansai HL pitch contour.

    Args:
        audio: Audio samples. Multi-channel input is averaged to mono
            (assumes a (frames, channels) layout, as soundfile returns).
        sr: Sample rate in Hz.

    Returns:
        The re-synthesised audio as a float32 array.
    """
    audio = np.asarray(audio, dtype=np.float64)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # pyworld expects contiguous float64 input.
    audio = np.ascontiguousarray(audio)

    f0, sp, ap = pw.wav2world(audio, sr)
    f0_new = np.ascontiguousarray(kansai_hl_curve(len(f0), f0), dtype=np.float64)
    y = pw.synthesize(f0_new, sp, ap, sr)
    return y.astype(np.float32)


# ==========================
# 4) Integration
# ==========================
def kansai_tts(text):
    """Run the full pipeline: TTS followed by pitch conversion.

    Args:
        text: The text to speak.

    Returns:
        ``(sample_rate, samples)`` for the Gradio audio component, or
        ``None`` when *text* is empty or whitespace-only.
    """
    # Nothing to synthesise; avoid sending an empty request to the API.
    if not text or not text.strip():
        return None

    sr, audio_std = tts_standard(text)
    audio_ks = convert_to_kansai_pitch(audio_std, sr)
    return (sr, audio_ks)


# ==========================
# 5) Gradio UI
# ==========================
with gr.Blocks() as demo:
    gr.Markdown("## 🎙 Kansign — Kansai Accent TTS(本物HLイントネーション搭載)")

    text_in = gr.Textbox(label="テキストを入力(例:なんでやねん)")
    audio_out = gr.Audio(label="関西イントネーション音声", type="numpy")

    btn = gr.Button("関西イントネーションで喋る")
    btn.click(kansai_tts, inputs=text_in, outputs=audio_out)

# Only start the server when run as a script, so importing this module
# does not launch the UI.
if __name__ == "__main__":
    demo.launch()