# Kansign / app.py
# Author: KGNINJA (commit 019cdd5, verified)
import io
import numpy as np
import librosa
import pyworld as pw
import gradio as gr
from openai import OpenAI
import soundfile as sf
# Shared OpenAI API client used by tts_standard() below.
# NOTE(review): no explicit credentials are passed — presumably the API key
# comes from the deployment environment (OPENAI_API_KEY); confirm for the
# target host.
client = OpenAI()
# ==========================
# 1) Standard TTS (OpenAI)
# ==========================
def tts_standard(text):
    """Synthesize *text* with the OpenAI TTS endpoint.

    Returns
    -------
    tuple[int, np.ndarray]
        Sample rate and the decoded waveform as float64 samples
        (float64 is what the pyworld stage downstream expects).
    """
    resp = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input=text,
    )
    raw = resp.read()
    # Decode the returned audio bytes into a numpy array.
    samples, rate = sf.read(io.BytesIO(raw))
    return rate, samples.astype(np.float64)
# ==========================
# 2) Kansai HL F0 curve generation
# ==========================
def kansai_hl_curve(length, f0_src):
    """Build a Low-rise-High-hold-Low-fall pitch contour of *length* frames.

    The Low/High targets are the 20th/85th percentiles of the voiced
    (non-zero) frames of *f0_src*.  If fewer than two voiced frames exist,
    *f0_src* is returned unchanged.
    """
    voiced = np.where(f0_src > 0)[0]
    if len(voiced) < 2:
        return f0_src

    lo = np.percentile(f0_src[voiced], 20)
    hi = np.percentile(f0_src[voiced], 85)

    rise_end = int(length * 0.25)
    hold_end = int(length * 0.70)
    curve = np.concatenate([
        np.linspace(lo, hi, rise_end),             # rise to High
        np.linspace(hi, hi, hold_end - rise_end),  # hold at High
        np.linspace(hi, lo, length - hold_end),    # fall back to Low
    ])

    # The three segments sum to *length* by construction; this guard only
    # protects against any rounding mismatch.
    if len(curve) < length:
        return np.pad(curve, (0, length - len(curve)), mode="edge")
    return curve[:length]
# ==========================
# 3) Standard TTS → Kansai intonation conversion
# ==========================
def convert_to_kansai_pitch(audio, sr):
    """Re-synthesize *audio* with a Kansai-style HL pitch (F0) contour.

    Parameters
    ----------
    audio : np.ndarray
        Waveform samples.  Multi-channel input is downmixed to mono,
        since pyworld only analyses 1-D signals.
    sr : int
        Sample rate in Hz.

    Returns
    -------
    np.ndarray
        float32 waveform re-synthesized with the replacement F0 curve.
    """
    # pyworld's wav2world requires a 1-D, C-contiguous float64 signal;
    # soundfile can hand back 2-D (stereo) or non-contiguous arrays.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    audio = np.ascontiguousarray(audio, dtype=np.float64)

    f0, sp, ap = pw.wav2world(audio, sr)
    f0_new = kansai_hl_curve(len(f0), f0)
    y = pw.synthesize(f0_new, sp, ap, sr)
    return y.astype(np.float32)
# ==========================
# 4) Full pipeline
# ==========================
def kansai_tts(text):
    """Synthesize *text* with standard TTS, then apply Kansai intonation."""
    sr, standard_audio = tts_standard(text)
    return sr, convert_to_kansai_pitch(standard_audio, sr)
# ==========================
# 5) Gradio UI
# ==========================
with gr.Blocks() as demo:
    gr.Markdown("## 🎙 Kansign — Kansai Accent TTS(本物HLイントネーション搭載)")
    text_in = gr.Textbox(label="テキストを入力(例:なんでやねん)")
    audio_out = gr.Audio(label="関西イントネーション音声", type="numpy")
    btn = gr.Button("関西イントネーションで喋る")
    # Wire the button to the full TTS + pitch-conversion pipeline.
    btn.click(kansai_tts, inputs=text_in, outputs=audio_out)

demo.launch()