|
|
import io |
|
|
import numpy as np |
|
|
import librosa |
|
|
import pyworld as pw |
|
|
import gradio as gr |
|
|
from openai import OpenAI |
|
|
import soundfile as sf |
|
|
|
|
|
# OpenAI API client; reads the API key from the OPENAI_API_KEY environment
# variable (default OpenAI() behavior). Shared by tts_standard() below.
client = OpenAI()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def tts_standard(text):
    """Synthesize *text* with OpenAI's standard TTS voice.

    Returns a ``(sample_rate, audio)`` pair where ``audio`` is a float64
    numpy array, the format pyworld's analysis expects downstream.
    """
    speech = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input=text,
    )
    raw = speech.read()

    # Decode the in-memory audio payload without touching the filesystem.
    buffer = io.BytesIO(raw)
    samples, rate = sf.read(buffer)
    return rate, samples.astype(np.float64)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def kansai_hl_curve(length, f0_src):
    """Build a stylized High-Low (HL) pitch contour of ``length`` frames.

    The contour rises from a "low" pitch to a "high" pitch over the first
    25% of the utterance, holds the high plateau until 70%, then falls back
    to the low pitch — a caricature of a Kansai-style accent pattern.

    ``low``/``high`` targets are taken from the 20th/85th percentiles of the
    voiced (nonzero) frames of ``f0_src``. If fewer than two voiced frames
    exist, ``f0_src`` is returned unchanged.
    """
    voiced_idx = np.where(f0_src > 0)[0]
    if len(voiced_idx) < 2:
        # Not enough voiced material to estimate a pitch range.
        return f0_src

    voiced = f0_src[voiced_idx]
    low, high = np.percentile(voiced, 20), np.percentile(voiced, 85)

    # Breakpoints of the rise / plateau / fall segments.
    rise_end = int(length * 0.25)
    fall_start = int(length * 0.70)

    curve = np.concatenate([
        np.linspace(low, high, rise_end),                  # rise
        np.linspace(high, high, fall_start - rise_end),    # plateau
        np.linspace(high, low, length - fall_start),       # fall
    ])

    # Defensive length normalization (segment lengths already sum to
    # ``length``, but guard against rounding surprises).
    if len(curve) < length:
        return np.pad(curve, (0, length - len(curve)), mode="edge")
    return curve[:length]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_to_kansai_pitch(audio, sr):
    """Re-synthesize ``audio`` with a Kansai-style HL pitch contour.

    Analyzes the signal with WORLD (f0 / spectral envelope / aperiodicity),
    replaces the f0 track of voiced frames with the stylized contour from
    ``kansai_hl_curve``, and synthesizes the result.

    Returns a float32 numpy array at the same sample rate.
    """
    # pyworld requires C-contiguous float64 input.
    audio = np.ascontiguousarray(audio, dtype=np.float64)
    f0, sp, ap = pw.wav2world(audio, sr)

    f0_new = kansai_hl_curve(len(f0), f0)
    # Keep unvoiced/silent frames unvoiced: the original code overwrote
    # f0 == 0 frames with a nonzero pitch, making WORLD synthesize a voiced
    # hum through pauses and unvoiced consonants.
    f0_new = np.where(f0 > 0, f0_new, 0.0)

    y = pw.synthesize(f0_new, sp, ap, sr)
    return y.astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def kansai_tts(text):
    """End-to-end pipeline: standard TTS, then Kansai-style pitch conversion.

    Returns a ``(sample_rate, audio)`` tuple in the numpy format Gradio's
    Audio component accepts.
    """
    rate, plain = tts_standard(text)
    converted = convert_to_kansai_pitch(plain, rate)
    return rate, converted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: a single text box wired to the kansai_tts pipeline.
with gr.Blocks() as demo:
    gr.Markdown("## 🎙 Kansign — Kansai Accent TTS(本物HLイントネーション搭載)")

    # Input text and output audio (numpy (sr, samples) tuple from kansai_tts).
    text_in = gr.Textbox(label="テキストを入力(例:なんでやねん)")
    audio_out = gr.Audio(label="関西イントネーション音声", type="numpy")
    btn = gr.Button("関西イントネーションで喋る")

    btn.click(kansai_tts, inputs=text_in, outputs=audio_out)

# NOTE(review): launches unconditionally at import time — consider guarding
# with `if __name__ == "__main__":` if this module is ever imported.
demo.launch()
|
|
|