# 3d-AI-tuber / app.py
# Author: buchi-stdesign
# Commit 21f4884: Initial Docker Gradio API with TTS/STT/LLM
import os
import subprocess

import google.generativeai as genai
import gradio as gr
from faster_whisper import WhisperModel
# Gemini configuration: prefer the GOOGLE_API_KEY environment variable so a
# real key never has to be committed to the repository. The original
# placeholder is kept as the fallback, so behaviour is unchanged when the
# variable is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY"))

# Load the Whisper speech-to-text model once at import time; transcribe()
# reuses this module-level instance for every request.
model = WhisperModel("small")
def transcribe(audio):
    """Transcribe an audio file to text with the module-level Whisper model.

    Args:
        audio: Path of the audio file to transcribe. May be ``None`` or
            empty when no recording was supplied.

    Returns:
        The text of every recognised segment concatenated in order, or an
        empty string when no audio was supplied.
    """
    # Nothing was recorded: return early instead of handing the model a
    # missing path.
    if not audio:
        return ""
    segments, _ = model.transcribe(audio, beam_size=5)
    # Feed join() a generator directly; no intermediate list is needed.
    return "".join(segment.text for segment in segments)
def generate_reply(text, model_name="gemini-1.5-flash"):
    """Ask Gemini for a reply to the user's message.

    Args:
        text: The user's message.
        model_name: Id of the Gemini model to query.
            NOTE(review): the original hard-coded "gemini-mobile", which
            does not appear to be a published model id; confirm this default
            against the models available to the configured API key.

    Returns:
        The text of the model's reply, or an empty string when ``text`` is
        empty or only whitespace.
    """
    # An empty prompt has nothing to answer, so skip the API call.
    if not text or not text.strip():
        return ""
    # The google.generativeai SDK has no `genai.chat.create`; text generation
    # goes through GenerativeModel.generate_content, whose response exposes
    # the reply as `.text`.
    response = genai.GenerativeModel(model_name).generate_content(text)
    return response.text
def synthesize(text, speaker=1, out_path="/tmp/out.wav"):
    """Synthesize speech for ``text`` by invoking the voicevox_engine CLI.

    Args:
        text: The text to speak.
        speaker: Speaker id passed to ``--speaker`` (defaults to 1, the value
            that was previously hard-coded).
        out_path: Where the WAV file is written (defaults to the previously
            hard-coded ``/tmp/out.wav``).

    Returns:
        ``out_path``, the path of the generated WAV file.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    # Example of calling voicevox_engine through a CLI.
    # NOTE(review): confirm that the `voicevox_engine` executable available
    # at runtime accepts these flags.
    command = [
        "voicevox_engine",
        "--text", text,
        "--speaker", str(speaker),
        "--wav", out_path,
    ]
    # check=True surfaces a failed synthesis instead of silently returning a
    # path to a missing or stale file.
    subprocess.run(command, check=True)
    return out_path
def pipeline(audio):
    """Run one conversational turn: speech-to-text, LLM reply, text-to-speech.

    Args:
        audio: Path of the recorded audio file, or ``None``/empty when
            nothing was recorded.

    Returns:
        A ``(bot_text, wav_path)`` tuple: the reply text and the path of the
        synthesized reply audio. ``("", None)`` is returned when there is no
        recording or nothing was recognised in it.
    """
    # No recording submitted: nothing to transcribe.
    if not audio:
        return "", None
    user_text = transcribe(audio)
    # Silence or unrecognisable audio: skip the LLM and TTS calls rather than
    # sending them an empty prompt.
    if not user_text or not user_text.strip():
        return "", None
    bot_text = generate_reply(user_text)
    wav_path = synthesize(bot_text)
    return bot_text, wav_path
# Web UI / API: takes a microphone recording and returns the reply text
# together with the synthesized reply audio produced by pipeline().
iface = gr.Interface(
    fn=pipeline,
    # NOTE(review): `source=` is the Gradio 3.x spelling; Gradio 4+ expects
    # `sources=["microphone"]`. Confirm against the Gradio version pinned in
    # the image before changing it.
    inputs=gr.Audio(source="microphone", type="filepath"),
    # The output component only plays back the returned file path, so it
    # needs no input source; "file" is not an accepted `source` value.
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    live=False,
    title="3DAItuber API",
)

# Listen on all interfaces, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)