"""Voice-chat pipeline: microphone audio -> Whisper STT -> Gemini reply -> VOICEVOX TTS.

Served as a Gradio web app on port 7860.
"""
import os
import subprocess
import tempfile

import gradio as gr
from faster_whisper import WhisperModel
import google.generativeai as genai

# Gemini setup.
# FIX: never hard-code the API key in source; read it from the environment.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# FIX: genai.chat.create / res.last is the removed PaLM API, and "gemini-mobile"
# is not a valid model name. Use the current GenerativeModel interface.
gemini = genai.GenerativeModel("gemini-1.5-flash")

# Load the Whisper model once at startup (loading per-request would be very slow).
model = WhisperModel("small")


def transcribe(audio):
    """Transcribe an audio file to text.

    Args:
        audio: Path to the recorded audio file (Gradio passes a filepath).

    Returns:
        The concatenated transcription text of all segments.
    """
    segments, _ = model.transcribe(audio, beam_size=5)
    return "".join(seg.text for seg in segments)


def generate_reply(text):
    """Generate a chat reply for *text* with Gemini.

    Args:
        text: The user's transcribed utterance.

    Returns:
        The model's reply as plain text.
    """
    res = gemini.generate_content(text)
    return res.text


def synthesize(text):
    """Synthesize speech for *text* via the voicevox_engine CLI.

    Args:
        text: Text to speak.

    Returns:
        Path to the generated WAV file.

    Raises:
        subprocess.CalledProcessError: If the CLI exits with a non-zero status.
    """
    # FIX: a fixed /tmp/out.wav path collides across concurrent requests;
    # use a unique temp file instead.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    # FIX: subprocess.run(check=True) replaces Popen+wait with no status check,
    # so a failed synthesis raises instead of returning a stale/empty file.
    subprocess.run(
        ["voicevox_engine", "--text", text, "--speaker", "1", "--wav", wav_path],
        check=True,
    )
    return wav_path


def pipeline(audio):
    """Full voice-chat round trip: STT -> LLM reply -> TTS.

    Args:
        audio: Path to the user's recorded audio.

    Returns:
        Tuple of (reply text, path to synthesized reply WAV).
    """
    user_text = transcribe(audio)
    bot_text = generate_reply(user_text)
    wav_path = synthesize(bot_text)
    return bot_text, wav_path


# FIX: Gradio 4.x removed the `source=` argument. Inputs use `sources=[...]`;
# an output Audio component takes no source argument at all.
iface = gr.Interface(
    fn=pipeline,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    live=False,
    title="3DAItuber API",
)

# FIX: guard launch so importing this module does not start a server.
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)