File size: 1,159 Bytes
21f4884
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import subprocess

import google.generativeai as genai
import gradio as gr
from faster_whisper import WhisperModel

# Gemini setup: take the API key from the GOOGLE_API_KEY environment variable
# so a real key never has to be written into this file. The original
# placeholder string is kept as the fallback when the variable is unset.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY"))

# Load the Whisper model once at import time; transcribe() reuses this
# instance for every request.
model = WhisperModel("small")

def transcribe(audio):
    """Transcribe ``audio`` with the module-level Whisper ``model``.

    Args:
        audio: The audio input forwarded unchanged to ``model.transcribe``
            (the Gradio input in this file is configured with
            ``type="filepath"``).

    Returns:
        The ``text`` of every returned segment, concatenated in order with
        no separator.
    """
    # model.transcribe returns a pair; only the segments are needed here.
    segments, _info = model.transcribe(audio, beam_size=5)
    return "".join(segment.text for segment in segments)

def generate_reply(text, model_name="gemini-1.5-flash"):
    """Send ``text`` to Gemini as a single user turn and return the reply.

    Args:
        text: The user's message.
        model_name: Gemini model identifier passed to
            ``genai.GenerativeModel``. NOTE(review): confirm this default is
            available to the configured API key (``genai.list_models()``
            lists the valid names).

    Returns:
        The reply as a string (``response.text``).

    Raises:
        Whatever the SDK raises for a failed request. NOTE(review):
        ``response.text`` is expected to raise when the response carries no
        text (e.g. a blocked prompt) -- confirm against the SDK version in
        use.
    """
    # The previous call, genai.chat.create(model="gemini-mobile", ...), does
    # not exist in the google.generativeai SDK; GenerativeModel is the
    # documented entry point for text generation.
    gemini = genai.GenerativeModel(model_name)
    response = gemini.generate_content(text)
    return response.text

def synthesize(text, speaker="1", wav_path="/tmp/out.wav"):
    """Synthesize ``text`` to a wav file by invoking the voicevox_engine CLI.

    Args:
        text: The text to speak.
        speaker: Value passed to ``--speaker``; converted with ``str()``.
        wav_path: Output file passed to ``--wav``. The default is a single
            fixed path, so overlapping calls overwrite each other's output;
            pass a unique path when requests can run concurrently.

    Returns:
        ``wav_path``, the path the command was asked to write.

    Raises:
        subprocess.CalledProcessError: The command exited with a non-zero
            status.
        OSError: The ``voicevox_engine`` executable could not be started.
    """
    # Example of calling voicevox_engine through its CLI.
    # NOTE(review): confirm the installed voicevox_engine accepts these flags.
    command = [
        "voicevox_engine",
        "--text", text,
        "--speaker", str(speaker),
        "--wav", wav_path,
    ]
    # check=True surfaces a failed synthesis instead of silently returning a
    # path to a stale or missing file, as the unchecked Popen/wait did.
    subprocess.run(command, check=True)
    return wav_path

def pipeline(audio):
    """Run one voice turn: transcribe, generate a reply, synthesize it.

    Args:
        audio: The recorded audio handed over by the Gradio input, or
            ``None`` when nothing was recorded.

    Returns:
        A ``(reply_text, wav_path)`` tuple. When there is no audio or the
        transcript is blank, returns ``("", None)`` without calling the
        language model or the synthesizer.
    """
    # Submitting without a recording yields None; there is nothing to process.
    if audio is None:
        return "", None

    user_text = transcribe(audio)
    # Silence or noise can transcribe to nothing; skip the reply and
    # synthesis steps rather than sending an empty prompt.
    if not user_text.strip():
        return "", None

    bot_text = generate_reply(user_text)
    wav_path = synthesize(bot_text)
    return bot_text, wav_path

# Gradio UI: a microphone recording goes in; the reply text and the
# synthesized reply audio come out.
mic_input = gr.Audio(source="microphone", type="filepath")
reply_outputs = [
    gr.Textbox(),
    gr.Audio(source="file", type="filepath"),
]
iface = gr.Interface(
    fn=pipeline,
    inputs=mic_input,
    outputs=reply_outputs,
    live=False,
    title="3DAItuber API",
)
# Serve on all network interfaces, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)