ai-interview-system / speech.py
sugitora
AI面接システム - 初回リリース (Streamlit + Claude API)
6d1fe52
import os
import struct
import subprocess
import tempfile
import wave
import numpy as np
class SpeechRecognizer:
    """Record one spoken answer from the microphone and transcribe it with Whisper.

    Recording stops once speech has been detected and is followed by
    ``silence_timeout`` seconds of audio below ``silence_threshold``, or
    when ``max_duration`` seconds have been captured, whichever is first.
    """

    def __init__(
        self,
        model_name: str = "base",
        sample_rate: int = 16000,
        silence_timeout: float = 3.0,
        max_duration: float = 120.0,
        silence_threshold: float = 500.0,
    ):
        """Store the recording and transcription settings.

        Args:
            model_name: Name passed to ``whisper.load_model``.
            sample_rate: Capture rate in Hz for the 16-bit mono input stream.
            silence_timeout: Seconds of continuous quiet (after speech was
                heard) that end the recording.
            max_duration: Upper bound on the recording length in seconds.
            silence_threshold: RMS amplitude of a chunk above which it is
                counted as speech. Defaults to the previously hard-coded 500.
        """
        self.model_name = model_name
        self.sample_rate = sample_rate
        self.silence_timeout = silence_timeout
        self.max_duration = max_duration
        self.silence_threshold = silence_threshold
        self.chunk_size = 1024
        # Loaded lazily by _load_model() so constructing the recognizer is cheap.
        self._model = None

    def _load_model(self):
        """Return the Whisper model, loading and caching it on first use."""
        if self._model is None:
            # Imported here so the module can be imported without whisper installed.
            import whisper

            print(f"Whisperモデル ({self.model_name}) を読み込み中...")
            self._model = whisper.load_model(self.model_name)
        return self._model

    def listen(self) -> tuple[str, float]:
        """Record from the microphone and return ``(text, duration_seconds)``.

        Returns ``("", 0.0)`` when no chunk ever exceeded the silence
        threshold. The duration covers every captured chunk, including
        leading and trailing silence.
        """
        frames, has_speech = self._record()
        if not frames or not has_speech:
            return "", 0.0
        duration = len(frames) * self.chunk_size / self.sample_rate
        return self._transcribe(frames), duration

    def _record(self) -> tuple[list[bytes], bool]:
        """Capture audio chunks until trailing silence or the maximum duration."""
        import pyaudio

        chunks_per_second = self.sample_rate / self.chunk_size
        silence_limit = int(self.silence_timeout * chunks_per_second)
        max_chunks = int(self.max_duration * chunks_per_second)

        frames: list[bytes] = []
        silent_chunks = 0
        has_speech = False

        audio = pyaudio.PyAudio()
        try:
            # Opening the stream can fail (no input device, device busy);
            # the outer finally still releases the PyAudio instance then.
            stream = audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=self.chunk_size,
            )
            try:
                print("🎤 回答をどうぞ(話し終わったら少し待ってください)...")
                for _ in range(max_chunks):
                    data = stream.read(self.chunk_size, exception_on_overflow=False)
                    frames.append(data)
                    if self._calculate_rms(data) > self.silence_threshold:
                        has_speech = True
                        silent_chunks = 0
                    else:
                        silent_chunks += 1
                    # Silence before any speech never ends the recording.
                    if has_speech and silent_chunks > silence_limit:
                        break
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            audio.terminate()
        return frames, has_speech

    def _transcribe(self, frames: list[bytes]) -> str:
        """Write *frames* to a temporary WAV file and return the Whisper transcript."""
        # Close the handle first so the file can be reopened by name.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            # The WAV write sits inside the try so a failure here does not
            # leave the temporary file behind.
            with wave.open(tmp_path, "wb") as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)  # 16-bit samples, matching paInt16
                wf.setframerate(self.sample_rate)
                wf.writeframes(b"".join(frames))
            model = self._load_model()
            result = model.transcribe(tmp_path, language="ja")
            return result["text"].strip()
        finally:
            os.unlink(tmp_path)

    @staticmethod
    def _calculate_rms(data: bytes) -> float:
        """Return the RMS amplitude of native-endian signed 16-bit PCM *data*.

        A trailing odd byte is ignored; fewer than two bytes yields ``0.0``.
        """
        count = len(data) // 2
        if count == 0:
            return 0.0
        # Read whole samples only, then widen so squaring cannot overflow int16.
        samples = np.frombuffer(data, dtype=np.int16, count=count).astype(np.float64)
        return float(np.sqrt(np.mean(samples ** 2)))
class SpeechSynthesizer:
    """Speak Japanese text aloud using gTTS and the ``afplay`` command."""

    def speak(self, text: str) -> None:
        """Synthesize *text* to a temporary MP3 and play it, blocking until done.

        Empty or whitespace-only text is a no-op: there is nothing to
        synthesize, and gTTS is understood to reject such input.

        Raises:
            subprocess.CalledProcessError: if ``afplay`` exits with a
                non-zero status.
            FileNotFoundError: if the ``afplay`` executable is not available.
        """
        if not text or not text.strip():
            return

        # Imported here so the module can be imported without gTTS installed.
        from gtts import gTTS

        # Close the handle first so gTTS and afplay can open the file by name.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            gTTS(text=text, lang="ja").save(tmp_path)
            subprocess.run(
                ["afplay", tmp_path],
                check=True,
                capture_output=True,
            )
        finally:
            # Best-effort cleanup; the file may already be gone.
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass
class DryRunRecognizer:
    """For testing: read the answer from the keyboard instead of the microphone."""

    def listen(self) -> tuple[str, float]:
        """Collect typed lines until an empty line or EOF.

        Returns the lines joined with newlines and a duration of ``0.0``.
        """
        print("📝 回答を入力してください(空行で終了):")
        collected: list[str] = []
        try:
            # iter() with a sentinel stops at the first empty line.
            for entry in iter(input, ""):
                collected.append(entry)
        except EOFError:
            # End of input finishes the answer; lines read so far are kept.
            pass
        return "\n".join(collected), 0.0
class DryRunSynthesizer:
    """For testing: print the text instead of speaking it."""

    def speak(self, text: str) -> None:
        """Write *text* to standard output, prefixed with a speaker emoji."""
        message = f"🔊 {text}"
        print(message)