Spaces:

sugitora
/

ai-interview-system

Sleeping

sugitora

AI面接システム - 初回リリース (Streamlit + Claude API)

6d1fe52 3 months ago

4.29 kB

	import os
	import struct
	import subprocess
	import tempfile
	import wave

	import numpy as np


	class SpeechRecognizer:
	    """Record speech from the microphone and transcribe it with Whisper.

	    Audio is captured in fixed-size chunks of 16-bit mono samples. A chunk
	    whose RMS amplitude exceeds ``silence_threshold`` counts as speech.
	    Recording stops once speech has been heard and is then followed by more
	    than ``silence_timeout`` seconds of quiet chunks, or when
	    ``max_duration`` seconds have been captured. The Whisper model is
	    loaded lazily on first use and cached on the instance.
	    """

	    def __init__(
	        self,
	        model_name: str = "base",
	        sample_rate: int = 16000,
	        silence_timeout: float = 3.0,
	        max_duration: float = 120.0,
	        silence_threshold: float = 500,
	    ):
	        """Store the recording and transcription settings.

	        Args:
	            model_name: Name passed to ``whisper.load_model``.
	            sample_rate: Capture rate in Hz, also written to the WAV header.
	            silence_timeout: Seconds of quiet after speech that end a recording.
	            max_duration: Upper bound on the recording length in seconds.
	            silence_threshold: RMS amplitude a chunk must exceed to count
	                as speech.
	        """
	        self.model_name = model_name
	        self.sample_rate = sample_rate
	        self.silence_timeout = silence_timeout
	        self.max_duration = max_duration
	        self.silence_threshold = silence_threshold
	        self.chunk_size = 1024  # samples read from the stream per iteration
	        self._model = None  # populated by _load_model() on first use

	    def _load_model(self):
	        """Return the Whisper model, loading it on the first call."""
	        if self._model is None:
	            # Imported lazily so the module can be imported without whisper.
	            import whisper

	            print(f"Whisperモデル ({self.model_name}) を読み込み中...")
	            self._model = whisper.load_model(self.model_name)
	        return self._model

	    def listen(self) -> tuple[str, float]:
	        """Record one spoken answer and return ``(text, duration_seconds)``.

	        Returns ``("", 0.0)`` when nothing was recorded or no chunk ever
	        exceeded the silence threshold. Otherwise ``text`` is the stripped
	        transcription and ``duration`` is derived from the number of
	        captured chunks (so it includes the trailing quiet period).
	        """
	        frames, has_speech = self._record()
	        if not frames or not has_speech:
	            return "", 0.0

	        duration = len(frames) * self.chunk_size / self.sample_rate
	        return self._transcribe(frames), duration

	    def _record(self) -> tuple[list[bytes], bool]:
	        """Capture microphone chunks; return them and whether speech was heard."""
	        import pyaudio

	        # Convert the time limits into chunk counts.
	        chunks_per_second = self.sample_rate / self.chunk_size
	        silence_limit = int(self.silence_timeout * chunks_per_second)
	        max_chunks = int(self.max_duration * chunks_per_second)

	        frames: list[bytes] = []
	        silent_chunks = 0
	        has_speech = False

	        audio = pyaudio.PyAudio()
	        try:
	            # Opened inside the try so PyAudio is terminated even when the
	            # input device cannot be opened.
	            stream = audio.open(
	                format=pyaudio.paInt16,
	                channels=1,
	                rate=self.sample_rate,
	                input=True,
	                frames_per_buffer=self.chunk_size,
	            )
	            try:
	                print("🎤 回答をどうぞ（話し終わったら少し待ってください）...")
	                for _ in range(max_chunks):
	                    data = stream.read(self.chunk_size, exception_on_overflow=False)
	                    frames.append(data)

	                    if self._calculate_rms(data) > self.silence_threshold:
	                        has_speech = True
	                        silent_chunks = 0
	                    else:
	                        silent_chunks += 1

	                    # Leading silence never ends the recording; only quiet
	                    # that follows speech does.
	                    if has_speech and silent_chunks > silence_limit:
	                        break
	            finally:
	                try:
	                    stream.stop_stream()
	                finally:
	                    stream.close()
	        finally:
	            audio.terminate()

	        return frames, has_speech

	    def _transcribe(self, frames: list[bytes]) -> str:
	        """Write the chunks to a temporary WAV file and transcribe it as Japanese."""
	        # Reserve a path and close the handle before reopening it by name.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            tmp_path = tmp.name

	        try:
	            with wave.open(tmp_path, "wb") as wf:
	                wf.setnchannels(1)
	                wf.setsampwidth(2)  # 16-bit samples
	                wf.setframerate(self.sample_rate)
	                wf.writeframes(b"".join(frames))

	            model = self._load_model()
	            result = model.transcribe(tmp_path, language="ja")
	            return result["text"].strip()
	        finally:
	            # Covers failures while writing the WAV as well as while transcribing.
	            os.unlink(tmp_path)

	    @staticmethod
	    def _calculate_rms(data: bytes) -> float:
	        """Return the RMS amplitude of native-endian 16-bit samples in ``data``.

	        A trailing odd byte is ignored; a buffer holding no complete sample
	        yields ``0.0``.
	        """
	        count = len(data) // 2
	        if count == 0:
	            return 0.0
	        # ``count`` bounds the read so an odd-length buffer does not raise.
	        # Samples are widened to float64 before squaring to avoid int16 overflow.
	        samples = np.frombuffer(data, dtype=np.int16, count=count).astype(np.float64)
	        return float(np.sqrt(np.mean(samples ** 2)))


	class SpeechSynthesizer:
	    """Speak Japanese text aloud by synthesizing an MP3 with gTTS and playing it."""

	    def __init__(self, player_command: tuple[str, ...] = ("afplay",)):
	        """Store the command used to play the synthesized audio.

	        Args:
	            player_command: Program and leading arguments of the audio
	                player; the MP3 path is appended as the final argument.
	                Any sequence of strings is accepted. Defaults to
	                ``afplay``.
	        """
	        self.player_command = list(player_command)

	    def speak(self, text: str) -> None:
	        """Synthesize ``text`` and block until playback has finished.

	        Empty or whitespace-only text is a no-op.

	        Raises:
	            subprocess.CalledProcessError: If the player exits with a
	                non-zero status (its output is captured on the exception).
	        """
	        # Nothing to say: skip synthesis and the player launch entirely.
	        # NOTE(review): gTTS is expected to reject empty text — confirm.
	        if not text or not text.strip():
	            return

	        # Imported lazily so the module can be imported without gTTS.
	        from gtts import gTTS

	        # Reserve a path and close the handle so gTTS and the player can
	        # open the file by name.
	        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
	            tmp_path = tmp.name

	        try:
	            gTTS(text=text, lang="ja").save(tmp_path)
	            subprocess.run(
	                [*self.player_command, tmp_path],
	                check=True,
	                capture_output=True,
	            )
	        finally:
	            # EAFP delete: the file may already be gone, which is fine.
	            try:
	                os.unlink(tmp_path)
	            except FileNotFoundError:
	                pass


	class DryRunRecognizer:
	    """For testing: reads the answer from the keyboard instead of the microphone."""

	    def listen(self) -> tuple[str, float]:
	        """Collect typed lines until a blank line or EOF.

	        Returns the lines joined with newlines and a duration of ``0.0``.
	        """
	        print("📝 回答を入力してください（空行で終了）:")
	        collected: list[str] = []
	        try:
	            # iter() with a sentinel stops as soon as input() returns "".
	            for entry in iter(input, ""):
	                collected.append(entry)
	        except EOFError:
	            # End of input also finishes the answer; keep what was typed.
	            pass
	        return "\n".join(collected), 0.0


	class DryRunSynthesizer:
	    """For testing: prints the text instead of playing audio."""

	    def speak(self, text: str) -> None:
	        """Print ``text`` to stdout behind a speaker marker."""
	        line = f"🔊 {text}"
	        print(line)