# computer-agent-v2 / voice_interface.py
# Deployed as part of Computer Agent v2.0 (commit 31e5b3a).
"""
voice_interface.py — Voice I/O for the Computer Agent
======================================================
Speech-to-Text (Whisper / Faster-Whisper) and TTS (HF Inference API)
"""
import os
import io
import tempfile
import base64
from typing import Optional, Dict, Any
import numpy as np
# STT
try:
from faster_whisper import WhisperModel
HAS_FASTER_WHISPER = True
except ImportError:
HAS_FASTER_WHISPER = False
# TTS via HF Inference
try:
from huggingface_hub import InferenceClient
HAS_HF_INFERENCE = True
except ImportError:
HAS_HF_INFERENCE = False
class VoiceInterface:
    """Handles audio input (STT) and output (TTS) for the agent.

    STT runs faster-whisper locally on CPU; TTS calls the Hugging Face
    Inference API. Both backends are created lazily on first use so that
    importing this module stays cheap.
    """

    def __init__(
        self,
        stt_model_size: str = "base",
        tts_model: str = "hexgrad/Kokoro-82M",
        hf_token: Optional[str] = None,
    ):
        """
        Args:
            stt_model_size: faster-whisper model size ("tiny", "base", ...).
            tts_model: Hugging Face model id used for text-to-speech.
            hf_token: HF API token; falls back to the HF_TOKEN env var.
        """
        self.stt_model_size = stt_model_size
        self.tts_model = tts_model
        self.hf_token = hf_token or os.getenv("HF_TOKEN")
        self._stt: Optional[Any] = None          # lazily-created WhisperModel
        self._tts_client: Optional[Any] = None   # lazily-created InferenceClient

    # ------------------------------------------------------------------
    # STT
    # ------------------------------------------------------------------
    def _load_stt(self) -> Any:
        """Create (once) and return the faster-whisper model.

        Raises:
            RuntimeError: if faster-whisper is not installed.
        """
        if self._stt is None:
            if not HAS_FASTER_WHISPER:
                raise RuntimeError("faster-whisper not installed. Run: pip install faster-whisper")
            # Use CPU for Spaces compatibility; int8 keeps the memory footprint low.
            self._stt = WhisperModel(self.stt_model_size, device="cpu", compute_type="int8")
        return self._stt

    def transcribe(self, audio_np: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Transcribe an audio waveform to text.

        Args:
            audio_np: float32 mono samples, expected in [-1.0, 1.0].
            sample_rate: sample rate of audio_np in Hz.

        Returns:
            Dict with "text", "language", and "probability" keys.
        """
        model = self._load_stt()
        # faster-whisper expects a file path or bytes; save to a temp wav.
        import soundfile as sf
        # Close the handle before writing so this also works on Windows,
        # where an open NamedTemporaryFile cannot be reopened by path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_path = f.name
        try:
            sf.write(tmp_path, audio_np, sample_rate)
            segments, info = model.transcribe(tmp_path, beam_size=5)
            # segments is a generator; consume it before the file is deleted.
            text = " ".join(seg.text for seg in segments)
        finally:
            # Always remove the temp file, even if transcription raises.
            os.unlink(tmp_path)
        return {
            "text": text.strip(),
            "language": info.language,
            "probability": info.language_probability,
        }

    def transcribe_from_file(self, file_path: str) -> Dict[str, Any]:
        """Transcribe an audio file on disk; same return shape as transcribe()."""
        model = self._load_stt()
        segments, info = model.transcribe(file_path, beam_size=5)
        text = " ".join(seg.text for seg in segments)
        return {
            "text": text.strip(),
            "language": info.language,
            "probability": info.language_probability,
        }

    # ------------------------------------------------------------------
    # TTS
    # ------------------------------------------------------------------
    def _load_tts(self) -> Any:
        """Create (once) and return the HF InferenceClient for TTS.

        Raises:
            RuntimeError: if huggingface_hub is not installed.
        """
        if self._tts_client is None:
            if not HAS_HF_INFERENCE:
                raise RuntimeError("huggingface_hub not installed")
            self._tts_client = InferenceClient(model=self.tts_model, token=self.hf_token)
        return self._tts_client

    def synthesize(self, text: str, voice: str = "af") -> bytes:
        """Synthesize text to speech bytes.

        Args:
            text: text to speak.
            voice: voice preset (not forwarded to the HF endpoint currently;
                kept for interface stability).

        Returns:
            Raw audio bytes (usually WAV or MP3 depending on the model).
        """
        client = self._load_tts()
        try:
            audio = client.text_to_speech(text, model=self.tts_model)
        except Exception:
            # Deliberate best-effort fallback: the configured model may not be
            # served by the Inference API; retry with a known-hosted VITS model.
            alt_client = InferenceClient(token=self.hf_token)
            audio = alt_client.text_to_speech(text, model="espnet/kan-bayashi_ljspeech_vits")
        # Some client versions return a file-like object instead of bytes.
        if hasattr(audio, "read"):
            return audio.read()
        return audio

    def synthesize_to_file(self, text: str, output_path: str, voice: str = "af") -> str:
        """Synthesize text and write the audio bytes to output_path."""
        audio_bytes = self.synthesize(text, voice)
        with open(output_path, "wb") as f:
            f.write(audio_bytes)
        return output_path

    # ------------------------------------------------------------------
    # Gradio helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _prepare_audio(audio_np: np.ndarray) -> np.ndarray:
        """Normalize raw Gradio audio to mono float32 in [-1.0, 1.0].

        Gradio microphone input is typically int16 PCM; Whisper expects
        float samples, so integer input is scaled by its dtype maximum.
        """
        if np.issubdtype(audio_np.dtype, np.integer):
            # Bug fix: integer samples were previously cast to float32 without
            # scaling, producing values far outside [-1, 1].
            audio_np = audio_np.astype(np.float32) / float(np.iinfo(audio_np.dtype).max)
        elif audio_np.dtype != np.float32:
            audio_np = audio_np.astype(np.float32)
        if audio_np.ndim > 1:
            # Down-mix multi-channel audio to mono; mean() preserves float32.
            audio_np = audio_np.mean(axis=1).astype(np.float32)
        return audio_np

    def process_gradio_audio(self, audio_tuple) -> str:
        """Transcribe Gradio audio input (a (sample_rate, numpy_array) tuple).

        Returns the transcript text, or "" when no audio was recorded.
        """
        if audio_tuple is None:
            return ""
        sample_rate, audio_np = audio_tuple
        audio_np = self._prepare_audio(audio_np)
        result = self.transcribe(audio_np, sample_rate)
        return result["text"]