""" voice_interface.py — Voice I/O for the Computer Agent ====================================================== Speech-to-Text (Whisper / Faster-Whisper) and TTS (HF Inference API) """ import os import io import tempfile import base64 from typing import Optional, Dict, Any import numpy as np # STT try: from faster_whisper import WhisperModel HAS_FASTER_WHISPER = True except ImportError: HAS_FASTER_WHISPER = False # TTS via HF Inference try: from huggingface_hub import InferenceClient HAS_HF_INFERENCE = True except ImportError: HAS_HF_INFERENCE = False class VoiceInterface: """Handles audio input (STT) and output (TTS) for the agent.""" def __init__( self, stt_model_size: str = "base", tts_model: str = "hexgrad/Kokoro-82M", hf_token: Optional[str] = None, ): self.stt_model_size = stt_model_size self.tts_model = tts_model self.hf_token = hf_token or os.getenv("HF_TOKEN") self._stt: Optional[Any] = None self._tts_client: Optional[Any] = None # ------------------------------------------------------------------ # STT # ------------------------------------------------------------------ def _load_stt(self) -> Any: if self._stt is None: if HAS_FASTER_WHISPER: # Use CPU for Spaces compatibility; auto-detect compute type self._stt = WhisperModel(self.stt_model_size, device="cpu", compute_type="int8") else: raise RuntimeError("faster-whisper not installed. Run: pip install faster-whisper") return self._stt def transcribe(self, audio_np: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]: """Transcribe audio waveform to text. audio_np: numpy array of float32 audio samples """ model = self._load_stt() # faster-whisper expects a file path or bytes; save to temp wav import soundfile as sf with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: sf.write(f.name, audio_np, sample_rate) segments, info = model.transcribe(f.name, beam_size=5) text = " ".join([seg.text for seg in segments]) os.unlink(f.name) return { "text": text.strip(), "language": info.language, "probability": info.language_probability, } def transcribe_from_file(self, file_path: str) -> Dict[str, Any]: model = self._load_stt() segments, info = model.transcribe(file_path, beam_size=5) text = " ".join([seg.text for seg in segments]) return { "text": text.strip(), "language": info.language, "probability": info.language_probability, } # ------------------------------------------------------------------ # TTS # ------------------------------------------------------------------ def _load_tts(self) -> Any: if self._tts_client is None: if HAS_HF_INFERENCE: self._tts_client = InferenceClient(model=self.tts_model, token=self.hf_token) else: raise RuntimeError("huggingface_hub not installed") return self._tts_client def synthesize(self, text: str, voice: str = "af") -> bytes: """Synthesize text to speech bytes. Returns raw audio bytes (usually WAV or MP3 depending on model). """ client = self._load_tts() try: audio = client.text_to_speech(text, model=self.tts_model) if hasattr(audio, "read"): return audio.read() return audio except Exception as e: # Fallback to standard TTS endpoint alt_client = InferenceClient(token=self.hf_token) audio = alt_client.text_to_speech(text, model="espnet/kan-bayashi_ljspeech_vits") if hasattr(audio, "read"): return audio.read() return audio def synthesize_to_file(self, text: str, output_path: str, voice: str = "af") -> str: audio_bytes = self.synthesize(text, voice) with open(output_path, "wb") as f: f.write(audio_bytes) return output_path # ------------------------------------------------------------------ # Gradio helpers # ------------------------------------------------------------------ def process_gradio_audio(self, audio_tuple) -> str: """Process Gradio audio input (tuple of sample_rate, numpy_array).""" if audio_tuple is None: return "" sample_rate, audio_np = audio_tuple # Convert to mono float32 if needed if audio_np.ndim > 1: audio_np = audio_np.mean(axis=1) if audio_np.dtype != np.float32: audio_np = audio_np.astype(np.float32) result = self.transcribe(audio_np, sample_rate) return result["text"]