"""
voice_interface.py — Voice I/O for the Computer Agent
======================================================
Speech-to-Text (Whisper / Faster-Whisper) and TTS (HF Inference API)
"""
import os
import io
import tempfile
import base64
from typing import Optional, Dict, Any
import numpy as np
# STT
try:
    from faster_whisper import WhisperModel
    HAS_FASTER_WHISPER = True
except ImportError:
    HAS_FASTER_WHISPER = False

# TTS via HF Inference
try:
    from huggingface_hub import InferenceClient
    HAS_HF_INFERENCE = True
except ImportError:
    HAS_HF_INFERENCE = False
class VoiceInterface:
"""Handles audio input (STT) and output (TTS) for the agent."""
def __init__(
self,
stt_model_size: str = "base",
tts_model: str = "hexgrad/Kokoro-82M",
hf_token: Optional[str] = None,
):
self.stt_model_size = stt_model_size
self.tts_model = tts_model
self.hf_token = hf_token or os.getenv("HF_TOKEN")
self._stt: Optional[Any] = None
self._tts_client: Optional[Any] = None
    # ------------------------------------------------------------------
    # STT
    # ------------------------------------------------------------------
    def _load_stt(self) -> Any:
        if self._stt is None:
            if HAS_FASTER_WHISPER:
                # Use CPU for Spaces compatibility; int8 keeps the memory footprint small
                self._stt = WhisperModel(self.stt_model_size, device="cpu", compute_type="int8")
            else:
                raise RuntimeError("faster-whisper not installed. Run: pip install faster-whisper")
        return self._stt
    def transcribe(self, audio_np: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Transcribe an audio waveform to text.

        audio_np: numpy array of float32 audio samples in the range [-1.0, 1.0]
        """
        model = self._load_stt()
        # Write the waveform to a temporary WAV and hand faster-whisper the file path
        import soundfile as sf

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, audio_np, sample_rate)
            segments, info = model.transcribe(f.name, beam_size=5)
            text = " ".join([seg.text for seg in segments])
        os.unlink(f.name)
        return {
            "text": text.strip(),
            "language": info.language,
            "probability": info.language_probability,
        }
    def transcribe_from_file(self, file_path: str) -> Dict[str, Any]:
        """Transcribe an audio file on disk to text."""
        model = self._load_stt()
        segments, info = model.transcribe(file_path, beam_size=5)
        text = " ".join([seg.text for seg in segments])
        return {
            "text": text.strip(),
            "language": info.language,
            "probability": info.language_probability,
        }
    # ------------------------------------------------------------------
    # TTS
    # ------------------------------------------------------------------
    def _load_tts(self) -> Any:
        if self._tts_client is None:
            if HAS_HF_INFERENCE:
                self._tts_client = InferenceClient(model=self.tts_model, token=self.hf_token)
            else:
                raise RuntimeError("huggingface_hub not installed. Run: pip install huggingface_hub")
        return self._tts_client
    def synthesize(self, text: str, voice: str = "af") -> bytes:
        """Synthesize text to speech.

        Returns raw audio bytes (usually WAV or MP3, depending on the model).
        The voice argument is accepted but not currently forwarded to the model.
        """
        client = self._load_tts()
        try:
            audio = client.text_to_speech(text, model=self.tts_model)
            if hasattr(audio, "read"):
                return audio.read()
            return audio
        except Exception:
            # Fall back to a standard TTS endpoint if the primary model fails
            alt_client = InferenceClient(token=self.hf_token)
            audio = alt_client.text_to_speech(text, model="espnet/kan-bayashi_ljspeech_vits")
            if hasattr(audio, "read"):
                return audio.read()
            return audio
    def synthesize_to_file(self, text: str, output_path: str, voice: str = "af") -> str:
        """Synthesize text to speech and write the audio bytes to output_path."""
        audio_bytes = self.synthesize(text, voice)
        with open(output_path, "wb") as f:
            f.write(audio_bytes)
        return output_path
    # ------------------------------------------------------------------
    # Gradio helpers
    # ------------------------------------------------------------------
    def process_gradio_audio(self, audio_tuple) -> str:
        """Process Gradio audio input (a (sample_rate, numpy_array) tuple) and return the transcript."""
        if audio_tuple is None:
            return ""
        sample_rate, audio_np = audio_tuple
        # Gradio delivers integer PCM (typically int16); normalize to float32 in [-1.0, 1.0]
        if np.issubdtype(audio_np.dtype, np.integer):
            audio_np = audio_np.astype(np.float32) / np.iinfo(audio_np.dtype).max
        elif audio_np.dtype != np.float32:
            audio_np = audio_np.astype(np.float32)
        # Mix down to mono if the input is multi-channel
        if audio_np.ndim > 1:
            audio_np = audio_np.mean(axis=1)
        result = self.transcribe(audio_np, sample_rate)
        return result["text"]