Spaces:
Sleeping
Sleeping
| """ | |
| voice_interface.py — Voice I/O for the Computer Agent | |
| ====================================================== | |
| Speech-to-Text (Whisper / Faster-Whisper) and TTS (HF Inference API) | |
| """ | |
| import os | |
| import io | |
| import tempfile | |
| import base64 | |
| from typing import Optional, Dict, Any | |
| import numpy as np | |
| # STT | |
| try: | |
| from faster_whisper import WhisperModel | |
| HAS_FASTER_WHISPER = True | |
| except ImportError: | |
| HAS_FASTER_WHISPER = False | |
| # TTS via HF Inference | |
| try: | |
| from huggingface_hub import InferenceClient | |
| HAS_HF_INFERENCE = True | |
| except ImportError: | |
| HAS_HF_INFERENCE = False | |
class VoiceInterface:
    """Handles audio input (STT) and output (TTS) for the agent.

    STT uses faster-whisper locally (CPU/int8 for Spaces compatibility);
    TTS goes through the HF Inference API. Both backends are created
    lazily on first use so importing this module stays cheap.
    """

    def __init__(
        self,
        stt_model_size: str = "base",
        tts_model: str = "hexgrad/Kokoro-82M",
        hf_token: Optional[str] = None,
    ):
        """Configure backends without loading them.

        Parameters:
            stt_model_size: faster-whisper model size ("tiny", "base", ...).
            tts_model: HF model id used for text-to-speech.
            hf_token: HF API token; falls back to the HF_TOKEN env var so
                Spaces secrets work out of the box.
        """
        self.stt_model_size = stt_model_size
        self.tts_model = tts_model
        self.hf_token = hf_token or os.getenv("HF_TOKEN")
        self._stt: Optional[Any] = None          # lazily-created WhisperModel
        self._tts_client: Optional[Any] = None   # lazily-created InferenceClient

    # ------------------------------------------------------------------
    # STT
    # ------------------------------------------------------------------
    def _load_stt(self) -> Any:
        """Lazily create and cache the faster-whisper model.

        Raises:
            RuntimeError: if faster-whisper is not installed.
        """
        if self._stt is None:
            if not HAS_FASTER_WHISPER:
                raise RuntimeError(
                    "faster-whisper not installed. Run: pip install faster-whisper"
                )
            # Use CPU for Spaces compatibility; int8 keeps memory low.
            self._stt = WhisperModel(
                self.stt_model_size, device="cpu", compute_type="int8"
            )
        return self._stt

    @staticmethod
    def _collect_segments(segments, info) -> Dict[str, Any]:
        """Join whisper segments into the result dict shared by both transcribe paths."""
        text = " ".join(seg.text for seg in segments)
        return {
            "text": text.strip(),
            "language": info.language,
            "probability": info.language_probability,
        }

    def transcribe(self, audio_np: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Transcribe an in-memory waveform to text.

        Parameters:
            audio_np: float32 mono audio samples.
            sample_rate: sampling rate of audio_np in Hz.

        Returns:
            Dict with "text", "language" and "probability" keys.
        """
        model = self._load_stt()
        # faster-whisper expects a file path or bytes; round-trip through a
        # temporary WAV. Close the handle before writing (Windows cannot open
        # an already-open NamedTemporaryFile by name) and always unlink it,
        # even when transcription raises — the original leaked the file on error.
        import soundfile as sf

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_path = f.name
        try:
            sf.write(tmp_path, audio_np, sample_rate)
            segments, info = model.transcribe(tmp_path, beam_size=5)
            return self._collect_segments(segments, info)
        finally:
            os.unlink(tmp_path)

    def transcribe_from_file(self, file_path: str) -> Dict[str, Any]:
        """Transcribe an audio file on disk; see transcribe() for the result shape."""
        model = self._load_stt()
        segments, info = model.transcribe(file_path, beam_size=5)
        return self._collect_segments(segments, info)

    # ------------------------------------------------------------------
    # TTS
    # ------------------------------------------------------------------
    def _load_tts(self) -> Any:
        """Lazily create and cache the HF Inference client for TTS.

        Raises:
            RuntimeError: if huggingface_hub is not installed.
        """
        if self._tts_client is None:
            if not HAS_HF_INFERENCE:
                raise RuntimeError("huggingface_hub not installed")
            self._tts_client = InferenceClient(model=self.tts_model, token=self.hf_token)
        return self._tts_client

    @staticmethod
    def _as_bytes(audio) -> bytes:
        """Normalize an inference result to raw bytes (it may be file-like)."""
        if hasattr(audio, "read"):
            return audio.read()
        return audio

    def synthesize(self, text: str, voice: str = "af") -> bytes:
        """Synthesize text to speech bytes.

        Returns raw audio bytes (usually WAV or MP3 depending on model).

        NOTE(review): `voice` is accepted for API compatibility but is not
        forwarded to the endpoint — confirm whether the TTS model supports it.
        """
        client = self._load_tts()
        try:
            audio = client.text_to_speech(text, model=self.tts_model)
            return self._as_bytes(audio)
        except Exception as primary_err:
            # Primary model failed; fall back to a known-good public TTS model.
            # If the fallback also fails, chain the original failure so the
            # caller sees why the primary model was abandoned.
            try:
                alt_client = InferenceClient(token=self.hf_token)
                audio = alt_client.text_to_speech(
                    text, model="espnet/kan-bayashi_ljspeech_vits"
                )
                return self._as_bytes(audio)
            except Exception as fallback_err:
                raise fallback_err from primary_err

    def synthesize_to_file(self, text: str, output_path: str, voice: str = "af") -> str:
        """Synthesize `text` and write the audio bytes to output_path.

        Returns:
            The output_path that was written.
        """
        audio_bytes = self.synthesize(text, voice)
        with open(output_path, "wb") as f:
            f.write(audio_bytes)
        return output_path

    # ------------------------------------------------------------------
    # Gradio helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _to_mono_float32(audio_np: np.ndarray) -> np.ndarray:
        """Normalize a Gradio waveform to mono float32 in [-1, 1].

        Gradio delivers integer PCM (typically int16); a bare astype would
        hand Whisper samples in the +/-32768 range, so integer input is
        scaled down by its dtype's max before any further conversion.
        """
        if np.issubdtype(audio_np.dtype, np.integer):
            audio_np = audio_np.astype(np.float32) / np.iinfo(audio_np.dtype).max
        if audio_np.ndim > 1:
            # Downmix channels to mono (mean promotes to float64; recast below).
            audio_np = audio_np.mean(axis=1)
        if audio_np.dtype != np.float32:
            audio_np = audio_np.astype(np.float32)
        return audio_np

    def process_gradio_audio(self, audio_tuple) -> str:
        """Process Gradio audio input (tuple of sample_rate, numpy_array).

        Returns the transcribed text, or "" when no audio was provided.
        """
        if audio_tuple is None:
            return ""
        sample_rate, audio_np = audio_tuple
        audio_np = self._to_mono_float32(audio_np)
        result = self.transcribe(audio_np, sample_rate)
        return result["text"]