# NOTE: "Spaces: Sleeping" banner below is residue from the Hugging Face
# Spaces page this file was copied from; it is not part of the module.
# Spaces: Sleeping, Sleeping
import io
import os
import wave

import numpy as np
import requests
class AudioHandler:
    """Bridge between the voice agent and an external TTS/STT backend over HTTP.

    TTS is delegated to a remote service (a KaniTTS Space or a vLLM-style
    OpenAI-compatible server) addressed by the KANITTS_URL environment
    variable. STT is currently a stub.
    """

    def __init__(self):
        # Default playback rate only; the real rate is read from the WAV
        # header the TTS backend returns.
        self.sample_rate = 22050
        # Optional: URL of your external TTS Space or vLLM-style OpenAI server.
        # For your case, set KANITTS_URL in the my-voice-agent Space settings to point at:
        # https://jblast94-KaniTTS.hf.space (or the Space's /proxy or API endpoint)
        self.kanitts_url = os.getenv("KANITTS_URL")
        print(f"AudioHandler initialized. KaniTTS URL: {self.kanitts_url}")

    @staticmethod
    def _decode_wav(audio_bytes):
        """Decode 16-bit PCM WAV bytes into (sample_rate, float32 mono array).

        Args:
            audio_bytes: Complete WAV file contents.

        Returns:
            Tuple (sample_rate, np.ndarray of float32 in [-1, 1)), mono.

        Raises:
            wave.Error: if the bytes are not a parseable WAV file.
            ValueError: if the PCM sample width is not 16-bit. The original
                code assumed int16 unconditionally, which silently
                misinterpreted 8/24/32-bit audio; now we fail explicitly
                (callers catch and return None, as before).
        """
        with wave.open(io.BytesIO(audio_bytes), "rb") as wf:
            sr = wf.getframerate()
            n_channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            frames = wf.readframes(wf.getnframes())
        if sample_width != 2:
            raise ValueError(f"Unsupported WAV sample width: {sample_width} bytes (expected 16-bit PCM)")
        audio_np = np.frombuffer(frames, dtype=np.int16)
        if n_channels > 1:
            # Downmix interleaved channels to mono by averaging.
            audio_np = audio_np.reshape(-1, n_channels).mean(axis=1).astype(np.int16)
        # Normalize int16 range to float32 in [-1, 1) for Gradio playback.
        return (sr, audio_np.astype(np.float32) / 32768.0)

    def text_to_speech(self, text: str):
        """
        Convert text to speech using an external TTS backend.
        Recommended setups:
        - Use your KaniTTS Space:
        - Expose an HTTP endpoint there (e.g. /tts) that returns raw WAV/OGG bytes.
        - Set KANITTS_URL in this Space to that endpoint URL.
        - OR use a vLLM/OpenAI-compatible TTS server:
        - Point KANITTS_URL to its TTS endpoint.
        Returns:
        A tuple (sample_rate, np.ndarray) suitable for Gradio's Audio component,
        or None if TTS is not configured or fails.
        """
        # Guard clauses: nothing to say, or no backend configured.
        if not text or not text.strip():
            return None
        if not self.kanitts_url:
            print("KANITTS_URL is not set; skipping TTS.")
            return None
        try:
            # Example: POST JSON; adjust if your KaniTTS/vLLM API differs.
            # For KaniTTS Space, implement a compatible /tts handler that:
            # - Accepts: { "text": "..." }
            # - Returns: audio bytes (wav/ogg) as response body.
            resp = requests.post(
                self.kanitts_url,
                json={"text": text},
                timeout=30,
            )
            resp.raise_for_status()
            audio_bytes = resp.content
            if not audio_bytes:
                print("KaniTTS/vLLM TTS returned empty audio.")
                return None
            # Try to parse as WAV; if different format, adapt accordingly.
            return self._decode_wav(audio_bytes)
        except Exception as e:
            # Best-effort: TTS failure must not crash the agent loop.
            print(f"Error during TTS request to {self.kanitts_url}: {e}")
            return None

    def speech_to_text(self, audio_filepath):
        """
        Placeholder STT.
        Options:
        - Connect to OpenAI Whisper / local STT / another Space and call it here.
        - For now, returns None so the rest of the app still works.
        """
        if not audio_filepath:
            return None
        print(f"Speech-to-text not configured. Received audio file: '{audio_filepath}'")
        return None