# my-voice-agent / audio_handler.py
# jblast94's picture
# Update audio_handler.py
# 1f1d266 verified
import numpy as np
import io
import wave
import os
import requests
class AudioHandler:
    """Bridge between the voice agent and external speech services.

    Text-to-speech is delegated to an HTTP backend whose URL is read from the
    ``KANITTS_URL`` environment variable. Speech-to-text is currently a
    placeholder that always returns ``None``.
    """

    def __init__(self):
        """Read the TTS endpoint from the environment and log the result."""
        # Sample rate will depend on upstream TTS; default is safe for playback.
        # The methods below take the real rate from the WAV header instead.
        self.sample_rate = 22050
        # Optional: URL of your external TTS Space or vLLM-style OpenAI server.
        # For your case, set KANITTS_URL in the my-voice-agent Space settings
        # to point at: https://jblast94-KaniTTS.hf.space (or the Space's
        # /proxy or API endpoint).
        self.kanitts_url = os.getenv("KANITTS_URL")
        print(f"AudioHandler initialized. KaniTTS URL: {self.kanitts_url}")

    @staticmethod
    def _decode_wav(audio_bytes):
        """Decode integer-PCM WAV bytes into mono float32 samples.

        Args:
            audio_bytes: Complete WAV file contents.

        Returns:
            A tuple ``(sample_rate, samples)`` where ``samples`` is a 1-D
            ``np.float32`` array scaled to ``[-1.0, 1.0)``. Multi-channel
            audio is averaged down to a single channel.

        Raises:
            wave.Error, EOFError: If the bytes are not a WAV file the ``wave``
                module can parse.
            ValueError: If the sample width is not 1, 2, 3 or 4 bytes.
        """
        with wave.open(io.BytesIO(audio_bytes), "rb") as wf:
            frame_rate = wf.getframerate()
            n_channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            raw = wf.readframes(wf.getnframes())

        frame_size = sample_width * n_channels
        if frame_size <= 0:
            raise ValueError(
                f"Invalid WAV layout: {n_channels} channel(s), "
                f"{sample_width} byte(s) per sample"
            )
        # A truncated download can end mid-frame; drop the partial frame so
        # the per-channel reshape below cannot fail.
        raw = raw[: len(raw) - len(raw) % frame_size]

        if sample_width == 1:
            # 8-bit WAV is unsigned with the zero level at 128.
            samples = np.frombuffer(raw, dtype=np.uint8).astype(np.float32)
            samples = (samples - 128.0) / 128.0
        elif sample_width == 2:
            # WAV PCM is little-endian regardless of the host byte order.
            samples = np.frombuffer(raw, dtype="<i2").astype(np.float32) / 32768.0
        elif sample_width == 3:
            # NumPy has no 24-bit dtype: assemble each little-endian triple
            # into an int32, then sign-extend from bit 23.
            triples = np.frombuffer(raw, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
            values = triples[:, 0] | (triples[:, 1] << 8) | (triples[:, 2] << 16)
            values = (values ^ 0x800000) - 0x800000
            samples = values.astype(np.float32) / 8388608.0
        elif sample_width == 4:
            # Scale in float64 first so the 32-bit range is not rounded early.
            wide = np.frombuffer(raw, dtype="<i4").astype(np.float64) / 2147483648.0
            samples = wide.astype(np.float32)
        else:
            raise ValueError(f"Unsupported WAV sample width: {sample_width} bytes")

        if n_channels > 1:
            # Downmix in float so averaging does not truncate to integers.
            samples = samples.reshape(-1, n_channels).mean(axis=1)
        return frame_rate, samples.astype(np.float32, copy=False)

    def text_to_speech(self, text: str):
        """Convert text to speech using an external TTS backend.

        Recommended setups:
        - Use your KaniTTS Space:
          - Expose an HTTP endpoint there (e.g. /tts) that returns raw WAV
            bytes.
          - Set KANITTS_URL in this Space to that endpoint URL.
        - OR use a vLLM/OpenAI-compatible TTS server:
          - Point KANITTS_URL to its TTS endpoint.

        The request is a JSON POST of ``{"text": text}`` and the response body
        is parsed as a WAV file; other container formats are reported as a
        decode failure.

        Args:
            text: The text to synthesize.

        Returns:
            A tuple (sample_rate, np.ndarray) suitable for Gradio's Audio
            component, or None if the text is blank, TTS is not configured,
            or the request/decoding fails.
        """
        if not text or not text.strip():
            return None
        if not self.kanitts_url:
            print("KANITTS_URL is not set; skipping TTS.")
            return None

        # This method is a best-effort boundary: any failure is logged and
        # reported as None so the rest of the app keeps working.
        try:
            # Example: POST JSON; adjust if your KaniTTS/vLLM API differs.
            # For KaniTTS Space, implement a compatible /tts handler that:
            #   - Accepts: { "text": "..." }
            #   - Returns: audio bytes (wav) as response body.
            resp = requests.post(
                self.kanitts_url,
                json={"text": text},
                timeout=30,
            )
            resp.raise_for_status()
            audio_bytes = resp.content
        except Exception as e:
            print(f"Error during TTS request to {self.kanitts_url}: {e}")
            return None

        if not audio_bytes:
            print("KaniTTS/vLLM TTS returned empty audio.")
            return None

        try:
            return self._decode_wav(audio_bytes)
        except Exception as e:
            # Reported separately from request errors so a format mismatch is
            # not mistaken for a network problem.
            print(f"Error decoding TTS audio from {self.kanitts_url}: {e}")
            return None

    def speech_to_text(self, audio_filepath):
        """Placeholder STT.

        Options:
        - Connect to OpenAI Whisper / local STT / another Space and call it
          here.
        - For now, returns None so the rest of the app still works.

        Args:
            audio_filepath: Path of the recorded audio; only logged.

        Returns:
            Always None.
        """
        if not audio_filepath:
            return None
        print(f"Speech-to-text not configured. Received audio file: '{audio_filepath}'")
        return None