# Page residue from the hosting site's file viewer (Spaces status: Sleeping; file size: 3,146 Bytes).
import numpy as np
import io
import wave
import os
import requests
class AudioHandler:
    """Speech I/O helper: text-to-speech via an external HTTP backend, plus an STT stub.

    The TTS endpoint is read from the ``KANITTS_URL`` environment variable when
    the handler is constructed. When it is unset, ``text_to_speech`` is a no-op
    that returns ``None``.
    """

    def __init__(self):
        # Default sample rate. The methods of this class do not read it: decoding
        # uses the rate stored in the WAV header returned by the backend.
        self.sample_rate = 22050
        # Optional: URL of your external TTS Space or vLLM-style OpenAI server.
        # For your case, set KANITTS_URL in the my-voice-agent Space settings to point at:
        # https://jblast94-KaniTTS.hf.space (or the Space's /proxy or API endpoint)
        self.kanitts_url = os.getenv("KANITTS_URL")
        print(f"AudioHandler initialized. KaniTTS URL: {self.kanitts_url}")

    @staticmethod
    def _decode_wav(audio_bytes):
        """Decode PCM WAV bytes into ``(sample_rate, mono float32 samples)``.

        Supports 8-, 16-, 24- and 32-bit integer PCM. Samples are scaled to
        [-1.0, 1.0) and multi-channel audio is averaged down to mono.

        Raises:
            wave.Error: if the bytes are not a WAV file the ``wave`` module accepts.
            ValueError: if the sample width is not 1, 2, 3 or 4 bytes.
        """
        with wave.open(io.BytesIO(audio_bytes), "rb") as wf:
            sr = wf.getframerate()
            n_channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            raw = wf.readframes(wf.getnframes())

        # A truncated response can end mid-frame; keep only whole frames so the
        # buffer/reshape calls below do not reject the entire clip.
        frame_bytes = sample_width * n_channels
        raw = raw[: len(raw) - len(raw) % frame_bytes]

        if sample_width == 1:
            # 8-bit WAV PCM is unsigned, centred on 128.
            samples = (np.frombuffer(raw, dtype=np.uint8).astype(np.float32) - 128.0) / 128.0
        elif sample_width == 2:
            # wave.readframes returns multi-byte samples in native byte order.
            samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
        elif sample_width == 3:
            # NumPy has no 24-bit integer type: assemble each sample from its bytes.
            triplets = np.frombuffer(raw, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
            if not np.little_endian:
                # Put the least significant byte first regardless of host order.
                triplets = triplets[:, ::-1]
            values = triplets[:, 0] | (triplets[:, 1] << 8) | (triplets[:, 2] << 16)
            # Sign-extend from 24 bits.
            values = np.where(values >= 0x800000, values - 0x1000000, values)
            samples = values.astype(np.float32) / 8388608.0
        elif sample_width == 4:
            samples = np.frombuffer(raw, dtype=np.int32).astype(np.float32) / 2147483648.0
        else:
            raise ValueError(f"Unsupported WAV sample width: {sample_width} bytes")

        if n_channels > 1:
            # Downmix in floating point so the average is not truncated to an integer.
            samples = samples.reshape(-1, n_channels).mean(axis=1).astype(np.float32)
        return sr, samples

    def text_to_speech(self, text: str):
        """
        Convert text to speech using an external TTS backend.

        Recommended setups:
        - Use your KaniTTS Space:
          - Expose an HTTP endpoint there (e.g. /tts) that returns raw WAV bytes.
          - Set KANITTS_URL in this Space to that endpoint URL.
        - OR use a vLLM/OpenAI-compatible TTS server:
          - Point KANITTS_URL to its TTS endpoint.

        The response body is parsed as integer PCM WAV; other container
        formats are reported as an error and yield ``None``.

        Returns:
            A tuple (sample_rate, np.ndarray) suitable for Gradio's Audio component
            (mono float32 samples), or None if the text is blank, TTS is not
            configured, or the request/decoding fails.
        """
        if not text or not text.strip():
            return None
        if not self.kanitts_url:
            print("KANITTS_URL is not set; skipping TTS.")
            return None
        try:
            # Example: POST JSON; adjust if your KaniTTS/vLLM API differs.
            # For KaniTTS Space, implement a compatible /tts handler that:
            # - Accepts: { "text": "..." }
            # - Returns: audio bytes (wav) as response body.
            resp = requests.post(
                self.kanitts_url,
                json={"text": text},
                timeout=30,
            )
            resp.raise_for_status()
            audio_bytes = resp.content
            if not audio_bytes:
                print("KaniTTS/vLLM TTS returned empty audio.")
                return None
            return self._decode_wav(audio_bytes)
        except Exception as e:
            # Deliberate best-effort boundary: a TTS failure must not take down
            # the rest of the app, so report it and fall back to "no audio".
            print(f"Error during TTS request to {self.kanitts_url}: {e}")
            return None

    def speech_to_text(self, audio_filepath):
        """
        Placeholder STT.

        Options:
        - Connect to OpenAI Whisper / local STT / another Space and call it here.
        - For now, returns None so the rest of the app still works.

        Returns:
            Always None; a non-empty ``audio_filepath`` is only logged.
        """
        if not audio_filepath:
            return None
        print(f"Speech-to-text not configured. Received audio file: '{audio_filepath}'")
        return None