Spaces:

jblast94
/

my-voice-agent

Sleeping

App Files Files Community

my-voice-agent / audio_handler.py

jblast94

Update audio_handler.py

1f1d266 verified 4 months ago

raw

history blame contribute delete

3.15 kB

	import numpy as np
	import io
	import wave
	import os
	import requests

	class AudioHandler:
	    """Bridge between the voice agent and external speech services.

	    ``text_to_speech`` posts text to the HTTP endpoint named by the
	    ``KANITTS_URL`` environment variable and decodes the WAV response.
	    ``speech_to_text`` is a placeholder that always returns ``None``.
	    """

	    # Seconds to wait for the TTS backend before giving up on a request.
	    REQUEST_TIMEOUT = 30

	    def __init__(self):
	        # Sample rate will depend on upstream TTS; default is safe for playback.
	        self.sample_rate = 22050
	        # Optional: URL of your external TTS Space or vLLM-style OpenAI server.
	        # For your case, set KANITTS_URL in the my-voice-agent Space settings to point at:
	        # https://jblast94-KaniTTS.hf.space (or the Space's /proxy or API endpoint)
	        self.kanitts_url = os.getenv("KANITTS_URL")
	        print(f"AudioHandler initialized. KaniTTS URL: {self.kanitts_url}")

	    @staticmethod
	    def _decode_wav(audio_bytes: bytes) -> "tuple[int, np.ndarray]":
	        """Decode PCM WAV bytes into ``(sample_rate, mono float32 samples)``.

	        Supports 8-bit (unsigned), 16-, 24- and 32-bit (signed, little-endian)
	        PCM. Samples are scaled to the range [-1.0, 1.0) and multi-channel
	        audio is averaged down to a single channel.

	        Raises:
	            wave.Error / EOFError: if the bytes are not a readable WAV file.
	            ValueError: if the sample width is not 1, 2, 3 or 4 bytes.
	        """
	        with wave.open(io.BytesIO(audio_bytes), "rb") as wf:
	            sr = wf.getframerate()
	            n_channels = wf.getnchannels()
	            sample_width = wf.getsampwidth()
	            raw = wf.readframes(wf.getnframes())

	        if sample_width not in (1, 2, 3, 4):
	            raise ValueError(f"Unsupported WAV sample width: {sample_width} bytes")

	        # A truncated response can end mid-frame; drop the partial frame so the
	        # rest of the clip is still usable instead of failing the whole decode.
	        frame_size = sample_width * n_channels
	        leftover = len(raw) % frame_size
	        if leftover:
	            raw = raw[:-leftover]

	        if sample_width == 1:
	            # 8-bit WAV is unsigned with silence at 128.
	            samples = (np.frombuffer(raw, dtype=np.uint8).astype(np.float32) - 128.0) / 128.0
	        elif sample_width == 2:
	            samples = np.frombuffer(raw, dtype="<i2").astype(np.float32) / 32768.0
	        elif sample_width == 3:
	            # NumPy has no 24-bit dtype: assemble little-endian triplets by hand,
	            # then sign-extend values whose top bit (bit 23) is set.
	            triplets = np.frombuffer(raw, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
	            ints = triplets[:, 0] | (triplets[:, 1] << 8) | (triplets[:, 2] << 16)
	            ints = np.where(ints >= (1 << 23), ints - (1 << 24), ints)
	            samples = ints.astype(np.float32) / 8388608.0
	        else:
	            # Go through float64 so the scaling itself does not lose precision.
	            samples = (np.frombuffer(raw, dtype="<i4").astype(np.float64) / 2147483648.0).astype(np.float32)

	        if n_channels > 1:
	            # Frames are interleaved; average the channels in float so the
	            # downmix is not re-quantised to integers.
	            samples = samples.reshape(-1, n_channels).mean(axis=1)

	        return sr, samples.astype(np.float32, copy=False)

	    def text_to_speech(self, text: str) -> "tuple[int, np.ndarray] | None":
	        """
	        Convert text to speech using an external TTS backend.

	        Recommended setups:
	          - Use your KaniTTS Space:
	              - Expose an HTTP endpoint there (e.g. /tts) that returns raw WAV bytes.
	              - Set KANITTS_URL in this Space to that endpoint URL.
	          - OR use a vLLM/OpenAI-compatible TTS server:
	              - Point KANITTS_URL to its TTS endpoint.

	        The request is a JSON POST of ``{"text": text}``; the response body is
	        parsed as a PCM WAV file (other containers such as OGG are not decoded
	        here and result in ``None``).

	        Returns:
	          A tuple (sample_rate, np.ndarray) suitable for Gradio's Audio component,
	          where the array is mono float32, or None if the text is blank, TTS is
	          not configured, or the request/decoding fails.
	        """
	        if not text or not text.strip():
	            return None

	        if not self.kanitts_url:
	            print("KANITTS_URL is not set; skipping TTS.")
	            return None

	        # Deliberately broad: this is a best-effort boundary, and any network or
	        # decoding failure must degrade to "no audio" rather than crash the app.
	        try:
	            # Example: POST JSON; adjust if your KaniTTS/vLLM API differs.
	            # For KaniTTS Space, implement a compatible /tts handler that:
	            # - Accepts: { "text": "..." }
	            # - Returns: audio bytes (wav) as response body.
	            resp = requests.post(
	                self.kanitts_url,
	                json={"text": text},
	                timeout=self.REQUEST_TIMEOUT,
	            )
	            resp.raise_for_status()
	            audio_bytes = resp.content
	            if not audio_bytes:
	                print("KaniTTS/vLLM TTS returned empty audio.")
	                return None

	            return self._decode_wav(audio_bytes)
	        except Exception as e:
	            print(f"Error during TTS request to {self.kanitts_url}: {e}")
	            return None

	    def speech_to_text(self, audio_filepath):
	        """
	        Placeholder STT.

	        Options:
	          - Connect to OpenAI Whisper / local STT / another Space and call it here.
	          - For now, returns None so the rest of the app still works.

	        Returns:
	          Always None; a non-empty ``audio_filepath`` is only logged.
	        """
	        if not audio_filepath:
	            return None

	        print(f"Speech-to-text not configured. Received audio file: '{audio_filepath}'")
	        return None