# ismdrobiul489's picture
# feat: Major optimizations - Quiz dynamic fonts, TTS 1.2x speed, Video stream copy (10x faster), Single API call, Fact Image dynamic fonts, Text Story position fix
# ee36c8e
import aiohttp
import struct
import logging
from typing import Tuple
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class TTSClient:
    """Client for Kokoro TTS via Hugging Face Cloud API."""

    # Assumptions used when the response cannot be parsed as a WAV file:
    # a 44-byte header followed by 24 kHz, 16-bit, mono PCM.
    _WAV_HEADER_SIZE = 44
    _FALLBACK_SAMPLE_RATE = 24000
    _FALLBACK_BYTES_PER_FRAME = 2

    def __init__(self, api_url: str):
        """
        Initialize TTS client.

        Args:
            api_url: Base URL for the TTS API (HF_TTS environment variable).
                Any trailing slash is stripped.
        """
        self.api_url = api_url.rstrip('/')
        logger.info("Using cloud TTS API at %s", self.api_url)

    async def generate(self, text: str, voice: str, speed: float = 1.0) -> Tuple[bytes, float]:
        """
        Generate speech from text.

        Args:
            text: Text to convert to speech.
            voice: Voice identifier (e.g., 'af_heart', 'am_adam').
            speed: Speech speed multiplier (0.5-2.0, default 1.0). The value
                is forwarded to the API as-is; it is not validated here.

        Returns:
            Tuple of (audio_bytes, duration_seconds).

        Raises:
            RuntimeError: If the API answers with a non-200 status. The
                message contains the status code and the response body.
        """
        endpoint = f"{self.api_url}/v1/audio/speech"
        payload = {
            "model": "kokoro",
            "input": text,
            "voice": voice,
            "speed": speed,
        }
        logger.debug(
            "Generating audio with voice=%s, speed=%s, text_length=%d",
            voice, speed, len(text),
        )

        async with aiohttp.ClientSession() as session:
            async with session.post(
                endpoint,
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=aiohttp.ClientTimeout(total=120),
            ) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise RuntimeError(f"TTS API error ({response.status}): {error_text}")
                audio_data = await response.read()

        # The body is fully read at this point, so the duration can be
        # computed after the connection has been released.
        duration = self._estimate_audio_duration(audio_data)
        logger.debug("Generated audio: %d bytes, %.2fs", len(audio_data), duration)
        return audio_data, duration

    def _estimate_audio_duration(self, audio_buffer: bytes) -> float:
        """
        Estimate audio duration in seconds from a WAV buffer.

        The RIFF chunks are walked to locate the 'fmt ' and 'data' chunks, so
        headers carrying extra chunks are handled. Buffers that are too short,
        do not start with 'RIFF', or cannot be parsed fall back to a size-based
        estimate (24 kHz, 16-bit, mono after a 44-byte header).

        Args:
            audio_buffer: Raw audio bytes as returned by the API.

        Returns:
            Estimated duration in seconds; never negative.
        """
        if len(audio_buffer) < self._WAV_HEADER_SIZE or audio_buffer[:4] != b'RIFF':
            return self._fallback_duration(audio_buffer)

        try:
            return self._parse_wav_duration(audio_buffer)
        except (struct.error, ValueError) as e:
            logger.warning("Failed to parse WAV header: %s, using fallback", e)
            return self._fallback_duration(audio_buffer)

    @classmethod
    def _fallback_duration(cls, audio_buffer: bytes) -> float:
        """Size-based duration estimate assuming 24 kHz, 16-bit, mono PCM."""
        # Clamp at zero so a buffer shorter than the header never yields a
        # negative duration.
        payload_bytes = max(0, len(audio_buffer) - cls._WAV_HEADER_SIZE)
        return payload_bytes / (cls._FALLBACK_BYTES_PER_FRAME * cls._FALLBACK_SAMPLE_RATE)

    @staticmethod
    def _parse_wav_duration(audio_buffer: bytes) -> float:
        """
        Compute the duration from the 'fmt ' and 'data' chunks of a RIFF buffer.

        Raises:
            ValueError: If the chunks are missing, out of order, or describe
                a zero frame size or sample rate.
            struct.error: If a chunk is truncated.
        """
        total = len(audio_buffer)
        offset = 12  # skip 'RIFF', the RIFF size field and the form type
        frame_size = 0
        sample_rate = 0
        fmt_seen = False

        while offset + 8 <= total:
            chunk_id, chunk_size = struct.unpack_from('<4sI', audio_buffer, offset)
            body_start = offset + 8

            if chunk_id == b'fmt ':
                # audio format, channels, sample rate, byte rate,
                # block align, bits per sample
                _, num_channels, sample_rate, _, _, bits_per_sample = struct.unpack_from(
                    '<HHIIHH', audio_buffer, body_start
                )
                frame_size = (bits_per_sample // 8) * num_channels
                fmt_seen = True
            elif chunk_id == b'data':
                if not fmt_seen:
                    raise ValueError("'data' chunk found before 'fmt ' chunk")
                if frame_size == 0 or sample_rate == 0:
                    raise ValueError("zero frame size or sample rate in 'fmt ' chunk")
                # The declared size may be a placeholder (e.g. 0xFFFFFFFF) or
                # exceed a truncated body; never count bytes that are absent.
                data_size = min(chunk_size, total - body_start)
                return (data_size / frame_size) / sample_rate

            # Chunk bodies are padded to an even number of bytes.
            offset = body_start + chunk_size + (chunk_size & 1)

        raise ValueError("no 'data' chunk found")

    @staticmethod
    def list_available_voices() -> list:
        """Return list of available TTS voices."""
        return [
            "af_heart", "af_alloy", "af_aoede", "af_bella", "af_jessica",
            "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky",
            "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam",
            "am_michael", "am_onyx", "am_puck", "am_santa",
            "bf_emma", "bf_isabella", "bm_george", "bm_lewis",
            "bf_alice", "bf_lily", "bm_daniel", "bm_fable",
        ]