Spaces:

robiul487
/

NCAkit

Sleeping

NCAkit/modules/video_creator/services/libraries/tts_client.py

feat: Major optimizations - Quiz dynamic fonts, TTS 1.2x speed, Video stream copy (10x faster), Single API call, Fact Image dynamic fonts, Text Story position fix

ee36c8e 4 months ago

raw

history blame contribute delete

4.15 kB

	import aiohttp
	import struct
	import logging
	from typing import Tuple

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	class TTSClient:
	    """Client for Kokoro TTS via Hugging Face Cloud API

	    Posts text to the API's ``/v1/audio/speech`` endpoint and returns the
	    response bytes together with a duration derived from the WAV header.
	    """

	    # Layout assumed when the WAV header is missing or unusable:
	    # a 44-byte header followed by 24 kHz, 16-bit, mono PCM.
	    _WAV_HEADER_SIZE = 44
	    _FALLBACK_SAMPLE_RATE = 24000
	    _FALLBACK_FRAME_SIZE = 2

	    # Voice identifiers returned by list_available_voices().
	    _VOICES = (
	        "af_heart", "af_alloy", "af_aoede", "af_bella", "af_jessica",
	        "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky",
	        "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam",
	        "am_michael", "am_onyx", "am_puck", "am_santa",
	        "bf_emma", "bf_isabella", "bm_george", "bm_lewis",
	        "bf_alice", "bf_lily", "bm_daniel", "bm_fable",
	    )

	    def __init__(self, api_url: str):
	        """
	        Initialize TTS client

	        Args:
	            api_url: Base URL for the TTS API (HF_TTS environment variable).
	                Trailing slashes are stripped so endpoint paths can be
	                appended directly.
	        """
	        self.api_url = api_url.rstrip('/')
	        logger.info(f"Using cloud TTS API at {self.api_url}")

	    async def generate(self, text: str, voice: str, speed: float = 1.0) -> Tuple[bytes, float]:
	        """
	        Generate speech from text

	        Args:
	            text: Text to convert to speech
	            voice: Voice identifier (e.g., 'af_heart', 'am_adam')
	            speed: Speech speed multiplier, sent to the API as-is
	                (expected 0.5-2.0, not validated here; default 1.0)

	        Returns:
	            Tuple of (audio_bytes, duration_seconds)

	        Raises:
	            RuntimeError: If the API answers with a non-200 status. The
	                message carries the status code and the response body.
	                Network and timeout errors raised by aiohttp are not
	                caught here and propagate unchanged.
	        """
	        endpoint = f"{self.api_url}/v1/audio/speech"

	        logger.debug(f"Generating audio with voice={voice}, speed={speed}, text_length={len(text)}")

	        payload = {
	            "model": "kokoro",
	            "input": text,
	            "voice": voice,
	            "speed": speed,
	        }

	        async with aiohttp.ClientSession() as session:
	            async with session.post(
	                endpoint,
	                json=payload,
	                headers={"Content-Type": "application/json"},
	                # Overall budget for the whole request, in seconds.
	                timeout=aiohttp.ClientTimeout(total=120),
	            ) as response:
	                if response.status != 200:
	                    error_text = await response.text()
	                    raise RuntimeError(f"TTS API error ({response.status}): {error_text}")

	                audio_data = await response.read()

	        # The body is fully read at this point, so the duration can be
	        # computed after the connection has been released.
	        duration = self._estimate_audio_duration(audio_data)

	        logger.debug(f"Generated audio: {len(audio_data)} bytes, {duration:.2f}s")
	        return audio_data, duration

	    def _estimate_audio_duration(self, audio_buffer: bytes) -> float:
	        """
	        Estimate audio duration in seconds from a WAV buffer

	        The RIFF chunks are walked to locate ``fmt `` and ``data`` rather
	        than assuming a fixed 44-byte header, so files carrying extra
	        chunks before the audio data are measured correctly.

	        When the buffer is not RIFF, is shorter than a minimal header, or
	        cannot be parsed, the duration is estimated from the buffer size
	        assuming a 44-byte header followed by 24 kHz, 16-bit, mono PCM.

	        Args:
	            audio_buffer: Raw bytes of the audio response

	        Returns:
	            Duration in seconds; never negative.
	        """
	        # Size-based guess. Clamped at zero so a buffer shorter than the
	        # header yields 0.0 instead of a negative duration.
	        payload_bytes = max(len(audio_buffer) - self._WAV_HEADER_SIZE, 0)
	        fallback = payload_bytes / (self._FALLBACK_FRAME_SIZE * self._FALLBACK_SAMPLE_RATE)

	        if len(audio_buffer) < self._WAV_HEADER_SIZE:
	            return fallback

	        # Check if it's a valid WAV file (starts with 'RIFF')
	        if audio_buffer[:4] != b'RIFF':
	            return fallback

	        try:
	            return self._parse_wav_duration(audio_buffer)
	        except (ValueError, struct.error) as e:
	            logger.warning(f"Failed to parse WAV header: {e}, using fallback")
	            return fallback

	    @staticmethod
	    def _parse_wav_duration(audio_buffer: bytes) -> float:
	        """
	        Compute the duration of a RIFF/WAVE buffer from its chunks

	        Args:
	            audio_buffer: Buffer that starts with a RIFF header

	        Returns:
	            Duration in seconds of the first ``data`` chunk.

	        Raises:
	            ValueError: If the buffer is not WAVE, the ``fmt `` chunk is
	                truncated or describes a zero frame size / sample rate,
	                or no ``data`` chunk is present.
	        """
	        if audio_buffer[8:12] != b'WAVE':
	            raise ValueError("RIFF container is not WAVE")

	        total = len(audio_buffer)
	        frame_size = 0
	        sample_rate = 0
	        offset = 12  # first chunk follows 'RIFF' + size + 'WAVE'

	        while offset + 8 <= total:
	            chunk_id, chunk_size = struct.unpack_from('<4sI', audio_buffer, offset)
	            body_start = offset + 8

	            if chunk_id == b'fmt ':
	                if chunk_size < 16 or body_start + 16 > total:
	                    raise ValueError("truncated fmt chunk")
	                # format tag, channels, sample rate, byte rate,
	                # block align, bits per sample
	                _, num_channels, sample_rate, _, _, bits_per_sample = struct.unpack_from(
	                    '<HHIIHH', audio_buffer, body_start
	                )
	                frame_size = (bits_per_sample // 8) * num_channels
	            elif chunk_id == b'data':
	                if frame_size <= 0 or sample_rate <= 0:
	                    raise ValueError("data chunk without a usable fmt chunk")
	                available = total - body_start
	                # Streamed responses may carry a placeholder size
	                # (0 or 0xFFFFFFFF) because the length was unknown when
	                # the header was written; never trust more bytes than
	                # are actually present.
	                data_size = min(chunk_size, available) if chunk_size else available
	                return data_size / frame_size / sample_rate

	            # Chunks are word-aligned: odd sizes are followed by a pad byte.
	            offset = body_start + chunk_size + (chunk_size & 1)

	        raise ValueError("no data chunk found")

	    @staticmethod
	    def list_available_voices() -> list:
	        """Return list of available TTS voices

	        A new list is returned on every call, so callers may modify it.
	        """
	        return list(TTSClient._VOICES)