Spaces:

claudqunwang
/

ClareCourseWare

Sleeping

App Files Files Community

ClareCourseWare / api /tts_podcast.py

claudqunwang

Add Clare product UI: run_web.sh, README, exclude hf_space from push

c8c6034 2 months ago

raw

history blame contribute delete

4.56 kB

	# api/tts_podcast.py
	"""
	Text-to-Speech and Podcast generation for Clare.
	Uses OpenAI TTS API (same OPENAI_API_KEY as chat). Safe for Hugging Face deployment.
	- Max 4096 characters per TTS request; long text is chunked.
	"""
	import io
	import re
	from typing import List, Tuple, Optional

	from .config import client

	# Limits and defaults for the OpenAI TTS endpoint
	# (see https://platform.openai.com/docs/guides/text-to-speech)
	TTS_MAX_CHARS = 4096  # per-request character cap; longer text is chunked
	TTS_MODEL = "tts-1"  # "tts-1-hd" is the higher-quality (slower) alternative
	TTS_VOICES = (
	    "alloy",
	    "echo",
	    "fable",
	    "onyx",
	    "nova",
	    "shimmer",
	)
	DEFAULT_VOICE = "nova"  # used whenever no voice (or an unknown one) is requested


	def _chunk_text_for_tts(text: str, max_chars: int = TTS_MAX_CHARS - 100) -> List[str]:
	    """
	    Split text into chunks of at most ``max_chars`` characters.

	    Chunks are built greedily from whole sentences. A sentence boundary is
	    whitespace following ``.``, ``!``, ``?`` or a newline, or the position
	    right after a CJK terminator (``。``, ``！``, ``？``), which is normally
	    not followed by any whitespace. A single sentence longer than
	    ``max_chars`` is cut at the hard limit; its last piece stays open so the
	    following sentences can still be packed into it.

	    Args:
	        text: Text to split. ``None``, empty or whitespace-only yields ``[]``.
	        max_chars: Maximum length of each returned chunk; must be >= 1.

	    Returns:
	        The chunks in reading order, each non-empty and no longer than
	        ``max_chars``.

	    Raises:
	        ValueError: If ``max_chars`` is smaller than 1.
	    """
	    # A zero step would crash range() with an unrelated message and a negative
	    # one would silently drop the text, so reject bad limits explicitly.
	    if max_chars < 1:
	        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")

	    text = (text or "").strip()
	    if not text:
	        return []
	    if len(text) <= max_chars:
	        return [text]

	    cjk_enders = ("。", "！", "？")
	    # Second alternative may match an empty string: CJK sentences are split
	    # right after the terminator even when no whitespace follows it.
	    boundary = re.compile(r"(?<=[.!?\n])\s+|(?<=[。！？])\s*")

	    chunks: List[str] = []
	    current = ""
	    for part in boundary.split(text):
	        if not part:
	            # Zero-width splits can leave empty fragments (e.g. at the very end).
	            continue

	        if current:
	            # Sentences are re-joined with one space, except after a CJK
	            # terminator where the source text had no separator either.
	            sep = "" if current.endswith(cjk_enders) else " "
	            candidate = current + sep + part
	            if len(candidate) <= max_chars:
	                current = candidate
	                continue
	            chunks.append(current)

	        if len(part) > max_chars:
	            # Oversized sentence: cut at the hard limit and keep the tail open.
	            pieces = [part[i : i + max_chars] for i in range(0, len(part), max_chars)]
	            chunks.extend(pieces[:-1])
	            current = pieces[-1]
	        else:
	            current = part

	    if current:
	        chunks.append(current)
	    return chunks


	def text_to_speech(
	    text: str,
	    voice: str = DEFAULT_VOICE,
	    model: str = TTS_MODEL,
	) -> bytes:
	    """
	    Synthesize speech for ``text`` through the OpenAI TTS client.

	    The text is split with ``_chunk_text_for_tts``; one request is issued per
	    chunk and the raw response bodies are joined byte-for-byte in order.
	    NOTE(review): this relies on the returned audio format (presumably MP3)
	    tolerating plain concatenation -- confirm with the target players.

	    Args:
	        text: Text to speak. Empty or whitespace-only input returns ``b""``.
	        voice: Voice name, matched case-insensitively against ``TTS_VOICES``;
	            a missing or unknown value falls back to ``DEFAULT_VOICE``.
	        model: TTS model name passed through to the API unchanged.

	    Returns:
	        The concatenated audio bytes, or ``b""`` when there is nothing to say.
	    """
	    # Nothing to synthesize for empty / whitespace-only input.
	    if not (text and text.strip()):
	        return b""

	    requested = (voice or DEFAULT_VOICE).lower()
	    selected_voice = requested if requested in TTS_VOICES else DEFAULT_VOICE

	    # Blank chunks are skipped so no request is made with empty input.
	    segments = [piece for piece in _chunk_text_for_tts(text) if piece.strip()]
	    audio_parts: List[bytes] = [
	        client.audio.speech.create(
	            model=model,
	            voice=selected_voice,
	            input=piece,
	        ).content
	        for piece in segments
	    ]
	    return b"".join(audio_parts)


	def build_podcast_script_from_history(
	    history: List[Tuple[str, str]],
	    intro_title: str = "Clare Learning Summary",
	    max_turns: int = 20,
	    max_answer_chars: int = 1500,
	) -> str:
	    """
	    Build a podcast script from chat history.

	    The script is an intro line, then for each turn a ``Question: ...``
	    paragraph and a ``Clare: ...`` paragraph, then a closing line. Paragraphs
	    are separated by blank lines.

	    Args:
	        history: ``(user_message, assistant_message)`` pairs; ``None`` or an
	            empty list produces only the intro and closing lines. Messages
	            that are ``None``, empty or whitespace-only are skipped.
	        intro_title: Title spoken in the welcome line.
	        max_turns: Only the first ``max_turns`` pairs of ``history`` are used.
	        max_answer_chars: Assistant answers longer than this are cut to this
	            many characters and suffixed with ``" ..."`` for listenability.

	    Returns:
	        The full script as a single string.
	    """
	    lines: List[str] = [
	        f"Welcome to {intro_title}. Here are the key points from your session with Clare."
	    ]
	    for user_msg, assistant_msg in (history or [])[:max_turns]:
	        question = (user_msg or "").strip()
	        if question:
	            lines.append(f"Question: {question}")

	        answer = (assistant_msg or "").strip()
	        if answer:
	            # Keep very long answers listenable by truncating them.
	            if len(answer) > max_answer_chars:
	                answer = answer[:max_answer_chars] + " ..."
	            lines.append(f"Clare: {answer}")
	    lines.append("Thanks for listening. Keep learning with Clare.")
	    return "\n\n".join(lines)


	def build_podcast_script_from_summary(summary_md: str, intro_title: str = "Clare Summary Podcast") -> str:
	    """
	    Build a short podcast script from an existing summary in Markdown.

	    Common Markdown markup is stripped so it is not read aloud: heading
	    markers, list bullets, ``**bold**`` / ``*italic*`` emphasis and
	    ``[text](url)`` links (only the link text is kept).

	    Args:
	        summary_md: Summary in Markdown. ``None``, empty or whitespace-only
	            input yields a fixed "no summary available" script.
	        intro_title: Title spoken in the welcome line.

	    Returns:
	        The script: welcome line, cleaned summary text, closing line.
	    """
	    if not summary_md or not summary_md.strip():
	        return f"Welcome to {intro_title}. No summary available for this session."

	    text = summary_md.strip()
	    # Heading markers at the start of a line ("## Title" -> "Title").
	    text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)
	    # List bullets ("* item", "- item", "+ item" -> "item"). Done before the
	    # emphasis rules so a leading "*" bullet is not mistaken for italics.
	    text = re.sub(r"^[ \t]*[-*+][ \t]+", "", text, flags=re.MULTILINE)
	    # Bold (**text** -> text); must run before italic, which uses single "*".
	    text = re.sub(r"\*\*([^*\n]+)\*\*", r"\1", text)
	    # Italic (*text* -> text).
	    text = re.sub(r"\*([^*\n]+)\*", r"\1", text)
	    # Links ([text](url) -> text).
	    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
	    return f"Welcome to {intro_title}. {text} Thanks for listening."


	def generate_podcast_audio(
	    script: str,
	    voice: str = DEFAULT_VOICE,
	    model: str = TTS_MODEL,
	) -> bytes:
	    """
	    Render a complete podcast script to audio.

	    Delegates to ``text_to_speech`` with the given voice and model and
	    returns its result unchanged.
	    """
	    audio = text_to_speech(script, voice=voice, model=model)
	    return audio