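# --- Repository page header captured with the file (not part of the module) ---
# Path:   ClareCourseWare / api / tts_podcast.py
# Avatar: claudqunwang's picture
# Commit: c8c6034 - Add Clare product UI: run_web.sh, README, exclude hf_space from push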
# api/tts_podcast.py
"""
Text-to-Speech and Podcast generation for Clare.
Uses OpenAI TTS API (same OPENAI_API_KEY as chat). Safe for Hugging Face deployment.
- Max 4096 characters per TTS request; long text is chunked.
"""
import io
import re
from typing import List, Tuple, Optional
from .config import client
# OpenAI TTS limits (see https://platform.openai.com/docs/guides/text-to-speech)
TTS_MAX_CHARS = 4096  # per-request input limit; longer text must be chunked
TTS_MODEL = "tts-1"  # or "tts-1-hd" for higher quality (slower)
TTS_VOICES = ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
DEFAULT_VOICE = "nova"  # used when no voice or an unknown voice is requested

# Whitespace that follows a sentence terminator (ASCII ". ! ?", the CJK
# full stop U+3002, full-width "!" U+FF01 and "?" U+FF1F) or a newline.
# Written with escapes so the pattern survives encoding round-trips, and
# compiled once at import time instead of on every call.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?\u3002\uff01\uff1f\n])\s+")


def _split_oversized_part(part: str, max_chars: int) -> List[str]:
    """Split one over-long part into pieces of at most ``max_chars`` characters.

    Each piece is cut at the last whitespace found inside the current window
    so that words are not sliced in half. A hard cut at ``max_chars`` is used
    only when the window contains no whitespace at all (e.g. one huge token or
    unspaced CJK text).
    """
    pieces: List[str] = []
    rest = part.strip()
    while len(rest) > max_chars:
        # Index ``max_chars`` is scanned too: whitespace there means the first
        # ``max_chars`` characters end exactly on a word boundary.
        cut = next(
            (i for i in range(max_chars, 0, -1) if rest[i].isspace()),
            None,
        )
        if cut is None:
            pieces.append(rest[:max_chars])
            rest = rest[max_chars:]
        else:
            pieces.append(rest[:cut].rstrip())
            rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _chunk_text_for_tts(text: str, max_chars: int = TTS_MAX_CHARS - 100) -> List[str]:
    """Split text into chunks of at most ``max_chars`` characters.

    Sentences (as delimited by ``_SENTENCE_BOUNDARY``) are packed greedily into
    chunks, joined by a single space. A sentence that is itself longer than
    ``max_chars`` is broken at word boundaries where possible; its last piece
    stays open so that following sentences can still be appended to it.

    Args:
        text: Text to split. ``None`` or blank text yields an empty list.
        max_chars: Maximum length of each chunk. The default leaves a
            100-character margin below ``TTS_MAX_CHARS``.

    Returns:
        The chunks in reading order; every chunk has ``len(chunk) <= max_chars``.

    Raises:
        ValueError: If ``max_chars`` is less than 1 and there is text to split
            (a negative value previously dropped text silently).
    """
    text = (text or "").strip()
    if not text:
        return []
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if len(text) <= max_chars:
        return [text]

    chunks: List[str] = []
    current = ""
    for part in _SENTENCE_BOUNDARY.split(text):
        if len(part) > max_chars:
            # The part can never fit next to ``current``: flush what we have
            # and break the part up on its own.
            if current:
                chunks.append(current)
            pieces = _split_oversized_part(part, max_chars)
            chunks.extend(pieces[:-1])
            # Keep the tail open so later sentences can share its chunk.
            current = pieces[-1] if pieces else ""
        elif len(current) + len(part) + 1 <= max_chars:
            # "+ 1" accounts for the joining space.
            current = f"{current} {part}".strip() if current else part
        else:
            if current:
                chunks.append(current)
            current = part
    if current:
        chunks.append(current)
    return chunks
def text_to_speech(
    text: str,
    voice: str = DEFAULT_VOICE,
    model: str = TTS_MODEL,
) -> bytes:
    """
    Synthesize speech for ``text`` through the OpenAI TTS endpoint.

    The text is split with ``_chunk_text_for_tts`` and one request is sent per
    non-blank chunk; the audio payloads are concatenated byte-wise in order.
    No response format is requested explicitly, so the API default applies
    (MP3 according to the module documentation; plain concatenation relies on
    that format tolerating it).

    Args:
        text: Text to speak. ``None`` or blank text returns ``b""`` without
            calling the API.
        voice: Voice name, matched case-insensitively against ``TTS_VOICES``.
            A missing or unknown voice falls back to ``DEFAULT_VOICE``.
        model: TTS model identifier passed through to the API unchanged.

    Returns:
        The concatenated audio bytes, or ``b""`` when there is nothing to speak.
    """
    if not (text and text.strip()):
        return b""

    requested = (voice or DEFAULT_VOICE).lower()
    voice_name = requested if requested in TTS_VOICES else DEFAULT_VOICE

    segments = _chunk_text_for_tts(text)
    if not segments:
        return b""

    # One request per chunk, in reading order; blank chunks are skipped.
    audio_parts: List[bytes] = [
        client.audio.speech.create(
            model=model,
            voice=voice_name,
            input=segment,
        ).content
        for segment in segments
        if segment.strip()
    ]
    return b"".join(audio_parts)
def _truncate_for_speech(message: str, limit: Optional[int]) -> str:
    """Shorten ``message`` to about ``limit`` characters, ending with " ...".

    When the cut would land in the middle of a word, it is moved back to the
    previous whitespace so the narrator is not handed a word fragment. Text
    without any whitespace in range is cut at ``limit`` exactly.
    """
    if limit is None or len(message) <= limit:
        return message
    limit = max(limit, 0)
    head = message[:limit]
    # ``message[limit]`` exists because len(message) > limit at this point.
    if head and not head[-1].isspace() and not message[limit].isspace():
        words = head.rsplit(None, 1)
        if len(words) == 2:
            head = words[0]
    return head.rstrip() + " ..."


def build_podcast_script_from_history(
    history: List[Tuple[str, str]],
    intro_title: str = "Clare Learning Summary",
    max_turns: int = 20,
    max_answer_chars: Optional[int] = 1500,
) -> str:
    """
    Build a podcast script from chat history.

    The script is an intro line, then for each turn a "Question: ..." line and
    a "Clare: ..." line (either is skipped when empty or blank), then an outro
    line. Sections are separated by blank lines.

    Args:
        history: Sequence of ``(user_message, assistant_message)`` pairs;
            ``None`` or empty produces only the intro and outro.
        intro_title: Title spoken in the welcome sentence.
        max_turns: Number of turns, counted from the start of ``history``, to
            include. Negative values are treated as 0; ``None`` means no limit.
        max_answer_chars: Assistant answers longer than this are shortened at
            a word boundary and suffixed with " ..." for listenability.
            ``None`` disables truncation.

    Returns:
        The full script as a single string.
    """
    lines: List[str] = [
        f"Welcome to {intro_title}. Here are the key points from your session with Clare."
    ]
    # Clamp so a negative value cannot turn into an "all but the last N" slice.
    turn_limit = None if max_turns is None else max(max_turns, 0)
    for user_msg, assistant_msg in (history or [])[:turn_limit]:
        question = (user_msg or "").strip()
        if question:
            lines.append(f"Question: {question}")
        answer = (assistant_msg or "").strip()
        if answer:
            lines.append(f"Clare: {_truncate_for_speech(answer, max_answer_chars)}")
    lines.append("Thanks for listening. Keep learning with Clare.")
    return "\n\n".join(lines)
def build_podcast_script_from_summary(summary_md: str, intro_title: str = "Clare Summary Podcast") -> str:
    """Build a short podcast script from an existing summary in markdown.

    Common markdown syntax is removed so the narrator does not read markup
    aloud: heading markers, list bullets, bold/italic asterisks, image and
    link wrappers (the alt/link text is kept, the URL dropped) and backticks.

    Args:
        summary_md: Summary text in markdown. ``None`` or blank text produces
            a fixed "no summary available" script.
        intro_title: Title spoken in the welcome sentence.

    Returns:
        The script: welcome sentence, cleaned summary text, closing sentence.
    """
    if not summary_md or not summary_md.strip():
        return f"Welcome to {intro_title}. No summary available for this session."

    text = summary_md.strip()
    # Heading markers at the start of a line ("## Title" -> "Title").
    text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)
    # List bullets ("- item", "* item", "+ item", optionally indented). This
    # must run before emphasis handling: otherwise the "*" bullets of adjacent
    # lines are paired up as italic delimiters and an odd one is left behind.
    text = re.sub(r"^[ \t]*[-*+][ \t]+", "", text, flags=re.MULTILINE)
    # Bold (**text** -> text), then italic (*text* -> text).
    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
    text = re.sub(r"\*([^*]+)\*", r"\1", text)
    # Images (![alt](url) -> alt) before links so no stray "!" remains.
    text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", text)
    # Links ([text](url) -> text).
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # Inline-code / fence backticks carry no spoken meaning.
    text = text.replace("`", "")
    return f"Welcome to {intro_title}. {text} Thanks for listening."
def generate_podcast_audio(
    script: str,
    voice: str = DEFAULT_VOICE,
    model: str = TTS_MODEL,
) -> bytes:
    """Render a complete podcast script to audio bytes.

    Delegates to ``text_to_speech``, which handles chunking of long scripts,
    passing ``voice`` and ``model`` through unchanged.
    """
    synthesis_options = {"voice": voice, "model": model}
    return text_to_speech(script, **synthesis_options)