Spaces:
Sleeping
Sleeping
# api/tts_podcast.py
"""
Text-to-Speech and Podcast generation for Clare.
Uses the OpenAI TTS API (same OPENAI_API_KEY as chat). Safe for Hugging Face deployment.
- Max 4096 characters per TTS request; longer text is chunked.
"""
| import io | |
| import re | |
| from typing import List, Tuple, Optional | |
| from .config import client | |
# OpenAI TTS limits (see https://platform.openai.com/docs/guides/text-to-speech)

# Maximum number of input characters accepted by a single TTS request.
TTS_MAX_CHARS = 4096

# Speech model used by default; "tts-1-hd" gives higher quality but is slower.
TTS_MODEL = "tts-1"

# Voice names this module accepts; anything else falls back to DEFAULT_VOICE.
TTS_VOICES = (
    "alloy",
    "echo",
    "fable",
    "onyx",
    "nova",
    "shimmer",
)

# Voice used when the caller supplies none (or an unknown one).
DEFAULT_VOICE = "nova"
# Sentence boundary: a run of whitespace that follows ASCII or CJK terminal
# punctuation, or a newline (so blank-line paragraph breaks also split).
# The CJK characters (ideographic full stop, fullwidth "!" and "?") are written
# as escapes so the pattern survives Unicode normalization of this source file.
_SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?\u3002\uff01\uff1f\n])\s+")


def _split_oversized_segment(segment: str, max_chars: int) -> List[str]:
    """Split one over-long segment into pieces of at most ``max_chars`` characters.

    Cuts at the last whitespace inside the window when there is one and only
    falls back to a hard mid-word cut for an unbroken run longer than
    ``max_chars``. Expects ``segment`` to be stripped and ``max_chars >= 1``.
    """
    pieces: List[str] = []
    rest = segment
    while len(rest) > max_chars:
        # Scan backwards from the limit for a whitespace character to cut on.
        cut = max_chars
        while cut > 0 and not rest[cut].isspace():
            cut -= 1
        if cut == 0:
            # No whitespace inside the window: hard cut at the limit.
            cut = max_chars
        piece = rest[:cut].rstrip()
        if piece:
            pieces.append(piece)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _chunk_text_for_tts(text: str, max_chars: int = TTS_MAX_CHARS - 100) -> List[str]:
    """Split text into chunks of at most ``max_chars`` characters.

    Sentences are packed greedily into each chunk and joined with a single
    space. A sentence that is itself longer than ``max_chars`` is split at
    whitespace where possible (hard cut only for an unbroken run), and its
    last piece stays open so following sentences can still be packed onto it.

    Args:
        text: Text to split. ``None`` or blank text yields an empty list.
        max_chars: Upper bound on the length of every returned chunk.

    Returns:
        The non-empty chunks, in reading order.

    Raises:
        ValueError: If ``text`` is non-blank and ``max_chars`` is less than 1
            (previously this raised an opaque ``range()`` error for 0 and
            silently dropped all text for negative values).
    """
    text = (text or "").strip()
    if not text:
        return []
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if len(text) <= max_chars:
        return [text]

    chunks: List[str] = []
    current = ""
    for part in _SENTENCE_BOUNDARY_RE.split(text):
        part = part.strip()
        if not part:
            continue
        candidate = f"{current} {part}" if current else part
        if len(candidate) <= max_chars:
            current = candidate
            continue
        # The sentence does not fit: close the chunk being built.
        if current:
            chunks.append(current)
        if len(part) > max_chars:
            # Keep the tail open so later sentences can be packed onto it
            # instead of producing an unnecessarily short chunk.
            *complete, current = _split_oversized_segment(part, max_chars)
            chunks.extend(complete)
        else:
            current = part
    if current:
        chunks.append(current)
    return chunks
def text_to_speech(
    text: str,
    voice: str = DEFAULT_VOICE,
    model: str = TTS_MODEL,
) -> bytes:
    """
    Synthesize speech for ``text`` through the OpenAI TTS endpoint.

    The text is split with ``_chunk_text_for_tts``; one request is issued per
    chunk and the returned audio payloads are concatenated byte-for-byte in
    order (the module treats the result as a single MP3 stream).

    Args:
        text: Text to speak. Blank or missing text returns ``b""`` without
            calling the API.
        voice: Voice name, matched case-insensitively against ``TTS_VOICES``;
            unknown or empty values fall back to ``DEFAULT_VOICE``.
        model: TTS model name passed through to the API unchanged.

    Returns:
        The concatenated audio bytes, or ``b""`` when there is nothing to speak.
    """
    if not (text and text.strip()):
        return b""

    chosen_voice = (voice or DEFAULT_VOICE).lower()
    if chosen_voice not in TTS_VOICES:
        chosen_voice = DEFAULT_VOICE

    # Whitespace-only chunks are skipped rather than sent to the API.
    segments = [segment for segment in _chunk_text_for_tts(text) if segment.strip()]

    return b"".join(
        client.audio.speech.create(
            model=model,
            voice=chosen_voice,
            input=segment,
        ).content
        for segment in segments
    )
def build_podcast_script_from_history(
    history: List[Tuple[str, str]],
    intro_title: str = "Clare Learning Summary",
    max_turns: int = 20,
    max_answer_chars: int = 1500,
) -> str:
    """
    Build a podcast script from chat history.

    The script is an intro line, then for each turn the user's question and
    Clare's answer (blank or missing messages are skipped), then an outro
    line. Sections are separated by blank lines.

    Args:
        history: ``(user_message, assistant_message)`` pairs; ``None`` or an
            empty list produces just the intro and outro.
        intro_title: Title spoken in the welcome line.
        max_turns: Only the first ``max_turns`` turns are used. ``None`` means
            no limit; zero or a negative value means no turns at all.
        max_answer_chars: Answers longer than this are shortened for
            listenability and suffixed with ``" ..."``. When the cut would
            land inside a word, the partial word is dropped so the speech
            engine is not asked to read a fragment.

    Returns:
        The full script as one string.

    Raises:
        ValueError: If ``max_answer_chars`` is less than 1.
    """
    if max_answer_chars < 1:
        raise ValueError("max_answer_chars must be a positive integer")

    lines: List[str] = [
        f"Welcome to {intro_title}. Here are the key points from your session with Clare."
    ]

    # Clamp so a negative limit means "no turns" instead of slicing from the end.
    turn_limit = None if max_turns is None else max(max_turns, 0)
    for user_msg, assistant_msg in (history or [])[:turn_limit]:
        question = (user_msg or "").strip()
        if question:
            lines.append(f"Question: {question}")

        answer = (assistant_msg or "").strip()
        if not answer:
            continue
        if len(answer) > max_answer_chars:
            clipped = answer[:max_answer_chars]
            cut_mid_word = (
                not clipped[-1].isspace()
                and not answer[max_answer_chars].isspace()
            )
            if cut_mid_word:
                # Drop the trailing partial word when there is an earlier
                # word to fall back to; an unbroken run is kept as a hard cut.
                head = clipped.rsplit(None, 1)
                if len(head) == 2:
                    clipped = head[0]
            answer = clipped.rstrip() + " ..."
        lines.append(f"Clare: {answer}")

    lines.append("Thanks for listening. Keep learning with Clare.")
    return "\n\n".join(lines)
def build_podcast_script_from_summary(summary_md: str, intro_title: str = "Clare Summary Podcast") -> str:
    """Build a short podcast script from an existing summary in Markdown.

    Markdown syntax that a speech engine would read aloud or stumble over is
    removed: header markers, list bullets, bold/italic asterisks, image and
    link syntax (the alt/link text is kept), and code backticks.

    Args:
        summary_md: Summary text in Markdown. ``None`` or blank text yields a
            fixed "no summary available" script.
        intro_title: Title spoken in the welcome line.

    Returns:
        The script: welcome line, cleaned summary text, closing line.
    """
    if not summary_md or not summary_md.strip():
        return f"Welcome to {intro_title}. No summary available for this session."

    text = summary_md.strip()
    # Header markers at the start of a line ("## Title" -> "Title").
    text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)
    # List bullets ("- item", "* item", "+ item"). This must run before the
    # italic rule, which would otherwise pair up "*" bullets across lines.
    text = re.sub(r"^[ \t]*[-*+][ \t]+", "", text, flags=re.MULTILINE)
    # Bold (**text** -> text).
    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
    # Italic (*text* -> text); kept to a single line so stray asterisks on
    # different lines are not treated as one emphasis span.
    text = re.sub(r"\*([^*\n]+)\*", r"\1", text)
    # Images (![alt](url) -> alt); must precede the link rule, which would
    # otherwise leave a literal "!" in front of the alt text.
    text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", text)
    # Links ([text](url) -> text).
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # Inline-code and fence backticks carry no spoken meaning.
    text = text.replace("`", "")
    return f"Welcome to {intro_title}. {text} Thanks for listening."
def generate_podcast_audio(
    script: str,
    voice: str = DEFAULT_VOICE,
    model: str = TTS_MODEL,
) -> bytes:
    """Render a podcast script to audio.

    Delegates to ``text_to_speech`` with the same ``voice`` and ``model``,
    so the script is chunked and synthesized exactly as plain text would be.

    Args:
        script: Full podcast script to speak.
        voice: Voice name forwarded to ``text_to_speech``.
        model: TTS model name forwarded to ``text_to_speech``.

    Returns:
        The audio bytes produced by ``text_to_speech``.
    """
    audio = text_to_speech(text=script, voice=voice, model=model)
    return audio