"""
cleaner.py
Reformats raw YouTube transcript text into clean, readable paragraphs.

Author: algorembrant
"""

from __future__ import annotations

from config import DEFAULT_MODEL, MAX_TOKENS
from ai_client import complete_long

# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------
# NOTE(review): the line breaks inside the two triple-quoted prompts below were
# reconstructed from a whitespace-collapsed copy of this file. The wording is
# unchanged; confirm the exact layout against the upstream copy.

# System prompt for the cleaning pass (passed to complete_long as `system`).
_CLEAN_SYSTEM = """You are a professional transcript editor. Your task is to reformat raw, fragmented
YouTube transcript text into clean, readable paragraphs that preserve the speaker's words
and intent exactly.

Rules:
- Do NOT paraphrase, summarize, or omit any content.
- Fix only punctuation, capitalization, and paragraph breaks.
- Group related sentences into coherent paragraphs of 3-6 sentences each.
- Remove filler words only when they impede readability (e.g. repeated "um", "uh", "like").
- Remove duplicate lines caused by auto-captioning overlap.
- Preserve proper nouns, technical terms, and speaker style.
- Output clean, flowing prose — no bullet points, no headers, no markdown.
- Do not add any commentary, preamble, or notes of your own.
"""

# Instruction text passed to complete_long as `user_prefix`; it ends with the
# "RAW TRANSCRIPT:" label.
_CLEAN_USER_PREFIX = (
    "Reformat the following raw YouTube transcript into clean, readable paragraphs. "
    "Preserve all content. Fix punctuation and capitalization only.\n\n"
    "RAW TRANSCRIPT:"
)

# System prompt passed to complete_long as `merge_system`.
# Presumably used when a long transcript is processed in several sections and
# the cleaned sections must be joined — verify against ai_client.complete_long.
_CLEAN_MERGE_SYSTEM = """You are a professional transcript editor. You will receive several already-cleaned
transcript sections. Merge them into a single, seamless, well-paragraphed document.
Do not summarize or omit any content. Output clean flowing prose only.
"""


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def clean(
    raw_text: str,
    model: str = DEFAULT_MODEL,
    max_tokens: int = MAX_TOKENS,
    stream: bool = True,
) -> str:
    """
    Reformat a raw transcript into clean paragraphs.

    Args:
        raw_text:   Plain-text transcript (output of
                    fetcher.TranscriptResult.plain_text).
        model:      Anthropic model to use.
        max_tokens: Max output tokens per API call.
        stream:     Whether to stream progress tokens to stderr.

    Returns:
        Cleaned, paragraph-formatted transcript as a string.

    Raises:
        ValueError: If ``raw_text`` is empty (or otherwise falsy) or contains
            only whitespace.
    """
    # Strip once and reuse the result for both the emptiness check and the
    # request, instead of calling .strip() twice.
    transcript = raw_text.strip() if raw_text else ""
    if not transcript:
        raise ValueError("Cannot clean an empty transcript.")

    return complete_long(
        system=_CLEAN_SYSTEM,
        user_prefix=_CLEAN_USER_PREFIX,
        text=transcript,
        model=model,
        max_tokens=max_tokens,
        merge_system=_CLEAN_MERGE_SYSTEM,
        stream=stream,
    )