| | """
|
| | cleaner.py
|
| | Reformats raw YouTube transcript text into clean, readable paragraphs.
|
| | Author: algorembrant
|
| | """
|
| |
|
| | from __future__ import annotations
|
| |
|
| | from config import DEFAULT_MODEL, MAX_TOKENS
|
| | from ai_client import complete_long
|
| |
|
| |
|
| |
|
| |
|
| |
|
# System prompt for the cleaning pass; clean() passes it to complete_long()
# as `system`. It restricts the model to fixing punctuation, capitalization,
# and paragraph breaks, and forbids summarizing, commentary, and markdown.
_CLEAN_SYSTEM = """You are a professional transcript editor.
Your task is to reformat raw, fragmented YouTube transcript text into clean,
readable paragraphs that preserve the speaker's words and intent exactly.

Rules:
- Do NOT paraphrase, summarize, or omit any content.
- Fix only punctuation, capitalization, and paragraph breaks.
- Group related sentences into coherent paragraphs of 3-6 sentences each.
- Remove filler words only when they impede readability (e.g. repeated "um", "uh", "like").
- Remove duplicate lines caused by auto-captioning overlap.
- Preserve proper nouns, technical terms, and speaker style.
- Output clean, flowing prose — no bullet points, no headers, no markdown.
- Do not add any commentary, preamble, or notes of your own.
"""
|
| |
|
# User-message lead-in; clean() passes it to complete_long() as `user_prefix`.
# The transcript itself is supplied separately via `text` — presumably
# complete_long() appends it after this prefix (confirm in ai_client).
_CLEAN_USER_PREFIX = (
    "Reformat the following raw YouTube transcript into clean, readable paragraphs. "
    "Preserve all content. Fix punctuation and capitalization only.\n\n"
    "RAW TRANSCRIPT:"
)
|
| |
|
# System prompt for the merge pass; clean() passes it to complete_long() as
# `merge_system`. Presumably used when a long transcript is processed in
# several sections whose cleaned outputs must be joined — confirm in ai_client.
_CLEAN_MERGE_SYSTEM = """You are a professional transcript editor.
You will receive several already-cleaned transcript sections.
Merge them into a single, seamless, well-paragraphed document.
Do not summarize or omit any content. Output clean flowing prose only.
"""
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
def clean(
    raw_text: str,
    model: str = DEFAULT_MODEL,
    max_tokens: int = MAX_TOKENS,
    stream: bool = True,
) -> str:
    """
    Turn a raw transcript into clean, paragraph-formatted text.

    The input is validated, stripped of surrounding whitespace, and handed to
    ``complete_long`` together with this module's cleaning and merging prompts.

    Args:
        raw_text:   Plain-text transcript to clean.
        model:      Model identifier, forwarded to ``complete_long``.
        max_tokens: Output-token limit, forwarded to ``complete_long``.
        stream:     Streaming flag, forwarded to ``complete_long``.

    Returns:
        The value returned by ``complete_long`` (the cleaned transcript).

    Raises:
        ValueError: If ``raw_text`` is empty or contains only whitespace.
    """
    # Strip once up front so the emptiness check and the API call share a value.
    transcript = raw_text.strip() if raw_text else ""
    if not transcript:
        raise ValueError("Cannot clean an empty transcript.")

    return complete_long(
        text=transcript,
        system=_CLEAN_SYSTEM,
        merge_system=_CLEAN_MERGE_SYSTEM,
        user_prefix=_CLEAN_USER_PREFIX,
        model=model,
        max_tokens=max_tokens,
        stream=stream,
    )