GitHub Actions
Deploy from GitHub: 0bf18943d192a2812c57599f6c25bf9739d523bf
1c7725b
"""PDF text extraction and cleaning for TTS processing."""
from __future__ import annotations
import io
import re
from collections import Counter
from dataclasses import dataclass
from pdfminer.high_level import extract_pages
from pdfminer.layout import (
LAParams,
LTAnno,
LTChar,
LTPage,
LTTextBoxHorizontal,
LTTextLineHorizontal,
)
@dataclass
class TextBlock:
"""A block of text with positional metadata."""
text: str
y_ratio: float # 0.0 = bottom, 1.0 = top
font_size: float
page_num: int
x0: float = 0.0 # Left edge position for table detection
x1: float = 0.0 # Right edge position for table detection
def _is_caption(text: str) -> bool:
"""Check if text is a figure/table caption.
Captions typically start with:
- "Figure 1:", "Fig. 2:", "Figure 1."
- "Table 1:", "Table 2."
- "Exhibit A:", "Chart 1:"
- "Source:", "Note:", "Notes:"
Args:
text: Text to check.
Returns:
True if text appears to be a caption.
"""
text = text.strip()
if not text:
return False
# Common caption patterns (case-insensitive start)
caption_patterns = [
r"^fig(?:ure)?\.?\s*\d",
r"^table\.?\s*\d",
r"^exhibit\.?\s*[a-z0-9]",
r"^chart\.?\s*\d",
r"^graph\.?\s*\d",
r"^diagram\.?\s*\d",
r"^plate\.?\s*\d",
r"^scheme\.?\s*\d",
r"^box\.?\s*\d",
r"^panel\.?\s*[a-z0-9]",
r"^appendix\.?\s*[a-z0-9]",
r"^source\s*:",
r"^sources\s*:",
r"^note\s*:",
r"^notes\s*:",
r"^data\s*:",
r"^\*\s*p\s*[<>=]", # Statistical notes like "* p < 0.05"
r"^legend\s*:",
]
text_lower = text.lower()
for pattern in caption_patterns:
if re.match(pattern, text_lower):
return True
return False
def _is_table_like_text(text: str) -> bool:
"""Check if text looks like table content.
Tables often have:
- Very short text fragments
- Mostly numbers or single words
- Lots of whitespace-separated values
- Column headers or row labels
- Short phrases without sentence structure
Args:
text: Text to check.
Returns:
True if the text appears to be table content.
"""
text = text.strip()
# Very short fragments are likely table cells
if len(text) < 5:
return True
# Count numbers vs letters
digits = sum(1 for c in text if c.isdigit())
letters = sum(1 for c in text if c.isalpha())
# Mostly numbers with few letters (like "123.45" or "2024")
if digits > 0 and letters < 3 and digits >= letters:
return True
# Check for patterns common in tables
# Multiple tab-separated or heavily spaced values
if "\t" in text or " " in text:
parts = re.split(r"\s{2,}|\t", text)
if len(parts) >= 3:
# Multiple short parts suggests table row
short_parts = sum(1 for p in parts if len(p.strip()) < 15)
if short_parts >= len(parts) * 0.6:
return True
# Single words that look like column headers
words = text.split()
if len(words) == 1 and len(text) < 20:
# Common table headers/labels
table_keywords = {
"total",
"sum",
"avg",
"average",
"mean",
"count",
"min",
"max",
"date",
"time",
"year",
"month",
"day",
"name",
"id",
"no",
"no.",
"value",
"amount",
"price",
"cost",
"qty",
"quantity",
"unit",
"row",
"column",
"col",
"item",
"description",
"desc",
"note",
"status",
"type",
"category",
"code",
"ref",
"reference",
}
if text.lower() in table_keywords:
return True
# Short phrases without sentence structure (likely table cells)
# Table cells typically:
# - Are short (< 50 chars)
# - Don't end with sentence-ending punctuation
# - Don't start with lowercase (unless very short)
# - Have few words (< 8)
if len(text) < 50 and len(words) < 8:
# Doesn't end like a sentence
if not text.rstrip().endswith((".", "!", "?", ":")):
# Common table cell patterns
text_lower = text.lower()
# Technical/status phrases common in tables
table_phrases = [
"supported",
"not supported",
"yes",
"no",
"n/a",
"none",
"required",
"optional",
"enabled",
"disabled",
"active",
"inactive",
"read-only",
"read only",
"write",
"read/write",
"read-write",
"must be",
"can be",
"should be",
"will be",
"available",
"unavailable",
"pending",
"completed",
"failed",
"true",
"false",
"default",
"custom",
"manual",
"automatic",
"identical",
"different",
"same",
"other",
]
for phrase in table_phrases:
if phrase in text_lower:
return True
# Looks like a label or header (Title Case or ALL CAPS, short)
if len(words) <= 4 and len(text) < 40:
# Check if it's Title Case or contains common label patterns
if text.istitle() or text.isupper():
return True
# Two-three word phrases that look like labels
if len(words) in (2, 3) and all(w[0].isupper() for w in words if w):
return True
return False
def _filter_table_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
"""Filter out blocks that appear to be part of tables.
Detects tables by looking for:
- Multiple blocks at similar Y positions (table rows)
- Blocks with table-like content
Args:
blocks: List of text blocks.
Returns:
Filtered list with table content removed.
"""
if not blocks:
return blocks
# Group blocks by page and approximate Y position (row detection)
# Blocks within 1% of page height are considered same row
filtered = []
for page_num in set(b.page_num for b in blocks):
page_blocks = [b for b in blocks if b.page_num == page_num]
# Group by Y position (rounded to detect rows)
y_groups: dict[float, list[TextBlock]] = {}
for block in page_blocks:
y_key = round(block.y_ratio, 2) # Group within ~1% of page
if y_key not in y_groups:
y_groups[y_key] = []
y_groups[y_key].append(block)
for y_key, row_blocks in y_groups.items():
# If many blocks at same Y position, likely a table row
if len(row_blocks) >= 3:
# Check if most blocks look like table cells
table_like = sum(1 for b in row_blocks if _is_table_like_text(b.text))
if table_like >= len(row_blocks) * 0.5:
# Skip this entire row - it's a table
continue
# Filter individual blocks that look like table content
for block in row_blocks:
if not _is_table_like_text(block.text):
filtered.append(block)
# Sort by page and position (top to bottom)
filtered.sort(key=lambda b: (b.page_num, -b.y_ratio))
return filtered
def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
"""Extract text blocks from PDF with positional information.
Args:
pdf_bytes: Raw PDF file content.
Returns:
List of TextBlock objects with text and metadata.
"""
blocks: list[TextBlock] = []
pdf_file = io.BytesIO(pdf_bytes)
laparams = LAParams(
line_margin=0.5,
word_margin=0.1,
char_margin=2.0,
boxes_flow=0.5,
)
for page_num, page_layout in enumerate(extract_pages(pdf_file, laparams=laparams), start=1):
if not isinstance(page_layout, LTPage):
continue
page_height = page_layout.height
for element in page_layout:
if not isinstance(element, LTTextBoxHorizontal):
continue
# Extract characters with their font sizes
# LTChar has font size, LTAnno is whitespace (use size=-1 to always keep)
chars_with_sizes: list[tuple[str, float]] = []
for line in element:
if isinstance(line, LTTextLineHorizontal):
for char in line:
if isinstance(char, LTChar):
chars_with_sizes.append((char.get_text(), char.size))
elif isinstance(char, LTAnno):
# Whitespace/newlines - always keep (use -1 as marker)
chars_with_sizes.append((char.get_text(), -1))
if not chars_with_sizes:
text = element.get_text().strip()
if text:
blocks.append(
TextBlock(
text=text,
y_ratio=element.y0 / page_height if page_height > 0 else 0.5,
font_size=10.0,
page_num=page_num,
)
)
continue
# Find dominant font size (most common, excluding whitespace markers)
font_sizes = [size for _, size in chars_with_sizes if size > 0]
if not font_sizes:
continue
size_counts = Counter(round(s, 1) for s in font_sizes)
dominant_size = max(size_counts, key=lambda x: size_counts[x])
# Filter out superscript/subscript characters (< 70% of dominant size)
# Keep whitespace (size=-1) and normal-sized characters
min_size = dominant_size * 0.7
filtered_text = "".join(
char for char, size in chars_with_sizes if size < 0 or size >= min_size
)
text = filtered_text.strip()
if not text:
continue
# Calculate Y position as ratio (0=bottom, 1=top)
y_ratio = element.y0 / page_height if page_height > 0 else 0.5
avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0
blocks.append(
TextBlock(
text=text,
y_ratio=y_ratio,
font_size=avg_font_size,
page_num=page_num,
)
)
return blocks
def get_page_count(pdf_bytes: bytes) -> int:
"""Get the number of pages in a PDF.
Args:
pdf_bytes: Raw PDF file content.
Returns:
Number of pages in the PDF.
"""
pdf_file = io.BytesIO(pdf_bytes)
laparams = LAParams()
page_count = sum(1 for _ in extract_pages(pdf_file, laparams=laparams))
return page_count
def extract_text(pdf_bytes: bytes) -> str:
"""Extract and clean text from a PDF file.
Args:
pdf_bytes: Raw PDF file content.
Returns:
Cleaned text suitable for TTS.
"""
blocks = extract_text_blocks(pdf_bytes)
if not blocks:
return ""
# Filter out table content first
blocks = _filter_table_blocks(blocks)
cleaned_blocks = clean_text_blocks(blocks)
text = "\n\n".join(block.text for block in cleaned_blocks)
# Apply TTS-specific normalization
return normalize_for_tts(text)
def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
"""Remove headers, footers, page numbers, and other artifacts.
Applies multiple heuristics:
1. Remove blocks in top/bottom margins (likely headers/footers)
2. Remove repeated text across pages (likely running headers)
3. Remove standalone page numbers
4. Remove very short lines that look like artifacts
Args:
blocks: List of TextBlock objects.
Returns:
Filtered list of TextBlock objects.
"""
if not blocks:
return []
# Find repeated text patterns (headers/footers)
text_counts = Counter(block.text for block in blocks)
total_pages = max(block.page_num for block in blocks)
repeated_threshold = max(2, total_pages // 2)
repeated_texts = {text for text, count in text_counts.items() if count >= repeated_threshold}
# Calculate median font size for filtering
font_sizes = sorted(block.font_size for block in blocks)
median_font_size = font_sizes[len(font_sizes) // 2] if font_sizes else 10.0
cleaned: list[TextBlock] = []
for block in blocks:
# Skip if in header zone (top 10%)
if block.y_ratio > 0.90:
continue
# Skip if in footer zone (bottom 10%)
if block.y_ratio < 0.10:
continue
# Skip repeated text (running headers/footers)
if block.text in repeated_texts:
continue
# Skip standalone page numbers
if is_page_number(block.text):
continue
# Skip figure/table captions
if _is_caption(block.text):
continue
# Skip very short lines with small font (likely captions/footnotes)
if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
continue
cleaned.append(block)
return cleaned
def is_page_number(text: str) -> bool:
"""Check if text is likely a page number.
Args:
text: Text to check.
Returns:
True if text appears to be a page number.
"""
text = text.strip()
# Pure number
if text.isdigit():
return True
# Roman numerals
if re.match(r"^[ivxlcdmIVXLCDM]+$", text):
return True
# "Page N" or "N of M" patterns
if re.match(r"^(page\s*)?\d+(\s*(of|/)\s*\d+)?$", text, re.IGNORECASE):
return True
# "- N -" pattern
if re.match(r"^[-–—]\s*\d+\s*[-–—]$", text):
return True
return False
def clean_text(text: str) -> str:
"""Clean raw text for TTS processing.
This is a simpler function for cleaning already-extracted text,
without the positional information.
Args:
text: Raw text to clean.
Returns:
Cleaned text suitable for TTS.
"""
lines = text.split("\n")
cleaned_lines: list[str] = []
for line in lines:
line = line.strip()
# Skip empty lines
if not line:
continue
# Skip standalone page numbers
if is_page_number(line):
continue
# Skip very short lines (likely artifacts)
if len(line) < 3:
continue
cleaned_lines.append(line)
# Rejoin with proper spacing
result = "\n".join(cleaned_lines)
# === FIX HYPHENATED/SPLIT WORDS ===
# These are words broken across lines, common in PDFs and web content
# Pattern 1: word-\nword (hyphen at end of line) -> rejoin word
result = re.sub(r"(\w)-\n\s*(\w)", r"\1\2", result)
# Pattern 2: word-\n word (hyphen + newline + spaces)
result = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", result)
# Pattern 3: word- word (hyphen + space, often from copy-paste)
result = re.sub(r"(\w)- (\w)", r"\1\2", result)
# Pattern 4: Lines ending with hyphen followed by lowercase (likely continuation)
result = re.sub(r"-\n([a-z])", r"\1", result)
# === FIX LINE BREAK ARTIFACTS ===
# Join lines that don't end with sentence-ending punctuation
# This handles text that was wrapped at fixed width
# Replace single newlines (not paragraph breaks) with spaces
# Keep double newlines as paragraph separators
result = re.sub(r"(?<![.!?:;\n])\n(?!\n)", " ", result)
# Normalize whitespace
result = re.sub(r"\n{3,}", "\n\n", result)
result = re.sub(r"[ \t]+", " ", result)
# Apply TTS-specific normalization
result = normalize_for_tts(result)
return result.strip()
def normalize_for_tts(text: str) -> str:
"""Normalize text for natural TTS pronunciation.
Handles special characters, punctuation, and formatting that can
cause TTS models to slow down or mispronounce.
Args:
text: Text to normalize.
Returns:
Normalized text optimized for TTS.
"""
# === REMOVE ACADEMIC/PAPER ARTIFACTS ===
# Remove inline citations like (Smith et al., 2020) or (Smith, 2020; Jones, 2019)
# Also handles (Chen, 2018; Lee et al., 2020)
text = re.sub(r"\([^()]*\b\d{4}[a-z]?\b[^()]*\)", "", text)
# Remove author-year citations like "Smith (2020)" or "Smith et al. (2020)"
text = re.sub(
r"\b[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*\(\d{4}[a-z]?\)", "", text
)
# Clean up "by [Author]" patterns - remove the author part, keep "by" for grammar
# "by Smith" -> "" (will be cleaned up), "study by Smith found" -> "study found"
text = re.sub(
r"\bby\s+[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*,?\s*(?=found|showed|demonstrated|reported|observed|noted|suggested|concluded|argued|claimed|stated|proposed|discovered|revealed|indicated|confirmed)",
"",
text,
)
# Remove orphaned "et al." and similar
text = re.sub(r"\s+et\s+al\.?,?\s*", " ", text)
# Remove figure/table references like "see Figure 1" or "(see Table 2)"
text = re.sub(
r"\(?see\s+(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph|Appendix)\s*\d+[a-z]?\)?",
"",
text,
flags=re.IGNORECASE,
)
# Remove standalone figure/table references like "Figure 1 shows" -> "shows"
text = re.sub(
r"(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph)\s*\d+[a-z]?\s*(?:shows?|depicts?|illustrates?|presents?|displays?|summarizes?)",
"",
text,
flags=re.IGNORECASE,
)
# Remove section references like "Section 2.1" or "Chapter 3" (with surrounding context)
text = re.sub(
r"(?:in|see|as\s+(?:shown|described|discussed)\s+in|according\s+to)\s+(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*,?\s*",
"",
text,
flags=re.IGNORECASE,
)
text = re.sub(r"(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*", "", text, flags=re.IGNORECASE)
# Remove equation references like "Equation 1" or "Eq. (2)"
text = re.sub(r"(?:Equation|Eq\.?)\s*\(?\d+\)?", "", text, flags=re.IGNORECASE)
# Remove DOIs
text = re.sub(r"(?:doi:|DOI:?)\s*10\.\d{4,}/[^\s]+", "", text, flags=re.IGNORECASE)
# Remove arXiv references
text = re.sub(r"arXiv:\d{4}\.\d{4,}(?:v\d+)?", "", text, flags=re.IGNORECASE)
# Remove ISSN/ISBN numbers
text = re.sub(r"(?:ISSN|ISBN)[:\s]*[\d-]+", "", text, flags=re.IGNORECASE)
# Remove page ranges like "pp. 123-456" or "p. 42" or "pages 10-20"
text = re.sub(r"(?:p{1,2}\.?|pages?)\s*\d+(?:\s*[-–—]\s*\d+)?", "", text, flags=re.IGNORECASE)
# Remove volume/issue numbers like "Vol. 12, No. 3" (entire phrase)
text = re.sub(
r"(?:Vol(?:ume)?\.?\s*\d+,?\s*)?(?:Issue|No\.?)\s*\d+,?\s*", "", text, flags=re.IGNORECASE
)
text = re.sub(r"Vol(?:ume)?\.?\s*\d+,?\s*", "", text, flags=re.IGNORECASE)
# Remove copyright notices
text = re.sub(r"©\s*\d{4}[^.]*\.", "", text)
text = re.sub(r"Copyright\s*©?\s*\d{4}[^.]*\.", "", text, flags=re.IGNORECASE)
# Remove "All rights reserved" and similar
text = re.sub(r"All rights reserved\.?", "", text, flags=re.IGNORECASE)
# Remove asterisks used for footnote markers
text = re.sub(r"\*{1,3}(?=\s|$)", "", text)
# === NORMALIZE NEWLINES FIRST ===
# Convert various newline formats to standard \n
text = text.replace("\r\n", "\n").replace("\r", "\n")
# Replace single newlines (mid-sentence line breaks) with spaces
# Keep double newlines as paragraph separators
# First, normalize multiple newlines to exactly two
text = re.sub(r"\n{3,}", "\n\n", text)
# Replace single newlines that aren't paragraph breaks with spaces
# A single newline not preceded by sentence-ending punctuation is likely a line wrap
text = re.sub(r"(?<![.!?:\n])\n(?!\n)", " ", text)
# === CODE AND TECHNICAL CONTENT ===
# Handle common programming patterns that read poorly
# === REMOVE URLS AND TECHNICAL STRINGS FIRST ===
# URLs (various formats) - remove completely
text = re.sub(r"https?://[^\s<>\"')\]]+", "", text)
text = re.sub(r"www\.[^\s<>\"')\]]+", "", text)
text = re.sub(r"ftp://[^\s<>\"')\]]+", "", text)
# UUIDs (with or without dashes) - must come before git hash pattern
uuid_pattern = (
r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-" r"[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
)
text = re.sub(uuid_pattern, "", text)
# Git commit hashes (7-40 hex chars standalone)
text = re.sub(r"(?<![a-zA-Z0-9])[0-9a-f]{7,40}(?![a-zA-Z0-9])", "", text, flags=re.IGNORECASE)
# Hex color codes (#fff, #ffffff)
text = re.sub(r"#[0-9a-fA-F]{3,8}\b", "", text)
# Long hex/base64 strings (likely encoded data)
text = re.sub(r"\b[A-Za-z0-9+/]{20,}={0,2}\b", "", text)
# File paths (Unix and Windows style)
text = re.sub(r"[/\\][\w./\\-]+\.\w+", "", text)
# IP addresses
text = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "", text)
# Port numbers after colon
text = re.sub(r":\d{2,5}\b", "", text)
# Remove email addresses
text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", text)
# SHA/MD5 style hashes with prefix
text = re.sub(r"\b(sha\d*|md5|hash)[:\s]*[0-9a-f]+\b", "", text, flags=re.IGNORECASE)
# CamelCase: split into words (e.g., "getUserName" -> "get User Name")
text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
# snake_case: replace underscores with spaces
text = re.sub(r"(\w)_(\w)", r"\1 \2", text)
# Function calls: "func()" -> "func"
text = re.sub(r"(\w+)\(\)", r"\1", text)
# Arrow functions/operators: -> and =>
text = text.replace("->", " returns ")
text = text.replace("=>", " arrow ")
# Common code operators spoken naturally
text = text.replace("!=", " not equals ")
text = text.replace("==", " equals ")
text = text.replace("===", " strictly equals ")
text = text.replace("!==", " strictly not equals ")
text = text.replace("&&", " and ")
text = text.replace("||", " or ")
text = text.replace("++", " increment ")
text = text.replace("--", " decrement ")
# File extensions: ".py" -> " dot py" (only for common extensions)
ext_pattern = r"\.(py|js|ts|html|css|json|xml|md|txt|csv|pdf)\b"
text = re.sub(ext_pattern, r" dot \1", text, flags=re.IGNORECASE)
# Remove standalone hashes/pound signs (not hashtags)
text = re.sub(r"(?<!\w)#(?!\w)", "", text)
# Backticks (often used in markdown for code)
text = text.replace("`", "")
# Triple quotes
text = text.replace('"""', "")
text = text.replace("'''", "")
# === UNICODE NORMALIZATION ===
# Remove superscript characters (often footnote references)
# Includes Unicode superscript digits, letters, and modifier letters
superscripts = (
"⁰¹²³⁴⁵⁶⁷⁸⁹" # Superscript digits
"⁺⁻⁼⁽⁾" # Superscript operators
"ⁿⁱ" # Common superscript letters
"ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ" # Superscript lowercase
"ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᴬᴭᴮᴯᴰᴱᴲᴳᴴᴵᴶᴷᴸᴹᴺᴻᴼᴽᴾᴿᵀᵁᵂ" # Superscript uppercase
"ᶦᶧᶨᶩᶪᶫᶬᶭᶮᶯᶰᶱᶲᶳᶴᶵᶶᶷᶸᶹᶺᶻᶼᶽᶾᶿ" # More modifier letters
"ʰʱʲʳʴʵʶʷʸʹʺʻʼʽˀˁˆˇˈˉˊˋˌˍˎˏːˑ" # Modifier letters
)
for char in superscripts:
text = text.replace(char, "")
# Also use regex to catch any remaining superscript-like characters
# Unicode categories for superscripts and modifiers
text = re.sub(r"[\u2070-\u209F]", "", text) # Superscripts and Subscripts block
text = re.sub(r"[\u1D2C-\u1D6A]", "", text) # Phonetic Extensions (modifier letters)
text = re.sub(r"[\u1D78-\u1D7F]", "", text) # More phonetic extensions
text = re.sub(r"[\u02B0-\u02FF]", "", text) # Spacing Modifier Letters
# Remove subscript characters
subscripts = "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₔₕₖₗₘₙₚₛₜ"
for char in subscripts:
text = text.replace(char, "")
# Convert smart quotes to simple quotes
text = text.replace("\u201c", '"').replace("\u201d", '"')
text = text.replace("\u2018", "'").replace("\u2019", "'")
text = text.replace("\u201e", '"').replace("\u201f", '"')
# Normalize dashes to standard hyphen or remove
text = text.replace("–", "-") # en-dash
text = text.replace("—", " - ") # em-dash (add spaces for pause)
text = text.replace("―", " - ") # horizontal bar
text = text.replace("‐", "-") # Unicode hyphen
text = text.replace("‑", "-") # non-breaking hyphen
text = text.replace("⁃", "-") # hyphen bullet
text = text.replace("−", "-") # minus sign
# Normalize ellipsis
text = text.replace("…", "...")
text = re.sub(r"\.{4,}", "...", text) # Limit to 3 dots
# Normalize other Unicode punctuation
text = text.replace("•", ",") # Bullet points
text = text.replace("·", " ") # Middle dot
text = text.replace("‧", " ") # Hyphenation point
text = text.replace("※", " ") # Reference mark
text = text.replace("†", "") # Dagger (footnote)
text = text.replace("‡", "") # Double dagger
text = text.replace("§", "section ")
text = text.replace("¶", "") # Pilcrow
text = text.replace("©", "copyright ")
text = text.replace("®", " registered ")
text = text.replace("™", " trademark ")
text = text.replace("°", " degrees ")
# === SPACING AROUND PUNCTUATION ===
# Ensure proper spacing around dashes used as separators
text = re.sub(r"\s*-\s*-\s*", " - ", text) # Double dash
text = re.sub(r"(\w)\s*-\s*(\w)", r"\1 - \2", text) # Word-dash-word with spaces
# Fix missing space after punctuation
text = re.sub(r"([.!?])([A-Z])", r"\1 \2", text)
text = re.sub(r",([A-Za-z])", r", \1", text)
# Fix multiple punctuation marks
text = re.sub(r"[,]{2,}", ",", text)
text = re.sub(r"[;]{2,}", ";", text)
text = re.sub(r"[:]{2,}", ":", text)
text = re.sub(r"[!]{2,}", "!", text)
text = re.sub(r"[?]{2,}", "?", text)
# === NUMBERS AND SPECIAL NOTATIONS ===
# Convert common fractions
text = text.replace("½", " one half ")
text = text.replace("⅓", " one third ")
text = text.replace("⅔", " two thirds ")
text = text.replace("¼", " one quarter ")
text = text.replace("¾", " three quarters ")
text = text.replace("⅕", " one fifth ")
text = text.replace("⅖", " two fifths ")
text = text.replace("⅗", " three fifths ")
text = text.replace("⅘", " four fifths ")
text = text.replace("⅙", " one sixth ")
text = text.replace("⅚", " five sixths ")
text = text.replace("⅛", " one eighth ")
text = text.replace("⅜", " three eighths ")
text = text.replace("⅝", " five eighths ")
text = text.replace("⅞", " seven eighths ")
# Handle percentage and math symbols
text = text.replace("%", " percent")
text = text.replace("&", " and ")
text = text.replace("+", " plus ")
text = text.replace("=", " equals ")
text = text.replace("<", " less than ")
text = text.replace(">", " greater than ")
text = text.replace("≤", " less than or equal to ")
text = text.replace("≥", " greater than or equal to ")
text = text.replace("≠", " not equal to ")
text = text.replace("±", " plus or minus ")
text = text.replace("×", " times ")
text = text.replace("÷", " divided by ")
# === ABBREVIATIONS AND SPECIAL CASES ===
# Common abbreviations that might cause issues
text = re.sub(r"\be\.g\.", "for example", text, flags=re.IGNORECASE)
text = re.sub(r"\bi\.e\.", "that is", text, flags=re.IGNORECASE)
text = re.sub(r"\betc\.", "etcetera", text, flags=re.IGNORECASE)
text = re.sub(r"\bvs\.", "versus", text, flags=re.IGNORECASE)
text = re.sub(r"\bDr\.", "Doctor", text)
text = re.sub(r"\bMr\.", "Mister", text)
text = re.sub(r"\bMrs\.", "Missus", text)
text = re.sub(r"\bMs\.", "Miss", text)
text = re.sub(r"\bProf\.", "Professor", text)
text = re.sub(r"\bSt\.", "Saint", text)
text = re.sub(r"\bNo\.\s*(\d)", r"Number \1", text)
text = re.sub(r"\bFig\.", "Figure", text, flags=re.IGNORECASE)
text = re.sub(r"\bVol\.", "Volume", text, flags=re.IGNORECASE)
text = re.sub(r"\bpp\.", "pages", text, flags=re.IGNORECASE)
text = re.sub(r"\bp\.\s*(\d)", r"page \1", text, flags=re.IGNORECASE)
# === BRACKETS AND PARENTHESES ===
# Remove or simplify brackets that might cause pauses
text = re.sub(r"\[([^\]]+)\]", r"(\1)", text) # Square to round
text = re.sub(r"\{([^}]+)\}", r"(\1)", text) # Curly to round
# Remove citation numbers like [1], [2,3], [1-5]
text = re.sub(r"\[\d+(?:[-,]\d+)*\]", "", text)
text = re.sub(r"\(\d+(?:[-,]\d+)*\)", "", text)
# === CLEANUP ===
# Remove standalone special characters
text = re.sub(r"\s+[#@*^~`|\\]+\s+", " ", text)
# Remove content in angle brackets (often HTML/XML artifacts)
text = re.sub(r"<[^>]+>", "", text)
# Remove spaces before punctuation
text = re.sub(r"\s+([.,;:!?])", r"\1", text)
# Ensure space after punctuation (but not before another punctuation)
text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text)
# === FINAL WHITESPACE NORMALIZATION ===
# This must happen LAST after all substitutions that can create gaps
# Collapse all whitespace (spaces, tabs, multiple spaces) to single space
# Do this per-line to preserve intentional paragraph breaks
lines = text.split("\n")
normalized_lines = []
for line in lines:
# Replace any sequence of whitespace with single space
line = re.sub(r"[ \t]+", " ", line)
# Strip leading/trailing whitespace from each line
line = line.strip()
normalized_lines.append(line)
text = "\n".join(normalized_lines)
# Remove excessive blank lines (keep max 1 blank line between paragraphs)
text = re.sub(r"\n{3,}", "\n\n", text)
# Remove blank lines at start/end
text = text.strip()
return text