| """PDF text extraction and cleaning for TTS processing.""" | |
| from __future__ import annotations | |
| import io | |
| import re | |
| from collections import Counter | |
| from dataclasses import dataclass | |
| from pdfminer.high_level import extract_pages | |
| from pdfminer.layout import ( | |
| LAParams, | |
| LTAnno, | |
| LTChar, | |
| LTPage, | |
| LTTextBoxHorizontal, | |
| LTTextLineHorizontal, | |
| ) | |


@dataclass
class TextBlock:
    """A block of text with positional metadata."""

    text: str
    y_ratio: float  # 0.0 = bottom, 1.0 = top
    font_size: float
    page_num: int
    x0: float = 0.0  # Left edge position for table detection
    x1: float = 0.0  # Right edge position for table detection


def _is_caption(text: str) -> bool:
    """Check if text is a figure/table caption.

    Captions typically start with:
    - "Figure 1:", "Fig. 2:", "Figure 1."
    - "Table 1:", "Table 2."
    - "Exhibit A:", "Chart 1:"
    - "Source:", "Note:", "Notes:"

    Args:
        text: Text to check.

    Returns:
        True if text appears to be a caption.
    """
    text = text.strip()
    if not text:
        return False

    # Common caption patterns (matched against the lowercased start)
    caption_patterns = [
        r"^fig(?:ure)?\.?\s*\d",
        r"^table\.?\s*\d",
        r"^exhibit\.?\s*[a-z0-9]",
        r"^chart\.?\s*\d",
        r"^graph\.?\s*\d",
        r"^diagram\.?\s*\d",
        r"^plate\.?\s*\d",
        r"^scheme\.?\s*\d",
        r"^box\.?\s*\d",
        r"^panel\.?\s*[a-z0-9]",
        r"^appendix\.?\s*[a-z0-9]",
        r"^source\s*:",
        r"^sources\s*:",
        r"^note\s*:",
        r"^notes\s*:",
        r"^data\s*:",
        r"^\*\s*p\s*[<>=]",  # Statistical notes like "* p < 0.05"
        r"^legend\s*:",
    ]
    text_lower = text.lower()
    for pattern in caption_patterns:
        if re.match(pattern, text_lower):
            return True
    return False
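

# Illustrative behavior on assumed inputs (a sketch, not a test suite):
#
#     _is_caption("Figure 3: Model accuracy by epoch")   # True
#     _is_caption("Source: Bureau of Labor Statistics")  # True
#     _is_caption("We evaluate three baseline models.")  # False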


def _is_table_like_text(text: str) -> bool:
    """Check if text looks like table content.

    Tables often have:
    - Very short text fragments
    - Mostly numbers or single words
    - Lots of whitespace-separated values
    - Column headers or row labels
    - Short phrases without sentence structure

    Args:
        text: Text to check.

    Returns:
        True if the text appears to be table content.
    """
    text = text.strip()

    # Very short fragments are likely table cells
    if len(text) < 5:
        return True

    # Count numbers vs letters
    digits = sum(1 for c in text if c.isdigit())
    letters = sum(1 for c in text if c.isalpha())
    # Mostly numbers with few letters (like "123.45" or "2024")
    if digits > 0 and letters < 3 and digits >= letters:
        return True

    # Check for patterns common in tables:
    # multiple tab-separated or heavily spaced values
    if "\t" in text or "  " in text:
        parts = re.split(r"\s{2,}|\t", text)
        if len(parts) >= 3:
            # Multiple short parts suggests a table row
            short_parts = sum(1 for p in parts if len(p.strip()) < 15)
            if short_parts >= len(parts) * 0.6:
                return True

    # Single words that look like column headers
    words = text.split()
    if len(words) == 1 and len(text) < 20:
        # Common table headers/labels
        table_keywords = {
            "total",
            "sum",
            "avg",
            "average",
            "mean",
            "count",
            "min",
            "max",
            "date",
            "time",
            "year",
            "month",
            "day",
            "name",
            "id",
            "no",
            "no.",
            "value",
            "amount",
            "price",
            "cost",
            "qty",
            "quantity",
            "unit",
            "row",
            "column",
            "col",
            "item",
            "description",
            "desc",
            "note",
            "status",
            "type",
            "category",
            "code",
            "ref",
            "reference",
        }
        if text.lower() in table_keywords:
            return True

    # Short phrases without sentence structure (likely table cells)
    # Table cells typically:
    # - Are short (< 50 chars)
    # - Don't end with sentence-ending punctuation
    # - Have few words (< 8)
    if len(text) < 50 and len(words) < 8:
        # Doesn't end like a sentence
        if not text.rstrip().endswith((".", "!", "?", ":")):
            text_lower = text.lower()
            # Technical/status phrases common in tables; matched on word
            # boundaries so e.g. "no" does not fire inside "north"
            table_phrases = [
                "supported",
                "not supported",
                "yes",
                "no",
                "n/a",
                "none",
                "required",
                "optional",
                "enabled",
                "disabled",
                "active",
                "inactive",
                "read-only",
                "read only",
                "write",
                "read/write",
                "read-write",
                "must be",
                "can be",
                "should be",
                "will be",
                "available",
                "unavailable",
                "pending",
                "completed",
                "failed",
                "true",
                "false",
                "default",
                "custom",
                "manual",
                "automatic",
                "identical",
                "different",
                "same",
                "other",
            ]
            for phrase in table_phrases:
                if re.search(rf"\b{re.escape(phrase)}\b", text_lower):
                    return True
            # Looks like a label or header (Title Case or ALL CAPS, short)
            if len(words) <= 4 and len(text) < 40:
                # Check if it's Title Case or contains common label patterns
                if text.istitle() or text.isupper():
                    return True
                # Two-three word phrases that look like labels
                if len(words) in (2, 3) and all(w[0].isupper() for w in words if w):
                    return True

    return False
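

# Illustrative behavior on assumed inputs:
#
#     _is_table_like_text("42.7")       # True  (numeric cell)
#     _is_table_like_text("Qty")        # True  (short fragment)
#     _is_table_like_text("Read-only")  # True  (status phrase)
#     _is_table_like_text("The model converges after ten epochs.")  # False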


def _filter_table_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
    """Filter out blocks that appear to be part of tables.

    Detects tables by looking for:
    - Multiple blocks at similar Y positions (table rows)
    - Blocks with table-like content

    Args:
        blocks: List of text blocks.

    Returns:
        Filtered list with table content removed.
    """
    if not blocks:
        return blocks

    # Group blocks by page and approximate Y position (row detection).
    # Blocks within ~1% of page height are considered the same row.
    filtered = []
    for page_num in {b.page_num for b in blocks}:
        page_blocks = [b for b in blocks if b.page_num == page_num]

        # Group by Y position (rounded to detect rows)
        y_groups: dict[float, list[TextBlock]] = {}
        for block in page_blocks:
            y_key = round(block.y_ratio, 2)  # Group within ~1% of page
            y_groups.setdefault(y_key, []).append(block)

        for row_blocks in y_groups.values():
            # If many blocks share a Y position, it is likely a table row
            if len(row_blocks) >= 3:
                # Check if most blocks look like table cells
                table_like = sum(1 for b in row_blocks if _is_table_like_text(b.text))
                if table_like >= len(row_blocks) * 0.5:
                    # Skip this entire row - it's a table
                    continue
            # Filter individual blocks that look like table content
            for block in row_blocks:
                if not _is_table_like_text(block.text):
                    filtered.append(block)

    # Sort by page and position (top to bottom)
    filtered.sort(key=lambda b: (b.page_num, -b.y_ratio))
    return filtered
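

# A minimal sketch of the row heuristic (hypothetical blocks, same page):
# three cells share a rounded y_ratio, at least half look table-like, so
# the whole row is dropped.
#
#     row = [
#         TextBlock("Qty", 0.50, 10.0, 1),
#         TextBlock("Price", 0.50, 10.0, 1),
#         TextBlock("129.99", 0.501, 10.0, 1),
#     ]
#     _filter_table_blocks(row)  # -> []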


def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]:
    """Extract text blocks from PDF with positional information.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        List of TextBlock objects with text and metadata.
    """
    blocks: list[TextBlock] = []
    pdf_file = io.BytesIO(pdf_bytes)
    laparams = LAParams(
        line_margin=0.5,
        word_margin=0.1,
        char_margin=2.0,
        boxes_flow=0.5,
    )

    for page_num, page_layout in enumerate(extract_pages(pdf_file, laparams=laparams), start=1):
        if not isinstance(page_layout, LTPage):
            continue
        page_height = page_layout.height

        for element in page_layout:
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            # Extract characters with their font sizes.
            # LTChar has a font size; LTAnno is whitespace (use size=-1 to always keep)
            chars_with_sizes: list[tuple[str, float]] = []
            for line in element:
                if isinstance(line, LTTextLineHorizontal):
                    for char in line:
                        if isinstance(char, LTChar):
                            chars_with_sizes.append((char.get_text(), char.size))
                        elif isinstance(char, LTAnno):
                            # Whitespace/newlines - always keep (use -1 as marker)
                            chars_with_sizes.append((char.get_text(), -1))

            if not chars_with_sizes:
                text = element.get_text().strip()
                if text:
                    blocks.append(
                        TextBlock(
                            text=text,
                            y_ratio=element.y0 / page_height if page_height > 0 else 0.5,
                            font_size=10.0,
                            page_num=page_num,
                        )
                    )
                continue

            # Find dominant font size (most common, excluding whitespace markers)
            font_sizes = [size for _, size in chars_with_sizes if size > 0]
            if not font_sizes:
                continue
            size_counts = Counter(round(s, 1) for s in font_sizes)
            dominant_size = size_counts.most_common(1)[0][0]

            # Filter out superscript/subscript characters (< 70% of dominant size).
            # Keep whitespace (size=-1) and normal-sized characters
            min_size = dominant_size * 0.7
            filtered_text = "".join(
                char for char, size in chars_with_sizes if size < 0 or size >= min_size
            )
            text = filtered_text.strip()
            if not text:
                continue

            # Calculate Y position as ratio (0=bottom, 1=top)
            y_ratio = element.y0 / page_height if page_height > 0 else 0.5
            avg_font_size = sum(font_sizes) / len(font_sizes)

            blocks.append(
                TextBlock(
                    text=text,
                    y_ratio=y_ratio,
                    font_size=avg_font_size,
                    page_num=page_num,
                )
            )

    return blocks
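

# Typical usage (illustrative; "paper.pdf" is a stand-in path):
#
#     with open("paper.pdf", "rb") as f:
#         blocks = extract_text_blocks(f.read())
#     for b in blocks[:5]:
#         print(b.page_num, round(b.y_ratio, 2), b.text[:60])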


def get_page_count(pdf_bytes: bytes) -> int:
    """Get the number of pages in a PDF.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Number of pages in the PDF.
    """
    pdf_file = io.BytesIO(pdf_bytes)
    laparams = LAParams()
    return sum(1 for _ in extract_pages(pdf_file, laparams=laparams))


def extract_text(pdf_bytes: bytes) -> str:
    """Extract and clean text from a PDF file.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Cleaned text suitable for TTS.
    """
    blocks = extract_text_blocks(pdf_bytes)
    if not blocks:
        return ""

    # Filter out table content first
    blocks = _filter_table_blocks(blocks)
    cleaned_blocks = clean_text_blocks(blocks)
    text = "\n\n".join(block.text for block in cleaned_blocks)

    # Apply TTS-specific normalization
    return normalize_for_tts(text)


def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
    """Remove headers, footers, page numbers, and other artifacts.

    Applies multiple heuristics:
    1. Remove blocks in top/bottom margins (likely headers/footers)
    2. Remove repeated text across pages (likely running headers)
    3. Remove standalone page numbers
    4. Remove very short lines that look like artifacts

    Args:
        blocks: List of TextBlock objects.

    Returns:
        Filtered list of TextBlock objects.
    """
    if not blocks:
        return []

    # Find repeated text patterns (headers/footers)
    text_counts = Counter(block.text for block in blocks)
    total_pages = max(block.page_num for block in blocks)
    repeated_threshold = max(2, total_pages // 2)
    repeated_texts = {text for text, count in text_counts.items() if count >= repeated_threshold}

    # Calculate median font size for filtering
    font_sizes = sorted(block.font_size for block in blocks)
    median_font_size = font_sizes[len(font_sizes) // 2] if font_sizes else 10.0

    cleaned: list[TextBlock] = []
    for block in blocks:
        # Skip if in header zone (top 10%)
        if block.y_ratio > 0.90:
            continue
        # Skip if in footer zone (bottom 10%)
        if block.y_ratio < 0.10:
            continue
        # Skip repeated text (running headers/footers)
        if block.text in repeated_texts:
            continue
        # Skip standalone page numbers
        if is_page_number(block.text):
            continue
        # Skip figure/table captions
        if _is_caption(block.text):
            continue
        # Skip very short lines with small font (likely captions/footnotes)
        if len(block.text) < 20 and block.font_size < median_font_size * 0.8:
            continue
        cleaned.append(block)

    return cleaned


def is_page_number(text: str) -> bool:
    """Check if text is likely a page number.

    Args:
        text: Text to check.

    Returns:
        True if text appears to be a page number.
    """
    text = text.strip()
    # Pure number
    if text.isdigit():
        return True
    # Roman numerals
    if re.match(r"^[ivxlcdmIVXLCDM]+$", text):
        return True
    # "Page N" or "N of M" patterns
    if re.match(r"^(page\s*)?\d+(\s*(of|/)\s*\d+)?$", text, re.IGNORECASE):
        return True
    # "- N -" pattern
    if re.match(r"^[-–—]\s*\d+\s*[-–—]$", text):
        return True
    return False
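

# Illustrative behavior on assumed inputs:
#
#     is_page_number("42")            # True
#     is_page_number("xiv")           # True
#     is_page_number("Page 3 of 10")  # True
#     is_page_number("- 7 -")         # True
#     is_page_number("Chapter 42")    # False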


def clean_text(text: str) -> str:
    """Clean raw text for TTS processing.

    This is a simpler function for cleaning already-extracted text,
    without the positional information.

    Args:
        text: Raw text to clean.

    Returns:
        Cleaned text suitable for TTS.
    """
    lines = text.split("\n")
    cleaned_lines: list[str] = []
    for line in lines:
        line = line.strip()
        # Skip empty lines
        if not line:
            continue
        # Skip standalone page numbers
        if is_page_number(line):
            continue
        # Skip very short lines (likely artifacts)
        if len(line) < 3:
            continue
        cleaned_lines.append(line)

    # Rejoin with proper spacing
    result = "\n".join(cleaned_lines)

    # === FIX HYPHENATED/SPLIT WORDS ===
    # These are words broken across lines, common in PDFs and web content
    # Pattern 1: hyphen at end of line, optional whitespace around the
    # newline ("word-\nword" or "word-\n  word") -> rejoin the word
    result = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", result)
    # Pattern 2: word- word (hyphen + space, often from copy-paste)
    result = re.sub(r"(\w)- (\w)", r"\1\2", result)
    # Pattern 3: lines ending with hyphen followed by lowercase (likely continuation)
    result = re.sub(r"-\n([a-z])", r"\1", result)

    # === FIX LINE BREAK ARTIFACTS ===
    # Join lines that don't end with sentence-ending punctuation.
    # This handles text that was wrapped at fixed width:
    # replace single newlines (not paragraph breaks) with spaces,
    # keep double newlines as paragraph separators
    result = re.sub(r"(?<![.!?:;\n])\n(?!\n)", " ", result)

    # Normalize whitespace
    result = re.sub(r"\n{3,}", "\n\n", result)
    result = re.sub(r"[ \t]+", " ", result)

    # Apply TTS-specific normalization
    result = normalize_for_tts(result)
    return result.strip()
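

# Illustrative round-trip (assumed input): fixed-width wrapped text with a
# hyphenated line break comes back as one flowing sentence.
#
#     clean_text("The experi-\nment ran for three\nweeks.")
#     # -> "The experiment ran for three weeks."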


def normalize_for_tts(text: str) -> str:
    """Normalize text for natural TTS pronunciation.

    Handles special characters, punctuation, and formatting that can
    cause TTS models to slow down or mispronounce.

    Args:
        text: Text to normalize.

    Returns:
        Normalized text optimized for TTS.
    """
    # === REMOVE ACADEMIC/PAPER ARTIFACTS ===
    # Remove inline citations like (Smith et al., 2020) or (Smith, 2020; Jones, 2019)
    # Also handles (Chen, 2018; Lee et al., 2020)
    text = re.sub(r"\([^()]*\b\d{4}[a-z]?\b[^()]*\)", "", text)
    # Remove author-year citations like "Smith (2020)" or "Smith et al. (2020)"
    text = re.sub(
        r"\b[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*\(\d{4}[a-z]?\)", "", text
    )
    # Clean up "by [Author]" patterns - remove the author part, keep "by" for grammar
    # "by Smith" -> "" (will be cleaned up), "study by Smith found" -> "study found"
    text = re.sub(
        r"\bby\s+[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*,?\s*(?=found|showed|demonstrated|reported|observed|noted|suggested|concluded|argued|claimed|stated|proposed|discovered|revealed|indicated|confirmed)",
        "",
        text,
    )
    # Remove orphaned "et al." and similar
    text = re.sub(r"\s+et\s+al\.?,?\s*", " ", text)
    # Remove figure/table references like "see Figure 1" or "(see Table 2)"
    text = re.sub(
        r"\(?see\s+(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph|Appendix)\s*\d+[a-z]?\)?",
        "",
        text,
        flags=re.IGNORECASE,
    )
    # Remove standalone figure/table references like "Figure 1 shows" -> "shows"
    text = re.sub(
        r"(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph)\s*\d+[a-z]?\s*(?:shows?|depicts?|illustrates?|presents?|displays?|summarizes?)",
        "",
        text,
        flags=re.IGNORECASE,
    )
    # Remove section references like "Section 2.1" or "Chapter 3" (with surrounding context)
    text = re.sub(
        r"(?:in|see|as\s+(?:shown|described|discussed)\s+in|according\s+to)\s+(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*,?\s*",
        "",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"\b(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*", "", text, flags=re.IGNORECASE)
    # Remove equation references like "Equation 1" or "Eq. (2)"
    text = re.sub(r"\b(?:Equation|Eq\.?)\s*\(?\d+\)?", "", text, flags=re.IGNORECASE)
    # Remove DOIs
    text = re.sub(r"(?:doi:|DOI:?)\s*10\.\d{4,}/[^\s]+", "", text, flags=re.IGNORECASE)
    # Remove arXiv references
    text = re.sub(r"arXiv:\d{4}\.\d{4,}(?:v\d+)?", "", text, flags=re.IGNORECASE)
    # Remove ISSN/ISBN numbers
    text = re.sub(r"\b(?:ISSN|ISBN)[:\s]*[\d-]+", "", text, flags=re.IGNORECASE)
    # Remove page ranges like "pp. 123-456" or "p. 42" or "pages 10-20";
    # the leading \b keeps words ending in "p" ("Top 10") intact
    text = re.sub(r"\b(?:pp?\.?|pages?)\s*\d+(?:\s*[-–—]\s*\d+)?", "", text, flags=re.IGNORECASE)
    # Remove volume/issue numbers like "Vol. 12, No. 3" (entire phrase)
    text = re.sub(
        r"\b(?:Vol(?:ume)?\.?\s*\d+,?\s*)?(?:Issue|No\.?)\s*\d+,?\s*", "", text, flags=re.IGNORECASE
    )
    text = re.sub(r"\bVol(?:ume)?\.?\s*\d+,?\s*", "", text, flags=re.IGNORECASE)
    # Remove copyright notices
    text = re.sub(r"©\s*\d{4}[^.]*\.", "", text)
    text = re.sub(r"Copyright\s*©?\s*\d{4}[^.]*\.", "", text, flags=re.IGNORECASE)
    # Remove "All rights reserved" and similar
    text = re.sub(r"All rights reserved\.?", "", text, flags=re.IGNORECASE)
    # Remove asterisks used for footnote markers
    text = re.sub(r"\*{1,3}(?=\s|$)", "", text)

    # === NORMALIZE NEWLINES FIRST ===
    # Convert various newline formats to standard \n
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Replace single newlines (mid-sentence line breaks) with spaces,
    # keeping double newlines as paragraph separators.
    # First, normalize multiple newlines to exactly two
    text = re.sub(r"\n{3,}", "\n\n", text)
    # Replace single newlines that aren't paragraph breaks with spaces.
    # A single newline not preceded by sentence-ending punctuation is likely
    # a line wrap (same lookbehind set as clean_text, including ";")
    text = re.sub(r"(?<![.!?:;\n])\n(?!\n)", " ", text)

    # === CODE AND TECHNICAL CONTENT ===
    # Handle common programming patterns that read poorly

    # === REMOVE URLS AND TECHNICAL STRINGS FIRST ===
    # URLs (various formats) - remove completely
    text = re.sub(r"https?://[^\s<>\"')\]]+", "", text)
    text = re.sub(r"www\.[^\s<>\"')\]]+", "", text)
    text = re.sub(r"ftp://[^\s<>\"')\]]+", "", text)
    # UUIDs (with or without dashes) - must come before git hash pattern
    uuid_pattern = (
        r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-" r"[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"
    )
    text = re.sub(uuid_pattern, "", text)
    # Git commit hashes (7-40 hex chars standalone); require at least one
    # digit so ordinary hex-letter words like "defaced" survive
    text = re.sub(
        r"(?<![a-zA-Z0-9])(?=[0-9a-fA-F]*\d)[0-9a-f]{7,40}(?![a-zA-Z0-9])",
        "",
        text,
        flags=re.IGNORECASE,
    )
    # Hex color codes (#fff, #ffffff)
    text = re.sub(r"#[0-9a-fA-F]{3,8}\b", "", text)
    # Long hex/base64 strings (likely encoded data)
    text = re.sub(r"\b[A-Za-z0-9+/]{20,}={0,2}\b", "", text)
    # File paths (Unix and Windows style)
    text = re.sub(r"[/\\][\w./\\-]+\.\w+", "", text)
    # IP addresses
    text = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "", text)
    # Port numbers after colon
    text = re.sub(r":\d{2,5}\b", "", text)
    # Remove email addresses
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", text)
    # SHA/MD5 style hashes with prefix
    text = re.sub(r"\b(sha\d*|md5|hash)[:\s]*[0-9a-f]+\b", "", text, flags=re.IGNORECASE)

    # CamelCase: split into words (e.g., "getUserName" -> "get User Name")
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    # snake_case: replace underscores with spaces
    text = re.sub(r"(\w)_(\w)", r"\1 \2", text)
    # Function calls: "func()" -> "func"
    text = re.sub(r"(\w+)\(\)", r"\1", text)
    # Arrow functions/operators: -> and =>
    text = text.replace("->", " returns ")
    text = text.replace("=>", " arrow ")
    # Common code operators spoken naturally; longer operators must be
    # replaced before their prefixes ("!==" before "!=", "===" before "==")
    text = text.replace("!==", " strictly not equals ")
    text = text.replace("===", " strictly equals ")
    text = text.replace("!=", " not equals ")
    text = text.replace("==", " equals ")
    text = text.replace("&&", " and ")
    text = text.replace("||", " or ")
    text = text.replace("++", " increment ")
    text = text.replace("--", " decrement ")
    # File extensions: ".py" -> " dot py" (only for common extensions)
    ext_pattern = r"\.(py|js|ts|html|css|json|xml|md|txt|csv|pdf)\b"
    text = re.sub(ext_pattern, r" dot \1", text, flags=re.IGNORECASE)
    # Remove standalone hashes/pound signs (not hashtags)
    text = re.sub(r"(?<!\w)#(?!\w)", "", text)
    # Backticks (often used in markdown for code)
    text = text.replace("`", "")
    # Triple quotes
    text = text.replace('"""', "")
    text = text.replace("'''", "")

    # === UNICODE NORMALIZATION ===
    # Remove superscript characters (often footnote references).
    # Includes Unicode superscript digits, letters, and modifier letters
    superscripts = (
        "⁰¹²³⁴⁵⁶⁷⁸⁹"  # Superscript digits
        "⁺⁻⁼⁽⁾"  # Superscript operators
        "ⁿⁱ"  # Common superscript letters
        "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ"  # Superscript lowercase
        "ᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁⱽᵂᴭᴯᴲᴻᴽ"  # Superscript uppercase
        "ᶦᶧᶨᶩᶪᶫᶬᶭᶮᶯᶰᶱᶲᶳᶴᶵᶶᶷᶸᶹᶺᶻᶼᶽᶾᶿ"  # More modifier letters
        "ʰʱʲʳʴʵʶʷʸʹʺʻʼʽˀˁˆˇˈˉˊˋˌˍˎˏːˑ"  # Modifier letters
    )
    for char in superscripts:
        text = text.replace(char, "")
    # Also use regex to catch any remaining superscript-like characters
    # (Unicode blocks for superscripts and modifiers)
    text = re.sub(r"[\u2070-\u209F]", "", text)  # Superscripts and Subscripts block
    text = re.sub(r"[\u1D2C-\u1D6A]", "", text)  # Phonetic Extensions (modifier letters)
    text = re.sub(r"[\u1D78-\u1D7F]", "", text)  # More phonetic extensions
    text = re.sub(r"[\u02B0-\u02FF]", "", text)  # Spacing Modifier Letters

    # Remove subscript characters
    subscripts = "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₔₕₖₗₘₙₚₛₜ"
    for char in subscripts:
        text = text.replace(char, "")

    # Convert smart quotes to simple quotes
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201e", '"').replace("\u201f", '"')

    # Normalize dashes to standard hyphen or remove
    text = text.replace("–", "-")  # en-dash
    text = text.replace("—", " - ")  # em-dash (add spaces for pause)
    text = text.replace("―", " - ")  # horizontal bar
    text = text.replace("‐", "-")  # Unicode hyphen
    text = text.replace("‑", "-")  # non-breaking hyphen
    text = text.replace("⁃", "-")  # hyphen bullet
    text = text.replace("−", "-")  # minus sign

    # Normalize ellipsis
    text = text.replace("…", "...")
    text = re.sub(r"\.{4,}", "...", text)  # Limit to 3 dots

    # Normalize other Unicode punctuation
    text = text.replace("•", ",")  # Bullet points
    text = text.replace("·", " ")  # Middle dot
    text = text.replace("‧", " ")  # Hyphenation point
    text = text.replace("※", " ")  # Reference mark
    text = text.replace("†", "")  # Dagger (footnote)
    text = text.replace("‡", "")  # Double dagger
    text = text.replace("§", "section ")
    text = text.replace("¶", "")  # Pilcrow
    text = text.replace("©", "copyright ")
    text = text.replace("®", " registered ")
    text = text.replace("™", " trademark ")
    text = text.replace("°", " degrees ")

    # === SPACING AROUND PUNCTUATION ===
    # Ensure proper spacing around dashes used as separators
    text = re.sub(r"\s*-\s*-\s*", " - ", text)  # Double dash
    # Word-dash-word written with spaces; require a space on at least one
    # side so hyphenated compounds like "well-known" are left intact
    text = re.sub(r"(\w)(?: +- *| *- +)(\w)", r"\1 - \2", text)
    # Fix missing space after punctuation
    text = re.sub(r"([.!?])([A-Z])", r"\1 \2", text)
    text = re.sub(r",([A-Za-z])", r", \1", text)
    # Fix multiple punctuation marks
    text = re.sub(r"[,]{2,}", ",", text)
    text = re.sub(r"[;]{2,}", ";", text)
    text = re.sub(r"[:]{2,}", ":", text)
    text = re.sub(r"[!]{2,}", "!", text)
    text = re.sub(r"[?]{2,}", "?", text)

    # === NUMBERS AND SPECIAL NOTATIONS ===
    # Convert common fractions
    text = text.replace("½", " one half ")
    text = text.replace("⅓", " one third ")
    text = text.replace("⅔", " two thirds ")
    text = text.replace("¼", " one quarter ")
    text = text.replace("¾", " three quarters ")
    text = text.replace("⅕", " one fifth ")
    text = text.replace("⅖", " two fifths ")
    text = text.replace("⅗", " three fifths ")
    text = text.replace("⅘", " four fifths ")
    text = text.replace("⅙", " one sixth ")
    text = text.replace("⅚", " five sixths ")
    text = text.replace("⅛", " one eighth ")
    text = text.replace("⅜", " three eighths ")
    text = text.replace("⅝", " five eighths ")
    text = text.replace("⅞", " seven eighths ")
    # Strip HTML/XML tags while "<" and ">" still exist; the angle-bracket
    # verbalization below would otherwise hide them from this cleanup
    text = re.sub(r"<[^>]+>", "", text)
    # Handle percentage and math symbols
    text = text.replace("%", " percent")
    text = text.replace("&", " and ")
    text = text.replace("+", " plus ")
    text = text.replace("=", " equals ")
    text = text.replace("<", " less than ")
    text = text.replace(">", " greater than ")
    text = text.replace("≤", " less than or equal to ")
    text = text.replace("≥", " greater than or equal to ")
    text = text.replace("≠", " not equal to ")
    text = text.replace("±", " plus or minus ")
    text = text.replace("×", " times ")
    text = text.replace("÷", " divided by ")

    # === ABBREVIATIONS AND SPECIAL CASES ===
    # Common abbreviations that might cause issues
    text = re.sub(r"\be\.g\.", "for example", text, flags=re.IGNORECASE)
    text = re.sub(r"\bi\.e\.", "that is", text, flags=re.IGNORECASE)
    text = re.sub(r"\betc\.", "etcetera", text, flags=re.IGNORECASE)
    text = re.sub(r"\bvs\.", "versus", text, flags=re.IGNORECASE)
    text = re.sub(r"\bDr\.", "Doctor", text)
    text = re.sub(r"\bMr\.", "Mister", text)
    text = re.sub(r"\bMrs\.", "Missus", text)
    text = re.sub(r"\bMs\.", "Miz", text)  # spoken form of "Ms."
    text = re.sub(r"\bProf\.", "Professor", text)
    text = re.sub(r"\bSt\.", "Saint", text)
    text = re.sub(r"\bNo\.\s*(\d)", r"Number \1", text)
    text = re.sub(r"\bFig\.", "Figure", text, flags=re.IGNORECASE)
    text = re.sub(r"\bVol\.", "Volume", text, flags=re.IGNORECASE)
    text = re.sub(r"\bpp\.", "pages", text, flags=re.IGNORECASE)
    text = re.sub(r"\bp\.\s*(\d)", r"page \1", text, flags=re.IGNORECASE)

    # === BRACKETS AND PARENTHESES ===
    # Remove citation numbers like [1], [2,3], [1-5] while the square
    # brackets still exist, then simplify remaining brackets
    text = re.sub(r"\[\d+(?:[-,]\d+)*\]", "", text)
    text = re.sub(r"\(\d+(?:[-,]\d+)*\)", "", text)
    text = re.sub(r"\[([^\]]+)\]", r"(\1)", text)  # Square to round
    text = re.sub(r"\{([^}]+)\}", r"(\1)", text)  # Curly to round

    # === CLEANUP ===
    # Remove standalone special characters
    text = re.sub(r"\s+[#@*^~`|\\]+\s+", " ", text)
    # Remove spaces before punctuation
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)
    # Ensure space after punctuation, but not before another punctuation
    # mark or a digit (keep decimals like 3.14 and 3,000 intact)
    text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"\d])", r"\1 \2", text)

    # === FINAL WHITESPACE NORMALIZATION ===
    # This must happen LAST, after all substitutions that can create gaps.
    # Collapse runs of spaces/tabs to a single space, per-line, to preserve
    # intentional paragraph breaks
    lines = text.split("\n")
    normalized_lines = []
    for line in lines:
        # Replace any sequence of whitespace with a single space
        line = re.sub(r"[ \t]+", " ", line)
        # Strip leading/trailing whitespace from each line
        line = line.strip()
        normalized_lines.append(line)
    text = "\n".join(normalized_lines)

    # Remove excessive blank lines (keep max 1 blank line between paragraphs)
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Remove blank lines at start/end
    return text.strip()
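

if __name__ == "__main__":
    # Minimal smoke test (illustrative only): pass a PDF path on the
    # command line and print the cleaned, TTS-ready text.
    import sys

    with open(sys.argv[1], "rb") as f:
        data = f.read()
    print(f"Pages: {get_page_count(data)}")
    print(extract_text(data))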