"""PDF text extraction and cleaning for TTS processing.""" from __future__ import annotations import io import re from collections import Counter from dataclasses import dataclass from pdfminer.high_level import extract_pages from pdfminer.layout import ( LAParams, LTAnno, LTChar, LTPage, LTTextBoxHorizontal, LTTextLineHorizontal, ) @dataclass class TextBlock: """A block of text with positional metadata.""" text: str y_ratio: float # 0.0 = bottom, 1.0 = top font_size: float page_num: int x0: float = 0.0 # Left edge position for table detection x1: float = 0.0 # Right edge position for table detection def _is_caption(text: str) -> bool: """Check if text is a figure/table caption. Captions typically start with: - "Figure 1:", "Fig. 2:", "Figure 1." - "Table 1:", "Table 2." - "Exhibit A:", "Chart 1:" - "Source:", "Note:", "Notes:" Args: text: Text to check. Returns: True if text appears to be a caption. """ text = text.strip() if not text: return False # Common caption patterns (case-insensitive start) caption_patterns = [ r"^fig(?:ure)?\.?\s*\d", r"^table\.?\s*\d", r"^exhibit\.?\s*[a-z0-9]", r"^chart\.?\s*\d", r"^graph\.?\s*\d", r"^diagram\.?\s*\d", r"^plate\.?\s*\d", r"^scheme\.?\s*\d", r"^box\.?\s*\d", r"^panel\.?\s*[a-z0-9]", r"^appendix\.?\s*[a-z0-9]", r"^source\s*:", r"^sources\s*:", r"^note\s*:", r"^notes\s*:", r"^data\s*:", r"^\*\s*p\s*[<>=]", # Statistical notes like "* p < 0.05" r"^legend\s*:", ] text_lower = text.lower() for pattern in caption_patterns: if re.match(pattern, text_lower): return True return False def _is_table_like_text(text: str) -> bool: """Check if text looks like table content. Tables often have: - Very short text fragments - Mostly numbers or single words - Lots of whitespace-separated values - Column headers or row labels - Short phrases without sentence structure Args: text: Text to check. Returns: True if the text appears to be table content. 
""" text = text.strip() # Very short fragments are likely table cells if len(text) < 5: return True # Count numbers vs letters digits = sum(1 for c in text if c.isdigit()) letters = sum(1 for c in text if c.isalpha()) # Mostly numbers with few letters (like "123.45" or "2024") if digits > 0 and letters < 3 and digits >= letters: return True # Check for patterns common in tables # Multiple tab-separated or heavily spaced values if "\t" in text or " " in text: parts = re.split(r"\s{2,}|\t", text) if len(parts) >= 3: # Multiple short parts suggests table row short_parts = sum(1 for p in parts if len(p.strip()) < 15) if short_parts >= len(parts) * 0.6: return True # Single words that look like column headers words = text.split() if len(words) == 1 and len(text) < 20: # Common table headers/labels table_keywords = { "total", "sum", "avg", "average", "mean", "count", "min", "max", "date", "time", "year", "month", "day", "name", "id", "no", "no.", "value", "amount", "price", "cost", "qty", "quantity", "unit", "row", "column", "col", "item", "description", "desc", "note", "status", "type", "category", "code", "ref", "reference", } if text.lower() in table_keywords: return True # Short phrases without sentence structure (likely table cells) # Table cells typically: # - Are short (< 50 chars) # - Don't end with sentence-ending punctuation # - Don't start with lowercase (unless very short) # - Have few words (< 8) if len(text) < 50 and len(words) < 8: # Doesn't end like a sentence if not text.rstrip().endswith((".", "!", "?", ":")): # Common table cell patterns text_lower = text.lower() # Technical/status phrases common in tables table_phrases = [ "supported", "not supported", "yes", "no", "n/a", "none", "required", "optional", "enabled", "disabled", "active", "inactive", "read-only", "read only", "write", "read/write", "read-write", "must be", "can be", "should be", "will be", "available", "unavailable", "pending", "completed", "failed", "true", "false", "default", "custom", "manual", "automatic", "identical", "different", "same", "other", ] for phrase in table_phrases: if phrase in text_lower: return True # Looks like a label or header (Title Case or ALL CAPS, short) if len(words) <= 4 and len(text) < 40: # Check if it's Title Case or contains common label patterns if text.istitle() or text.isupper(): return True # Two-three word phrases that look like labels if len(words) in (2, 3) and all(w[0].isupper() for w in words if w): return True return False def _filter_table_blocks(blocks: list[TextBlock]) -> list[TextBlock]: """Filter out blocks that appear to be part of tables. Detects tables by looking for: - Multiple blocks at similar Y positions (table rows) - Blocks with table-like content Args: blocks: List of text blocks. Returns: Filtered list with table content removed. 
""" if not blocks: return blocks # Group blocks by page and approximate Y position (row detection) # Blocks within 1% of page height are considered same row filtered = [] for page_num in set(b.page_num for b in blocks): page_blocks = [b for b in blocks if b.page_num == page_num] # Group by Y position (rounded to detect rows) y_groups: dict[float, list[TextBlock]] = {} for block in page_blocks: y_key = round(block.y_ratio, 2) # Group within ~1% of page if y_key not in y_groups: y_groups[y_key] = [] y_groups[y_key].append(block) for y_key, row_blocks in y_groups.items(): # If many blocks at same Y position, likely a table row if len(row_blocks) >= 3: # Check if most blocks look like table cells table_like = sum(1 for b in row_blocks if _is_table_like_text(b.text)) if table_like >= len(row_blocks) * 0.5: # Skip this entire row - it's a table continue # Filter individual blocks that look like table content for block in row_blocks: if not _is_table_like_text(block.text): filtered.append(block) # Sort by page and position (top to bottom) filtered.sort(key=lambda b: (b.page_num, -b.y_ratio)) return filtered def extract_text_blocks(pdf_bytes: bytes) -> list[TextBlock]: """Extract text blocks from PDF with positional information. Args: pdf_bytes: Raw PDF file content. Returns: List of TextBlock objects with text and metadata. """ blocks: list[TextBlock] = [] pdf_file = io.BytesIO(pdf_bytes) laparams = LAParams( line_margin=0.5, word_margin=0.1, char_margin=2.0, boxes_flow=0.5, ) for page_num, page_layout in enumerate(extract_pages(pdf_file, laparams=laparams), start=1): if not isinstance(page_layout, LTPage): continue page_height = page_layout.height for element in page_layout: if not isinstance(element, LTTextBoxHorizontal): continue # Extract characters with their font sizes # LTChar has font size, LTAnno is whitespace (use size=-1 to always keep) chars_with_sizes: list[tuple[str, float]] = [] for line in element: if isinstance(line, LTTextLineHorizontal): for char in line: if isinstance(char, LTChar): chars_with_sizes.append((char.get_text(), char.size)) elif isinstance(char, LTAnno): # Whitespace/newlines - always keep (use -1 as marker) chars_with_sizes.append((char.get_text(), -1)) if not chars_with_sizes: text = element.get_text().strip() if text: blocks.append( TextBlock( text=text, y_ratio=element.y0 / page_height if page_height > 0 else 0.5, font_size=10.0, page_num=page_num, ) ) continue # Find dominant font size (most common, excluding whitespace markers) font_sizes = [size for _, size in chars_with_sizes if size > 0] if not font_sizes: continue size_counts = Counter(round(s, 1) for s in font_sizes) dominant_size = max(size_counts, key=lambda x: size_counts[x]) # Filter out superscript/subscript characters (< 70% of dominant size) # Keep whitespace (size=-1) and normal-sized characters min_size = dominant_size * 0.7 filtered_text = "".join( char for char, size in chars_with_sizes if size < 0 or size >= min_size ) text = filtered_text.strip() if not text: continue # Calculate Y position as ratio (0=bottom, 1=top) y_ratio = element.y0 / page_height if page_height > 0 else 0.5 avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 10.0 blocks.append( TextBlock( text=text, y_ratio=y_ratio, font_size=avg_font_size, page_num=page_num, ) ) return blocks def get_page_count(pdf_bytes: bytes) -> int: """Get the number of pages in a PDF. Args: pdf_bytes: Raw PDF file content. Returns: Number of pages in the PDF. 
""" pdf_file = io.BytesIO(pdf_bytes) laparams = LAParams() page_count = sum(1 for _ in extract_pages(pdf_file, laparams=laparams)) return page_count def extract_text(pdf_bytes: bytes) -> str: """Extract and clean text from a PDF file. Args: pdf_bytes: Raw PDF file content. Returns: Cleaned text suitable for TTS. """ blocks = extract_text_blocks(pdf_bytes) if not blocks: return "" # Filter out table content first blocks = _filter_table_blocks(blocks) cleaned_blocks = clean_text_blocks(blocks) text = "\n\n".join(block.text for block in cleaned_blocks) # Apply TTS-specific normalization return normalize_for_tts(text) def clean_text_blocks(blocks: list[TextBlock]) -> list[TextBlock]: """Remove headers, footers, page numbers, and other artifacts. Applies multiple heuristics: 1. Remove blocks in top/bottom margins (likely headers/footers) 2. Remove repeated text across pages (likely running headers) 3. Remove standalone page numbers 4. Remove very short lines that look like artifacts Args: blocks: List of TextBlock objects. Returns: Filtered list of TextBlock objects. """ if not blocks: return [] # Find repeated text patterns (headers/footers) text_counts = Counter(block.text for block in blocks) total_pages = max(block.page_num for block in blocks) repeated_threshold = max(2, total_pages // 2) repeated_texts = {text for text, count in text_counts.items() if count >= repeated_threshold} # Calculate median font size for filtering font_sizes = sorted(block.font_size for block in blocks) median_font_size = font_sizes[len(font_sizes) // 2] if font_sizes else 10.0 cleaned: list[TextBlock] = [] for block in blocks: # Skip if in header zone (top 10%) if block.y_ratio > 0.90: continue # Skip if in footer zone (bottom 10%) if block.y_ratio < 0.10: continue # Skip repeated text (running headers/footers) if block.text in repeated_texts: continue # Skip standalone page numbers if is_page_number(block.text): continue # Skip figure/table captions if _is_caption(block.text): continue # Skip very short lines with small font (likely captions/footnotes) if len(block.text) < 20 and block.font_size < median_font_size * 0.8: continue cleaned.append(block) return cleaned def is_page_number(text: str) -> bool: """Check if text is likely a page number. Args: text: Text to check. Returns: True if text appears to be a page number. """ text = text.strip() # Pure number if text.isdigit(): return True # Roman numerals if re.match(r"^[ivxlcdmIVXLCDM]+$", text): return True # "Page N" or "N of M" patterns if re.match(r"^(page\s*)?\d+(\s*(of|/)\s*\d+)?$", text, re.IGNORECASE): return True # "- N -" pattern if re.match(r"^[-–—]\s*\d+\s*[-–—]$", text): return True return False def clean_text(text: str) -> str: """Clean raw text for TTS processing. This is a simpler function for cleaning already-extracted text, without the positional information. Args: text: Raw text to clean. Returns: Cleaned text suitable for TTS. 
""" lines = text.split("\n") cleaned_lines: list[str] = [] for line in lines: line = line.strip() # Skip empty lines if not line: continue # Skip standalone page numbers if is_page_number(line): continue # Skip very short lines (likely artifacts) if len(line) < 3: continue cleaned_lines.append(line) # Rejoin with proper spacing result = "\n".join(cleaned_lines) # === FIX HYPHENATED/SPLIT WORDS === # These are words broken across lines, common in PDFs and web content # Pattern 1: word-\nword (hyphen at end of line) -> rejoin word result = re.sub(r"(\w)-\n\s*(\w)", r"\1\2", result) # Pattern 2: word-\n word (hyphen + newline + spaces) result = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", result) # Pattern 3: word- word (hyphen + space, often from copy-paste) result = re.sub(r"(\w)- (\w)", r"\1\2", result) # Pattern 4: Lines ending with hyphen followed by lowercase (likely continuation) result = re.sub(r"-\n([a-z])", r"\1", result) # === FIX LINE BREAK ARTIFACTS === # Join lines that don't end with sentence-ending punctuation # This handles text that was wrapped at fixed width # Replace single newlines (not paragraph breaks) with spaces # Keep double newlines as paragraph separators result = re.sub(r"(? str: """Normalize text for natural TTS pronunciation. Handles special characters, punctuation, and formatting that can cause TTS models to slow down or mispronounce. Args: text: Text to normalize. Returns: Normalized text optimized for TTS. """ # === REMOVE ACADEMIC/PAPER ARTIFACTS === # Remove inline citations like (Smith et al., 2020) or (Smith, 2020; Jones, 2019) # Also handles (Chen, 2018; Lee et al., 2020) text = re.sub(r"\([^()]*\b\d{4}[a-z]?\b[^()]*\)", "", text) # Remove author-year citations like "Smith (2020)" or "Smith et al. (2020)" text = re.sub( r"\b[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*\(\d{4}[a-z]?\)", "", text ) # Clean up "by [Author]" patterns - remove the author part, keep "by" for grammar # "by Smith" -> "" (will be cleaned up), "study by Smith found" -> "study found" text = re.sub( r"\bby\s+[A-Z][a-z]+(?:\s+(?:et\s+al\.?|and|&)\s+[A-Z][a-z]+)?\s*,?\s*(?=found|showed|demonstrated|reported|observed|noted|suggested|concluded|argued|claimed|stated|proposed|discovered|revealed|indicated|confirmed)", "", text, ) # Remove orphaned "et al." and similar text = re.sub(r"\s+et\s+al\.?,?\s*", " ", text) # Remove figure/table references like "see Figure 1" or "(see Table 2)" text = re.sub( r"\(?see\s+(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph|Appendix)\s*\d+[a-z]?\)?", "", text, flags=re.IGNORECASE, ) # Remove standalone figure/table references like "Figure 1 shows" -> "shows" text = re.sub( r"(?:Figure|Fig\.?|Table|Exhibit|Chart|Graph)\s*\d+[a-z]?\s*(?:shows?|depicts?|illustrates?|presents?|displays?|summarizes?)", "", text, flags=re.IGNORECASE, ) # Remove section references like "Section 2.1" or "Chapter 3" (with surrounding context) text = re.sub( r"(?:in|see|as\s+(?:shown|described|discussed)\s+in|according\s+to)\s+(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*,?\s*", "", text, flags=re.IGNORECASE, ) text = re.sub(r"(?:Section|Chapter|Part)\s*\d+(?:\.\d+)*", "", text, flags=re.IGNORECASE) # Remove equation references like "Equation 1" or "Eq. 
(2)" text = re.sub(r"(?:Equation|Eq\.?)\s*\(?\d+\)?", "", text, flags=re.IGNORECASE) # Remove DOIs text = re.sub(r"(?:doi:|DOI:?)\s*10\.\d{4,}/[^\s]+", "", text, flags=re.IGNORECASE) # Remove arXiv references text = re.sub(r"arXiv:\d{4}\.\d{4,}(?:v\d+)?", "", text, flags=re.IGNORECASE) # Remove ISSN/ISBN numbers text = re.sub(r"(?:ISSN|ISBN)[:\s]*[\d-]+", "", text, flags=re.IGNORECASE) # Remove page ranges like "pp. 123-456" or "p. 42" or "pages 10-20" text = re.sub(r"(?:p{1,2}\.?|pages?)\s*\d+(?:\s*[-–—]\s*\d+)?", "", text, flags=re.IGNORECASE) # Remove volume/issue numbers like "Vol. 12, No. 3" (entire phrase) text = re.sub( r"(?:Vol(?:ume)?\.?\s*\d+,?\s*)?(?:Issue|No\.?)\s*\d+,?\s*", "", text, flags=re.IGNORECASE ) text = re.sub(r"Vol(?:ume)?\.?\s*\d+,?\s*", "", text, flags=re.IGNORECASE) # Remove copyright notices text = re.sub(r"©\s*\d{4}[^.]*\.", "", text) text = re.sub(r"Copyright\s*©?\s*\d{4}[^.]*\.", "", text, flags=re.IGNORECASE) # Remove "All rights reserved" and similar text = re.sub(r"All rights reserved\.?", "", text, flags=re.IGNORECASE) # Remove asterisks used for footnote markers text = re.sub(r"\*{1,3}(?=\s|$)", "", text) # === NORMALIZE NEWLINES FIRST === # Convert various newline formats to standard \n text = text.replace("\r\n", "\n").replace("\r", "\n") # Replace single newlines (mid-sentence line breaks) with spaces # Keep double newlines as paragraph separators # First, normalize multiple newlines to exactly two text = re.sub(r"\n{3,}", "\n\n", text) # Replace single newlines that aren't paragraph breaks with spaces # A single newline not preceded by sentence-ending punctuation is likely a line wrap text = re.sub(r"(?\"')\]]+", "", text) text = re.sub(r"www\.[^\s<>\"')\]]+", "", text) text = re.sub(r"ftp://[^\s<>\"')\]]+", "", text) # UUIDs (with or without dashes) - must come before git hash pattern uuid_pattern = ( r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-" r"[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b" ) text = re.sub(uuid_pattern, "", text) # Git commit hashes (7-40 hex chars standalone) text = re.sub(r"(? 
"get User Name") text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text) # snake_case: replace underscores with spaces text = re.sub(r"(\w)_(\w)", r"\1 \2", text) # Function calls: "func()" -> "func" text = re.sub(r"(\w+)\(\)", r"\1", text) # Arrow functions/operators: -> and => text = text.replace("->", " returns ") text = text.replace("=>", " arrow ") # Common code operators spoken naturally text = text.replace("!=", " not equals ") text = text.replace("==", " equals ") text = text.replace("===", " strictly equals ") text = text.replace("!==", " strictly not equals ") text = text.replace("&&", " and ") text = text.replace("||", " or ") text = text.replace("++", " increment ") text = text.replace("--", " decrement ") # File extensions: ".py" -> " dot py" (only for common extensions) ext_pattern = r"\.(py|js|ts|html|css|json|xml|md|txt|csv|pdf)\b" text = re.sub(ext_pattern, r" dot \1", text, flags=re.IGNORECASE) # Remove standalone hashes/pound signs (not hashtags) text = re.sub(r"(?", " greater than ") text = text.replace("≤", " less than or equal to ") text = text.replace("≥", " greater than or equal to ") text = text.replace("≠", " not equal to ") text = text.replace("±", " plus or minus ") text = text.replace("×", " times ") text = text.replace("÷", " divided by ") # === ABBREVIATIONS AND SPECIAL CASES === # Common abbreviations that might cause issues text = re.sub(r"\be\.g\.", "for example", text, flags=re.IGNORECASE) text = re.sub(r"\bi\.e\.", "that is", text, flags=re.IGNORECASE) text = re.sub(r"\betc\.", "etcetera", text, flags=re.IGNORECASE) text = re.sub(r"\bvs\.", "versus", text, flags=re.IGNORECASE) text = re.sub(r"\bDr\.", "Doctor", text) text = re.sub(r"\bMr\.", "Mister", text) text = re.sub(r"\bMrs\.", "Missus", text) text = re.sub(r"\bMs\.", "Miss", text) text = re.sub(r"\bProf\.", "Professor", text) text = re.sub(r"\bSt\.", "Saint", text) text = re.sub(r"\bNo\.\s*(\d)", r"Number \1", text) text = re.sub(r"\bFig\.", "Figure", text, flags=re.IGNORECASE) text = re.sub(r"\bVol\.", "Volume", text, flags=re.IGNORECASE) text = re.sub(r"\bpp\.", "pages", text, flags=re.IGNORECASE) text = re.sub(r"\bp\.\s*(\d)", r"page \1", text, flags=re.IGNORECASE) # === BRACKETS AND PARENTHESES === # Remove or simplify brackets that might cause pauses text = re.sub(r"\[([^\]]+)\]", r"(\1)", text) # Square to round text = re.sub(r"\{([^}]+)\}", r"(\1)", text) # Curly to round # Remove citation numbers like [1], [2,3], [1-5] text = re.sub(r"\[\d+(?:[-,]\d+)*\]", "", text) text = re.sub(r"\(\d+(?:[-,]\d+)*\)", "", text) # === CLEANUP === # Remove standalone special characters text = re.sub(r"\s+[#@*^~`|\\]+\s+", " ", text) # Remove content in angle brackets (often HTML/XML artifacts) text = re.sub(r"<[^>]+>", "", text) # Remove spaces before punctuation text = re.sub(r"\s+([.,;:!?])", r"\1", text) # Ensure space after punctuation (but not before another punctuation) text = re.sub(r"([.,;:!?])([^\s.,;:!?'\"])", r"\1 \2", text) # === FINAL WHITESPACE NORMALIZATION === # This must happen LAST after all substitutions that can create gaps # Collapse all whitespace (spaces, tabs, multiple spaces) to single space # Do this per-line to preserve intentional paragraph breaks lines = text.split("\n") normalized_lines = [] for line in lines: # Replace any sequence of whitespace with single space line = re.sub(r"[ \t]+", " ", line) # Strip leading/trailing whitespace from each line line = line.strip() normalized_lines.append(line) text = "\n".join(normalized_lines) # Remove excessive blank lines (keep max 1 blank 

    # Remove excessive blank lines (keep max 1 blank line between paragraphs)
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Remove blank lines at start/end
    text = text.strip()

    return text
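

if __name__ == "__main__":
    # Minimal command-line sketch (an assumption, not part of the module's
    # public API): read the PDF named by the first argument and print its
    # cleaned, TTS-ready text.
    import sys

    with open(sys.argv[1], "rb") as f:
        data = f.read()
    print(f"{get_page_count(data)} page(s)", file=sys.stderr)
    print(extract_text(data))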