"""
Text processing utilities for chunking and token counting.

This module provides utilities for token counting using tiktoken
and text preprocessing.
"""

import re
from typing import List, Optional

try:
    import tiktoken
except ImportError:
    # Only the token-counting helpers need tiktoken. Defer the failure to
    # get_encoder() so the pure-text helpers stay importable without it.
    tiktoken = None

# Global encoder instance (cached for performance)
_encoder = None

# Patterns compiled once at import time; they are reused on every call.
_WHITESPACE_RUN = re.compile(r'\s+')
# Zero-width lookbehind keeps the terminating punctuation attached to the
# sentence that precedes the split point.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def get_encoder():
    """
    Get or create the shared tiktoken encoder.

    The encoder is built on first use and cached in the module-level
    ``_encoder`` variable, so later calls return the same object.

    NOTE: ``cl100k_base`` is an OpenAI encoding. Token counts for other
    model families (e.g. Claude) are therefore approximations.

    Returns:
        tiktoken.Encoding: The cl100k_base encoding

    Raises:
        ImportError: If the tiktoken package is not installed
    """
    global _encoder
    if _encoder is None:
        if tiktoken is None:
            raise ImportError(
                "tiktoken is required for token counting; "
                "install it with 'pip install tiktoken'"
            )
        _encoder = tiktoken.get_encoding("cl100k_base")
    return _encoder


def count_tokens(text: str) -> int:
    """
    Count tokens in text using tiktoken.

    Args:
        text: Input text to count tokens

    Returns:
        int: Number of tokens in the text (0 for empty or None input)

    Raises:
        ImportError: If the tiktoken package is not installed
    """
    if not text:
        return 0

    encoder = get_encoder()
    # By default tiktoken raises ValueError when the text contains a
    # special-token literal such as "<|endoftext|>". Arbitrary document text
    # may contain such strings, so encode them as ordinary text instead.
    return len(encoder.encode(text, disallowed_special=()))


def clean_text(text: str) -> str:
    """
    Clean extracted PDF text.

    Removes null bytes, collapses every run of whitespace (including
    newlines and tabs) into a single space, and strips both ends.

    Args:
        text: Raw text from PDF extraction

    Returns:
        str: Cleaned text ("" for empty or None input)
    """
    if not text:
        return ""

    without_nulls = text.replace('\x00', '')
    collapsed = _WHITESPACE_RUN.sub(' ', without_nulls)
    return collapsed.strip()


def _split_with_nltk(text: str) -> Optional[List[str]]:
    """Return nltk's sentence split, or None if nltk cannot be used."""
    try:
        import nltk
    except ImportError:
        return None

    try:
        return nltk.sent_tokenize(text)
    except LookupError:
        # Punkt data is not available locally; try to fetch it below.
        pass
    except ImportError:
        return None

    try:
        nltk.download('punkt_tab', quiet=True)
        return nltk.sent_tokenize(text)
    except Exception:
        # Best effort: the download or the retry failed (e.g. no network).
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        return None


def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using simple heuristics.

    Attempts to use nltk if available (downloading the punkt_tab data on
    demand), otherwise falls back to regex-based splitting on '.', '!' or
    '?' followed by whitespace.

    Args:
        text: Input text to split

    Returns:
        List[str]: List of sentences ([] for empty or None input)
    """
    if not text:
        return []

    nltk_sentences = _split_with_nltk(text)
    if nltk_sentences is not None:
        return nltk_sentences

    # Fallback: simple regex-based splitting, dropping empty pieces.
    pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [piece for piece in pieces if piece]


def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """
    Truncate text to a maximum length.

    The result never exceeds ``max_length`` characters. When the suffix
    itself does not fit within ``max_length``, the text is hard-cut and no
    suffix is appended.

    Args:
        text: Text to truncate
        max_length: Maximum length in characters
        suffix: Suffix to add if text is truncated

    Returns:
        str: Truncated text ("" for empty or None input)
    """
    if not text:
        return ""

    if len(text) <= max_length:
        return text

    if max_length <= 0:
        return ""

    keep = max_length - len(suffix)
    if keep < 0:
        # A negative slice bound would count from the end of the string and
        # yield a result longer than max_length.
        return text[:max_length]

    return text[:keep] + suffix


def estimate_pages_from_text(text: str, chars_per_page: int = 2000) -> int:
    """
    Estimate number of pages from text length.

    The estimate rounds down, but any non-empty text counts as at least
    one page.

    Args:
        text: Input text
        chars_per_page: Average characters per page (default: 2000)

    Returns:
        int: Estimated number of pages (0 for empty or None input)

    Raises:
        ValueError: If chars_per_page is not positive
    """
    if not text:
        return 0

    if chars_per_page <= 0:
        raise ValueError("chars_per_page must be a positive integer")

    return max(1, len(text) // chars_per_page)