import re
from typing import List, Tuple
import logging

logger = logging.getLogger(__name__)


class TextPreprocessor:
    """
    A utility class for preprocessing text before embedding.
    Includes cleaning, normalization, and chunking methods.
    """

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean text by removing extra whitespace and unusual characters.

        Args:
            text: Raw input text.

        Returns:
            Text containing only word characters, whitespace, and basic
            punctuation, with all whitespace runs collapsed to single spaces.
        """
        # Collapse all whitespace (newlines, tabs, repeated spaces) to one space
        text = re.sub(r'\s+', ' ', text)
        # Replace anything that is not alphanumeric or basic punctuation with a space
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)
        # Collapse again: the previous substitution may have introduced new runs
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    @staticmethod
    def split_by_sentences(text: str) -> List[str]:
        """Split text into sentences on '.', '!' and '?' boundaries.

        Args:
            text: Input text.

        Returns:
            List of non-empty, stripped sentence fragments (terminators removed).
        """
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def split_by_paragraphs(text: str) -> List[str]:
        """Split text into paragraphs on blank-line ('\\n\\n') boundaries.

        Args:
            text: Input text.

        Returns:
            List of non-empty, stripped paragraphs.
        """
        paragraphs = text.split('\n\n')
        return [p.strip() for p in paragraphs if p.strip()]

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
        """
        Split text into overlapping chunks of at most ``chunk_size`` characters.

        Each window prefers to break at a sentence boundary ('.') falling in
        the second half of the window, then at a space, then hard-cuts.

        Args:
            text: The input text to chunk.
            chunk_size: Maximum size of each chunk (in characters).
            overlap: Number of characters to overlap between chunks.

        Returns:
            List of non-empty text chunks.
        """
        text_len = len(text)
        if text_len <= chunk_size:
            return [text]

        chunks: List[str] = []
        start = 0
        while start < text_len:
            end = min(start + chunk_size, text_len)

            if end < text_len:
                # Prefer a sentence boundary in the second half of the window
                sentence_end = text.rfind('.', start, end)
                if sentence_end != -1 and sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1
                else:
                    # Fall back to a word boundary in the second half
                    space_end = text.rfind(' ', start, end)
                    if space_end != -1 and space_end > start + chunk_size // 2:
                        end = space_end

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            if end >= text_len:
                # Tail of the text has been emitted; nothing left to chunk.
                break
            # Step forward with overlap. max() guarantees forward progress:
            # the previous `start = end - overlap` could move BACKWARDS (and
            # loop forever) when a boundary adjustment pulled `end` close to
            # `start` and `overlap` was large relative to the window.
            start = max(end - overlap, start + 1)

        return chunks

    @staticmethod
    def extract_key_info(text: str) -> dict:
        """
        Extract key information from text such as headers, titles, etc.
        This is a simple implementation that looks for common patterns.

        Args:
            text: Input text.

        Returns:
            Dict with any of the keys 'potential_title', 'emails', 'urls';
            keys are omitted when nothing matched.
        """
        info: dict = {}

        # Look for potential titles (short lines that are all-caps or title case)
        lines = text.split('\n')
        potential_titles = [
            line.strip()
            for line in lines[:10]  # Check first 10 lines
            # Parentheses are required: `and` binds tighter than `or`, so
            # without them any title-cased line of ANY length would pass,
            # defeating the 10-100 character length filter.
            if 10 < len(line.strip()) < 100
            and (line.strip().isupper() or line.strip().istitle())
        ]
        if potential_titles:
            info['potential_title'] = potential_titles[0]

        # Extract any email addresses
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
        if emails:
            info['emails'] = emails[:5]  # Limit to first 5 emails

        # Extract any URLs
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        if urls:
            info['urls'] = urls[:5]  # Limit to first 5 URLs

        return info