"""Text processing utilities for Francis Botcon project.""" import re from pathlib import Path from typing import List, Tuple from src.logger import get_logger logger = get_logger(__name__) class TextCleaner: """Clean and preprocess texts from Project Gutenberg.""" # Project Gutenberg header/footer patterns PG_HEADER_PATTERN = r"\*\*\*.*?START.*?PROJECT GUTENBERG.*?\*\*\*" PG_FOOTER_PATTERN = r"\*\*\*.*?END.*?PROJECT GUTENBERG.*?\*\*\*" @staticmethod def remove_pg_metadata(text: str) -> str: """Remove Project Gutenberg header and footer. Args: text: Raw text from Project Gutenberg Returns: Cleaned text """ # Remove header text = re.sub( TextCleaner.PG_HEADER_PATTERN, "", text, flags=re.DOTALL | re.IGNORECASE ) # Remove footer text = re.sub( TextCleaner.PG_FOOTER_PATTERN, "", text, flags=re.DOTALL | re.IGNORECASE ) return text @staticmethod def normalize_whitespace(text: str) -> str: """Normalize whitespace in text. Args: text: Input text Returns: Text with normalized whitespace """ # Remove multiple spaces text = re.sub(r' +', ' ', text) # Remove multiple newlines text = re.sub(r'\n\n+', '\n\n', text) # Strip leading/trailing whitespace text = text.strip() return text @staticmethod def clean_text(text: str) -> str: """Apply all cleaning operations. Args: text: Raw text Returns: Cleaned text """ text = TextCleaner.remove_pg_metadata(text) text = TextCleaner.normalize_whitespace(text) return text class TextSegmenter: """Segment text into meaningful chunks.""" @staticmethod def segment_by_paragraphs(text: str, min_length: int = 100) -> List[str]: """Segment text into paragraphs. Args: text: Input text min_length: Minimum paragraph length in characters Returns: List of paragraph segments """ paragraphs = text.split('\n\n') # Filter out very short paragraphs paragraphs = [p.strip() for p in paragraphs if len(p.strip()) >= min_length] return paragraphs @staticmethod def segment_by_length(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]: """Segment text into fixed-size chunks with overlap. Args: text: Input text chunk_size: Size of each chunk in characters overlap: Overlap between chunks Returns: List of text chunks """ chunks = [] words = text.split() current_chunk = [] current_size = 0 for word in words: current_chunk.append(word) current_size += len(word) + 1 # +1 for space if current_size >= chunk_size: chunks.append(' '.join(current_chunk)) # Create overlap current_chunk = current_chunk[-(overlap // 5):] # Approximate overlap current_size = sum(len(w) for w in current_chunk) # Add remaining chunk if current_chunk: chunks.append(' '.join(current_chunk)) return chunks @staticmethod def extract_title_and_author(text: str) -> Tuple[str, str]: """Extract title and author from text. Args: text: Input text Returns: Tuple of (title, author) """ lines = text.split('\n') title = "Unknown" author = "Francis Bacon" for i, line in enumerate(lines[:50]): # Check first 50 lines if 'by' in line.lower() and 'bacon' in line.lower(): author = line.strip() if i > 0: title = lines[i - 1].strip() break return title, author def process_raw_file(file_path: Path) -> Tuple[str, str]: """Process a raw Project Gutenberg file. Args: file_path: Path to raw text file Returns: Tuple of (cleaned_text, filename) """ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: text = f.read() cleaned_text = TextCleaner.clean_text(text) return cleaned_text, file_path.stem