"""
Text Cleaning Module
=====================
Pure functions for text preprocessing toggles.
Each function operates on a single string and can be composed
via apply_text_cleaning().
"""

import re
import unicodedata
from dataclasses import dataclass
from typing import List

import pandas as pd


@dataclass
class TextCleaningConfig:
    """Configuration for text cleaning options.

    Each flag enables one step of ``clean_text``; steps whose flag is
    ``False`` are skipped.
    """

    remove_html: bool = False
    remove_urls: bool = False
    remove_emojis: bool = False
    normalize_whitespace: bool = True
    lowercase: bool = False
    remove_special_chars: bool = False
    strip_extra_linebreaks: bool = True


# ---------------------------------------------------------------------------
# Individual cleaning functions
# ---------------------------------------------------------------------------

# Patterns are compiled once at import time so per-row calls only pay for
# the substitution itself.
_HTML_TAG_PATTERN = re.compile(r'<[^>]+>')
_URL_PATTERN = re.compile(r'https?://\S+|ftp://\S+|www\.\S+')
_INLINE_WHITESPACE_PATTERN = re.compile(r'[^\S\n]+')
_SPECIAL_CHARS_PATTERN = re.compile(r'[^a-zA-Z0-9\s.,!?;:\'"()\-\n]')
_EXTRA_LINEBREAKS_PATTERN = re.compile(r'\n{3,}')

# NOTE: every range below is listed explicitly. A single catch-all range such
# as U+24C2..U+1F251 must not be used here: it spans almost the whole Basic
# Multilingual Plane above U+24C2 and therefore deletes CJK ideographs, kana,
# Hangul and other ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F02F"  # mahjong tiles
    "\U0001F0A0-\U0001F0FF"  # playing cards
    "\U0001F100-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_html_tags(text: str) -> str:
    """Strip all HTML tags from text.

    Anything between a ``<`` and the next ``>`` is removed; the text
    between tags is kept.
    """
    return _HTML_TAG_PATTERN.sub('', text)


def remove_urls(text: str) -> str:
    """Remove URLs (http, https, ftp, www) from text.

    A URL is matched from its prefix up to the next whitespace character.
    """
    return _URL_PATTERN.sub('', text)


def remove_emojis(text: str) -> str:
    """Remove emoji characters from text.

    Only the code point ranges listed in ``_EMOJI_PATTERN`` are removed;
    letters of any script are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)


def normalize_whitespace(text: str) -> str:
    """Collapse runs of non-newline whitespace into a single space.

    Newlines inside the text are preserved. Leading and trailing
    whitespace (including newlines) is stripped from the result.
    """
    return _INLINE_WHITESPACE_PATTERN.sub(' ', text).strip()


def to_lowercase(text: str) -> str:
    """Convert text to lowercase."""
    return text.lower()


def remove_special_characters(text: str) -> str:
    """Keep only ASCII alphanumerics, basic punctuation, and whitespace.

    Every other character (including non-ASCII letters) is deleted.
    """
    return _SPECIAL_CHARS_PATTERN.sub('', text)


def strip_extra_linebreaks(text: str) -> str:
    """Reduce three or more consecutive newlines to two."""
    return _EXTRA_LINEBREAKS_PATTERN.sub('\n\n', text)


# ---------------------------------------------------------------------------
# Composed cleaner
# ---------------------------------------------------------------------------

def _is_missing(value) -> bool:
    """Return True for None and scalar missing markers (NaN, pd.NA, NaT)."""
    if value is None:
        return True
    # pd.isna() on a list/array returns an array, so restrict it to scalars.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def clean_text(text: str, config: TextCleaningConfig) -> str:
    """Apply all enabled cleaning steps to a single text string.

    Steps run in a fixed order: HTML tags, URLs, emojis, special
    characters, lowercasing, whitespace normalization, extra linebreaks.

    Args:
        text: The value to clean. Non-string values are accepted because
            DataFrame cells may hold numbers or missing markers.
        config: Flags selecting which steps to run.

    Returns:
        The cleaned string. Missing values (None, NaN, pd.NA, NaT) become
        ``''``; any other non-string value is returned as ``str(value)``
        without further cleaning.
    """
    if not isinstance(text, str):
        # Truthiness is the wrong test here: NaN is truthy (it used to
        # become the literal 'nan'), 0 is falsy (it used to be dropped),
        # and bool(pd.NA) raises TypeError.
        return '' if _is_missing(text) else str(text)

    if config.remove_html:
        text = remove_html_tags(text)
    if config.remove_urls:
        text = remove_urls(text)
    if config.remove_emojis:
        text = remove_emojis(text)
    if config.remove_special_chars:
        text = remove_special_characters(text)
    if config.lowercase:
        text = to_lowercase(text)
    if config.normalize_whitespace:
        text = normalize_whitespace(text)
    if config.strip_extra_linebreaks:
        text = strip_extra_linebreaks(text)

    return text


def apply_text_cleaning(
    df: pd.DataFrame,
    columns: List[str],
    config: TextCleaningConfig,
) -> pd.DataFrame:
    """Apply text cleaning to specified columns of a DataFrame.

    Args:
        df: Input frame; it is copied and never modified in place.
        columns: Names of the columns to clean. Names that are not
            present in ``df`` are skipped silently.
        config: Flags selecting which cleaning steps to run.

    Returns:
        A copy of ``df`` with every listed, existing column passed
        through ``clean_text`` cell by cell.
    """
    df = df.copy()
    for col in columns:
        if col in df.columns:
            df[col] = df[col].apply(lambda t: clean_text(t, config))
    return df