Spaces:
Configuration error
Configuration error
| """ | |
| Text Cleaning Module | |
| ===================== | |
| Pure functions for text preprocessing toggles. | |
| Each function operates on a single string; clean_text() composes the | |
| enabled steps, and apply_text_cleaning() runs it over DataFrame columns. | |
| """ | |
| import re | |
| import unicodedata | |
| from dataclasses import dataclass | |
| from typing import List | |
| import pandas as pd | |
@dataclass
class TextCleaningConfig:
    """Configuration for text cleaning options.

    Each field is a boolean toggle read by ``clean_text``. The class is a
    dataclass so that individual toggles can be overridden by keyword at
    construction time, e.g. ``TextCleaningConfig(remove_html=True)``.
    """

    # Strip HTML tags.
    remove_html: bool = False
    # Remove http/https/ftp/www URLs.
    remove_urls: bool = False
    # Remove emoji characters.
    remove_emojis: bool = False
    # Collapse runs of non-newline whitespace and trim the ends (on by default).
    normalize_whitespace: bool = True
    # Convert the text to lowercase.
    lowercase: bool = False
    # Drop characters outside the allowed alphanumeric/punctuation set.
    remove_special_chars: bool = False
    # Reduce three or more consecutive newlines to two (on by default).
    strip_extra_linebreaks: bool = True
| # --------------------------------------------------------------------------- | |
| # Individual cleaning functions | |
| # --------------------------------------------------------------------------- | |
def remove_html_tags(text: str) -> str:
    """Return *text* with every ``<...>`` span deleted.

    A span is a ``<``, one or more characters other than ``>``, then ``>``.
    Text between tags is kept as-is.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)
# Compiled once at import time. IGNORECASE is required because URL schemes
# and host names are case-insensitive ("HTTP://...", "WWW.Example.com").
_URL_PATTERN = re.compile(
    r'https?://\S+|ftp://\S+|www\.\S+',
    flags=re.IGNORECASE,
)


def remove_urls(text: str) -> str:
    """Remove URLs (http, https, ftp, www) from text.

    A URL is matched from its scheme (or a leading ``www.``) up to the next
    whitespace character, regardless of letter case. Surrounding whitespace
    is left untouched.
    """
    return _URL_PATTERN.sub('', text)
# Character class of emoji code points.
#
# The ranges are deliberately narrow. A single wide range such as
# U+24C2-U+1F251 would also cover CJK ideographs, kana and Hangul, and
# therefore delete ordinary Chinese/Japanese/Korean text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U0001FA00-\U0001FAFF"  # chess symbols, symbols & pictographs extended-A
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "\U0001F004\U0001F0CF"  # mahjong red dragon, playing-card joker
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U000024C2"  # circled M
    "\U000025AA-\U000025AB"  # small squares
    "\U000025B6\U000025C0"  # play / reverse triangles
    "\U000025FB-\U000025FE"  # medium squares
    "\U00002934-\U00002935"  # curved arrows
    "\U00002B05-\U00002B07"  # left/up/down arrows
    "\U00002B1B-\U00002B1C"  # large squares
    "\U00002B50\U00002B55"  # star, heavy circle
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, alternation mark, circled ideographs
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text: str) -> str:
    """Remove emoji characters from text.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted.
    Letters of any script, digits and punctuation are preserved.
    """
    return _EMOJI_PATTERN.sub('', text)
def normalize_whitespace(text: str) -> str:
    """Squeeze horizontal whitespace and trim the ends.

    Each run of whitespace characters other than ``\\n`` is replaced by a
    single space, so newlines inside the text are preserved. Leading and
    trailing whitespace (newlines included) is then stripped.
    """
    collapsed = re.sub(r'[^\S\n]+', ' ', text)
    return collapsed.strip()
def to_lowercase(text: str) -> str:
    """Return a lowercased copy of *text*."""
    lowered = text.lower()
    return lowered
def remove_special_characters(text: str) -> str:
    """Drop every character outside a fixed allow-list.

    Kept: ASCII letters and digits, whitespace, and the punctuation
    ``. , ! ? ; : ' " ( ) -``. Everything else is removed, including
    accented and other non-ASCII letters.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s.,!?;:\'"()\-\n]')
    return disallowed.sub('', text)
def strip_extra_linebreaks(text: str) -> str:
    """Cap runs of consecutive newlines at two.

    Any sequence of three or more ``\\n`` characters is replaced by exactly
    two; shorter sequences are left unchanged.
    """
    newline_run = re.compile(r'\n{3,}')
    return newline_run.sub('\n\n', text)
| # --------------------------------------------------------------------------- | |
| # Composed cleaner | |
| # --------------------------------------------------------------------------- | |
def clean_text(text: str, config: TextCleaningConfig) -> str:
    """Apply all enabled cleaning steps to a single text string.

    Args:
        text: The value to clean. Non-string input is tolerated because
            DataFrame cells may hold missing markers or numbers.
        config: Toggles selecting which cleaning steps run.

    Returns:
        The cleaned string. Missing values (``None``, NaN, ``pd.NA``, ...)
        yield ``''``; any other non-string value is returned as ``str(value)``
        without further cleaning.
    """
    if not isinstance(text, str):
        # Detect missing values explicitly instead of relying on truthiness:
        # NaN is truthy (it used to become the literal 'nan'), bool(pd.NA)
        # raises TypeError, and falsy-but-real values such as 0 must not be
        # dropped.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        return str(text)

    # Step order matters: markup and URLs are removed first so that the later
    # whitespace and line-break passes also clean up the gaps they leave.
    if config.remove_html:
        text = remove_html_tags(text)
    if config.remove_urls:
        text = remove_urls(text)
    if config.remove_emojis:
        text = remove_emojis(text)
    if config.remove_special_chars:
        text = remove_special_characters(text)
    if config.lowercase:
        text = to_lowercase(text)
    if config.normalize_whitespace:
        text = normalize_whitespace(text)
    if config.strip_extra_linebreaks:
        text = strip_extra_linebreaks(text)
    return text
def apply_text_cleaning(
    df: pd.DataFrame,
    columns: List[str],
    config: TextCleaningConfig,
) -> pd.DataFrame:
    """Return a copy of *df* with ``clean_text`` applied to selected columns.

    The input frame is not modified. Names in *columns* that are not present
    in the frame are skipped.
    """
    cleaned = df.copy()

    def _clean_cell(value):
        # Bind the shared config so the helper fits Series.apply's signature.
        return clean_text(value, config)

    for name in columns:
        if name not in cleaned.columns:
            continue
        cleaned[name] = cleaned[name].apply(_clean_cell)
    return cleaned