# Auto-FineTune-Ops / preprocessing / text_cleaning.py
# aneeb15: Initial release of Auto-FineTune-Ops (d4398e6)
"""
Text Cleaning Module
=====================
Pure functions for text preprocessing toggles.
Each cleaning function operates on a single string; clean_text() composes
the enabled steps, and apply_text_cleaning() runs that over DataFrame columns.
"""
import re
import unicodedata
from dataclasses import dataclass
from typing import List
import pandas as pd
@dataclass
class TextCleaningConfig:
    """Set of boolean switches selecting which cleaning steps are applied.

    The declaration order below is also the positional order of the
    generated ``__init__``. Only whitespace normalisation and line-break
    stripping are switched on by default.
    """

    # Strip HTML tags.
    remove_html: bool = False
    # Remove URLs.
    remove_urls: bool = False
    # Remove emoji characters.
    remove_emojis: bool = False
    # Collapse runs of spaces/tabs and trim the ends (on by default).
    normalize_whitespace: bool = True
    # Convert the text to lowercase.
    lowercase: bool = False
    # Drop characters outside the allowed alphanumeric/punctuation set.
    remove_special_chars: bool = False
    # Reduce three or more consecutive newlines to two (on by default).
    strip_extra_linebreaks: bool = True
# ---------------------------------------------------------------------------
# Individual cleaning functions
# ---------------------------------------------------------------------------
_HTML_TAG_PATTERN = re.compile(
    r'<!--.*?-->'               # whole comments, even when they contain '>'
    r'|<[!/?]?[A-Za-z][^>]*>',  # open/close tags, <!DOCTYPE ...>, <?xml ...?>
    flags=re.DOTALL,
)


def remove_html_tags(text: str) -> str:
    """Strip HTML tags and comments from text, keeping the text between them.

    A span is only treated as a tag when ``<`` (optionally followed by
    ``/``, ``!`` or ``?``) is immediately followed by an ASCII letter, so
    ordinary comparisons such as ``1 < 2 and 3 > 2`` are left untouched.
    Comments are removed as a unit, including any ``>`` inside them.
    HTML entities (``&amp;`` etc.) are not decoded.

    Args:
        text: The string to clean.

    Returns:
        The input with tag and comment markup removed.
    """
    return _HTML_TAG_PATTERN.sub('', text)
_URL_PATTERN = re.compile(
    # The leading \b keeps "www." from matching in the middle of a word
    # (e.g. "awww.cute"); IGNORECASE also catches "HTTP://" and "WWW.".
    r'\b(?:https?://|ftp://|www\.)\S+',
    flags=re.IGNORECASE,
)


def remove_urls(text: str) -> str:
    """Remove URLs (http, https, ftp, www) from text.

    A URL starts at a word boundary with ``http://``, ``https://``,
    ``ftp://`` or ``www.`` (in any letter case) and extends to the next
    whitespace character. Whitespace around the removed URL is left as is.

    Args:
        text: The string to clean.

    Returns:
        The input with every matched URL deleted.
    """
    return _URL_PATTERN.sub('', text)
# Character class of emoji / pictographic symbol blocks.
#
# NOTE: a single wide range such as U+24C2-U+1F251 must not be used here: it
# spans most of the Basic Multilingual Plane and would delete CJK, Hangul,
# Kana and full-width text along with the emoji. Only specific symbol blocks
# are listed.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F251"  # enclosed alphanumerics/ideographs, flag letters
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text: str) -> str:
    """Remove emoji characters from text.

    Characters are removed when they fall in one of the symbol blocks
    listed in ``_EMOJI_PATTERN``. Letters of any script (including CJK,
    Hangul and full-width forms) are preserved.

    Args:
        text: The string to clean.

    Returns:
        The input with all matched emoji/symbol characters deleted.
    """
    return _EMOJI_PATTERN.sub('', text)
def normalize_whitespace(text: str) -> str:
    """Collapse runs of non-newline whitespace and trim both ends.

    Every run of whitespace characters other than ``\\n`` becomes a single
    space; newlines inside the text are kept. The result is then stripped,
    which removes leading and trailing whitespace of any kind, newlines
    included.
    """
    collapsed = re.sub(r'[^\S\n]+', ' ', text)
    return collapsed.strip()
def to_lowercase(text: str) -> str:
    """Return *text* with all cased characters lowercased via ``lower()``."""
    lowered = text.lower()
    return lowered
def remove_special_characters(text: str) -> str:
    """Delete every character outside a fixed allow-list.

    Kept: ASCII letters and digits, whitespace, and the punctuation
    period, comma, exclamation mark, question mark, semicolon, colon,
    single and double quotes, parentheses and hyphen. Everything else,
    including underscores and non-ASCII letters, is removed.
    """
    disallowed = r'[^a-zA-Z0-9\s.,!?;:\'"()\-\n]'
    return re.sub(disallowed, '', text)
def strip_extra_linebreaks(text: str) -> str:
    """Cap consecutive newlines at two.

    Any run of three or more ``\\n`` characters is replaced by exactly two;
    shorter runs are left unchanged.
    """
    paragraph_break = '\n\n'
    return re.sub(r'\n{3,}', paragraph_break, text)
# ---------------------------------------------------------------------------
# Composed cleaner
# ---------------------------------------------------------------------------
def clean_text(text: str, config: "TextCleaningConfig") -> str:
    """Apply all enabled cleaning steps to a single text value.

    Steps run in a fixed order: HTML tags, URLs, emojis, special
    characters, lowercasing, whitespace normalisation, and finally
    extra line-break stripping.

    Args:
        text: The value to clean. Non-string values are accepted because
            DataFrame cells may hold numbers or missing-value markers.
        config: Toggles selecting which steps are applied.

    Returns:
        The cleaned string. For non-string input: ``''`` when the value is
        a missing scalar (``None``, NaN, ``pd.NA``, ``NaT``), otherwise
        ``str(text)`` returned as is, without running the cleaning steps.
    """
    if not isinstance(text, str):
        # Only genuinely missing cells become empty. A truthiness test would
        # turn NaN into the literal 'nan', raise on pd.NA, and erase valid
        # falsy values such as 0. is_scalar() guards pd.isna() against
        # list/array cells, for which it would return an array.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        return str(text)

    if config.remove_html:
        text = remove_html_tags(text)
    if config.remove_urls:
        text = remove_urls(text)
    if config.remove_emojis:
        text = remove_emojis(text)
    if config.remove_special_chars:
        text = remove_special_characters(text)
    if config.lowercase:
        text = to_lowercase(text)
    if config.normalize_whitespace:
        text = normalize_whitespace(text)
    if config.strip_extra_linebreaks:
        text = strip_extra_linebreaks(text)
    return text
def apply_text_cleaning(
    df: pd.DataFrame,
    columns: List[str],
    config: TextCleaningConfig,
) -> pd.DataFrame:
    """Return a copy of *df* with ``clean_text`` applied to the given columns.

    The input frame is not modified. Names in *columns* that are not
    present in the frame are skipped silently.

    Args:
        df: Source DataFrame.
        columns: Names of the columns whose cells should be cleaned.
        config: Cleaning toggles forwarded to ``clean_text`` for every cell.

    Returns:
        A new DataFrame with the selected columns cleaned element-wise.
    """
    cleaned = df.copy()

    def _clean_cell(value):
        # Bind the shared config so Series.apply can call this per element.
        return clean_text(value, config)

    for name in columns:
        if name not in cleaned.columns:
            continue
        cleaned[name] = cleaned[name].apply(_clean_cell)
    return cleaned