Spaces:
Configuration error
Configuration error
File size: 3,641 Bytes
d4398e6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | """
Text Cleaning Module
=====================
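Pure functions for text preprocessing toggles.
Each cleaning function operates on a single string. clean_text()
composes the enabled steps according to a TextCleaningConfig, and
apply_text_cleaning() maps clean_text() over DataFrame columns.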
"""
import re
import unicodedata
from dataclasses import dataclass
from typing import List
import pandas as pd
@dataclass
class TextCleaningConfig:
    """Switches selecting which text-cleaning steps are applied.

    Every field is a boolean toggle; a step runs only when its flag is
    True. Whitespace normalisation and line-break stripping are on by
    default, every other step is off by default.
    """

    # Content-removal steps (all disabled by default).
    remove_html: bool = False
    remove_urls: bool = False
    remove_emojis: bool = False

    # Collapse runs of spaces/tabs (enabled by default).
    normalize_whitespace: bool = True

    # Character-level transforms (disabled by default).
    lowercase: bool = False
    remove_special_chars: bool = False

    # Cap consecutive newlines (enabled by default).
    strip_extra_linebreaks: bool = True
# ---------------------------------------------------------------------------
# Individual cleaning functions
# ---------------------------------------------------------------------------
def remove_html_tags(text: str) -> str:
    """Strip HTML/XML markup tags from text.

    A ``<`` is treated as the start of a tag only when it is immediately
    followed by a letter, ``/``, ``!`` or ``?`` (start tag, end tag,
    comment/doctype, processing instruction). Any other ``<`` is ordinary
    text, so comparisons such as ``1 < 2 and 3 > 2`` or ``<3`` survive
    instead of being deleted together with everything up to the next ``>``.

    Args:
        text: Input string, possibly containing markup.

    Returns:
        The input with tags removed. Text between tags is kept as is and
        character references (e.g. ``&amp;``) are not decoded.
    """
    # Require a tag-opening character right after '<' so that stray
    # less-than / greater-than signs in prose are not swallowed.
    return re.sub(r'<[A-Za-z/!?][^>]*>', '', text)
def remove_urls(text: str) -> str:
    """Delete URL-like tokens from text.

    A token is removed when it starts with ``http://``, ``https://``,
    ``ftp://`` or ``www.``; the match runs up to the next whitespace
    character, so punctuation glued to the URL is removed with it.
    Matching is case-sensitive.
    """
    url_pattern = r'https?://\S+|ftp://\S+|www\.\S+'
    without_urls = re.sub(url_pattern, '', text)
    return without_urls
# Character class of emoji / pictographic code points.
#
# Only targeted ranges are listed. A single wide range from U+24C2 up to
# U+1F251 would also cover CJK ideographs, Hangul, kana and fullwidth
# forms, and would therefore delete ordinary East Asian text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F02F"  # mahjong tiles
    "\U0001F0A0-\U0001F0FF"  # playing cards
    "\U0001F100-\U0001F1FF"  # enclosed alphanumerics incl. flag letters
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55"  # arrows, squares, star, circle
    "\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE"  # geometric-shape emoji
    "\u2934\u2935"           # curved arrows
    "\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark, circled ideographs
    "\u24C2"                 # circled latin capital M
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text: str) -> str:
    """Remove emoji characters from text.

    Every run of code points matched by ``_EMOJI_PATTERN`` is deleted.
    Letters of any script (Latin, CJK, Hangul, kana, ...) are left intact.

    Args:
        text: Input string.

    Returns:
        The input with emoji code points removed; surrounding whitespace
        is not adjusted.
    """
    return _EMOJI_PATTERN.sub('', text)
def normalize_whitespace(text: str) -> str:
    """Squeeze horizontal whitespace and trim the ends.

    Each run of whitespace characters other than a newline is replaced by
    one space; newlines inside the text are kept. Leading and trailing
    whitespace of the whole string (newlines included) is then removed.
    """
    collapsed = re.sub(r'[^\S\n]+', ' ', text)
    return collapsed.strip()
def to_lowercase(text: str) -> str:
    """Return a lowercased copy of text."""
    lowered = text.lower()
    return lowered
def remove_special_characters(text: str) -> str:
    """Drop every character outside a small allow-list.

    Kept: ASCII letters and digits, whitespace, and the punctuation
    characters period, comma, exclamation mark, question mark, semicolon,
    colon, single and double quote, parentheses and hyphen. Everything
    else is removed, including underscores and non-ASCII letters.
    """
    disallowed = r'[^a-zA-Z0-9\s.,!?;:\'"()\-\n]'
    filtered = re.sub(disallowed, '', text)
    return filtered
def strip_extra_linebreaks(text: str) -> str:
    """Cap runs of consecutive newlines at two.

    Any sequence of three or more newline characters is replaced by
    exactly two; shorter runs are left unchanged. Only directly adjacent
    newlines count as a run.
    """
    blank_line = '\n\n'
    return re.sub(r'\n{3,}', blank_line, text)
# ---------------------------------------------------------------------------
# Composed cleaner
# ---------------------------------------------------------------------------
def clean_text(text: str, config: TextCleaningConfig) -> str:
    """Apply all enabled cleaning steps to a single value.

    Steps run in a fixed order: HTML tags, URLs, emojis, special
    characters, lowercasing, whitespace normalisation, line-break
    stripping. Each step runs only if its flag on ``config`` is true.

    Args:
        text: The value to clean. Normally a string; non-string cell
            values are tolerated (see Returns).
        config: Flags selecting which steps to apply.

    Returns:
        The cleaned string. For non-string input no cleaning is applied:
        missing markers (``None``, NaN, ``pd.NA``, ``NaT``) yield ``''``
        and any other value is returned as ``str(text)``.
    """
    if not isinstance(text, str):
        # Decide on "missing", not on truthiness: NaN is truthy and would
        # become the literal 'nan', while real values such as 0 are falsy
        # and would be erased. is_scalar() guards pd.isna() against
        # array-like values, for which it returns an array.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        return str(text)

    if config.remove_html:
        text = remove_html_tags(text)
    if config.remove_urls:
        text = remove_urls(text)
    if config.remove_emojis:
        text = remove_emojis(text)
    if config.remove_special_chars:
        text = remove_special_characters(text)
    if config.lowercase:
        text = to_lowercase(text)
    if config.normalize_whitespace:
        text = normalize_whitespace(text)
    if config.strip_extra_linebreaks:
        text = strip_extra_linebreaks(text)
    return text
def apply_text_cleaning(
    df: pd.DataFrame,
    columns: List[str],
    config: TextCleaningConfig,
) -> pd.DataFrame:
    """Return a copy of df with clean_text applied to the given columns.

    The input frame is not modified. Names in ``columns`` that are not
    columns of the frame are skipped silently; every cell of each
    matching column is passed through ``clean_text`` with ``config``.
    """
    cleaned = df.copy()

    def _clean_cell(value):
        return clean_text(value, config)

    for name in columns:
        if name not in cleaned.columns:
            continue
        cleaned[name] = cleaned[name].apply(_clean_cell)
    return cleaned
|