File size: 3,641 Bytes
d4398e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""

Text Cleaning Module

=====================

Pure functions for text preprocessing toggles.

Each function operates on a single string and can be

composed via apply_text_cleaning().

"""

import re
import unicodedata
from dataclasses import dataclass
from typing import List
import pandas as pd


@dataclass
class TextCleaningConfig:
    """Configuration for text cleaning options.

    Every field is a boolean toggle consulted by ``clean_text``; a cleaning
    step runs only when its flag is true. Whitespace normalization and
    line-break stripping default to on, all other steps default to off.
    """
    # Run remove_html_tags() on the text.
    remove_html: bool = False
    # Run remove_urls() on the text.
    remove_urls: bool = False
    # Run remove_emojis() on the text.
    remove_emojis: bool = False
    # Run normalize_whitespace() on the text (enabled by default).
    normalize_whitespace: bool = True
    # Run to_lowercase() on the text.
    lowercase: bool = False
    # Run remove_special_characters() on the text.
    remove_special_chars: bool = False
    # Run strip_extra_linebreaks() on the text (enabled by default).
    strip_extra_linebreaks: bool = True


# ---------------------------------------------------------------------------
# Individual cleaning functions
# ---------------------------------------------------------------------------

def remove_html_tags(text: str) -> str:
    """Delete every ``<...>`` span from *text*.

    Anything from a ``<`` up to the next ``>`` (with at least one character
    in between) is treated as a tag and dropped; the text between tags is
    left untouched. No HTML parsing is performed.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)


# Compiled once at import time. URL schemes and host names are
# case-insensitive, hence IGNORECASE. The word boundary before "www." keeps
# ordinary words that merely end in "www." (e.g. "awww.so") from being
# mistaken for a bare-host URL.
_URL_PATTERN = re.compile(
    r'https?://\S+|ftp://\S+|\bwww\.\S+',
    flags=re.IGNORECASE,
)


def remove_urls(text: str) -> str:
    """Remove URLs (http, https, ftp, www) from text.

    A URL is taken to be ``http://``, ``https://``, ``ftp://`` or a
    ``www.`` prefix followed by everything up to the next whitespace
    character. Matching ignores letter case, so ``HTTPS://...`` and
    ``WWW....`` are removed as well.

    Args:
        text: Input string.

    Returns:
        ``text`` with each matched URL replaced by the empty string.
        Surrounding whitespace is left as-is.
    """
    return _URL_PATTERN.sub('', text)


# Character class of the code points treated as emoji. Every range is listed
# explicitly: a single span running from U+24C2 to U+1F251 would also cover
# CJK ideographs, kana, Hangul, box-drawing characters and most other
# non-Latin text, silently deleting it.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U0001FA00-\U0001FAFF"  # chess symbols, symbols & pictographs extended-A
    # mahjong/domino/playing-card tiles, enclosed alphanumerics,
    # regional indicators (flags), enclosed ideographs
    "\U0001F000-\U0001F251"
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"  # circled latin capital M
    # geometric shapes that have an emoji presentation
    "\U000025AA\U000025AB\U000025B6\U000025C0\U000025FB-\U000025FE"
    "\U00002934\U00002935"  # curved arrows
    # arrows, large squares, star, heavy circle
    "\U00002B05-\U00002B07\U00002B1B\U00002B1C\U00002B50\U00002B55"
    # wavy dash, part alternation mark, circled ideographs
    "\U00003030\U0000303D\U00003297\U00003299"
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text: str) -> str:
    """Remove emoji characters from text.

    Deletes every run of code points matched by ``_EMOJI_PATTERN``. Text in
    non-Latin scripts (CJK, kana, Hangul, ...) is preserved.

    Args:
        text: Input string.

    Returns:
        ``text`` with matched emoji code points removed; nothing is
        inserted in their place.
    """
    return _EMOJI_PATTERN.sub('', text)


def normalize_whitespace(text: str) -> str:
    """Squeeze horizontal whitespace and trim both ends.

    Each run of whitespace characters other than the newline is replaced by
    a single space, so line breaks inside the text survive. The result is
    then stripped of leading and trailing whitespace, newlines included.
    """
    non_newline_runs = re.compile(r'[^\S\n]+')
    squeezed = non_newline_runs.sub(' ', text)
    return squeezed.strip()


def to_lowercase(text: str) -> str:
    """Return *text* with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered


def remove_special_characters(text: str) -> str:
    """Drop every character outside a fixed allow-list.

    Kept: ASCII letters and digits, whitespace, and the punctuation
    characters period, comma, exclamation mark, question mark, semicolon,
    colon, single quote, double quote, parentheses and hyphen. Everything
    else is deleted, which includes underscores, accented letters and
    non-Latin scripts.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s.,!?;:\'"()\-\n]')
    return disallowed.sub('', text)


def strip_extra_linebreaks(text: str) -> str:
    """Cap runs of consecutive newlines at two.

    Any sequence of three or more newline characters is replaced by exactly
    two. Only directly adjacent newline characters count; lines that
    contain other whitespace are not treated as empty.
    """
    newline_run = re.compile(r'\n{3,}')
    return newline_run.sub('\n\n', text)


# ---------------------------------------------------------------------------
# Composed cleaner
# ---------------------------------------------------------------------------

def clean_text(text: str, config: TextCleaningConfig) -> str:
    """Apply all enabled cleaning steps to a single text string.

    Args:
        text: Value to clean. Non-string input is not cleaned: missing
            scalars (``None``, float NaN, ``pd.NA``, ``pd.NaT``) and other
            falsy values yield ``''``; any other value is returned as
            ``str(text)``.
        config: Flags selecting which cleaning steps run.

    Returns:
        The cleaned string. Steps run in a fixed order: HTML tags, URLs,
        emojis, special characters, lowercasing, whitespace normalization,
        extra line breaks.
    """
    if not isinstance(text, str):
        # Missing markers must be detected explicitly: float NaN is truthy
        # (a plain truth test would turn it into the text 'nan') and
        # truth-testing pd.NA raises TypeError. is_scalar() guards isna()
        # against array-likes, for which it returns an element-wise result.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        return str(text) if text else ''

    if config.remove_html:
        text = remove_html_tags(text)
    if config.remove_urls:
        text = remove_urls(text)
    if config.remove_emojis:
        text = remove_emojis(text)
    if config.remove_special_chars:
        text = remove_special_characters(text)
    if config.lowercase:
        text = to_lowercase(text)
    # Whitespace clean-up runs last so gaps left by the removals above are
    # collapsed as well.
    if config.normalize_whitespace:
        text = normalize_whitespace(text)
    if config.strip_extra_linebreaks:
        text = strip_extra_linebreaks(text)

    return text


def apply_text_cleaning(
    df: pd.DataFrame,
    columns: List[str],
    config: TextCleaningConfig,
) -> pd.DataFrame:
    """Return a copy of *df* with the given columns cleaned.

    Each listed column that exists in the frame has ``clean_text`` applied
    to every value using *config*. Names in *columns* that are not present
    in the frame are skipped. The input frame itself is not modified.
    """
    cleaned = df.copy()
    present = [name for name in columns if name in cleaned.columns]
    for name in present:
        cleaned[name] = cleaned[name].apply(
            lambda value: clean_text(value, config)
        )
    return cleaned