File size: 1,916 Bytes
eab2256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""Reusable text cleaning utilities."""

from __future__ import annotations

import re
import string
from typing import Iterable

import pandas as pd

from .config import Config
from .logging_utils import get_logger

# Logger obtained from the project's get_logger helper, keyed by this module's name.
LOGGER = get_logger(__name__)

# Patterns are compiled once at import time and reused by clean_text.
# "http://" or "https://" followed by non-whitespace, or "www." followed by non-whitespace.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
# "@" followed by one or more ASCII letters, digits, or underscores.
MENTION_PATTERN = re.compile(r"@[A-Za-z0-9_]+")
# "#" followed by one or more ASCII letters, digits, or underscores.
HASHTAG_PATTERN = re.compile(r"#[A-Za-z0-9_]+")


# Built once at import time: clean_text is applied per row, and neither the
# punctuation table nor the whitespace pattern depends on the input text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_WHITESPACE_PATTERN = re.compile(r"\s+")


def clean_text(text: str, config: Config) -> str:
    """Clean a single string according to ``config.preprocessing``.

    The steps run in the fixed order below; each is toggled by a boolean key
    in ``config.preprocessing`` (default shown in parentheses):

    1. ``lowercase`` (True): lowercase the text.
    2. ``strip_urls`` (True): delete matches of ``URL_PATTERN``.
    3. ``strip_mentions`` (True): delete matches of ``MENTION_PATTERN``.
    4. ``strip_hashtags`` (False): delete matches of ``HASHTAG_PATTERN``.
    5. ``remove_punctuation`` (True): delete every character in
       ``string.punctuation``.
    6. ``normalize_whitespace`` (True): collapse whitespace runs to a single
       space and strip both ends.

    Args:
        text: The string to clean.
        config: Object whose ``preprocessing`` attribute supports
            ``.get(key, default)``.

    Returns:
        The cleaned string.
    """
    settings = config.preprocessing
    processed = text

    if settings.get("lowercase", True):
        processed = processed.lower()

    # URLs, mentions and hashtags are removed before punctuation so that their
    # marker characters (":", "/", ".", "@", "#") are still present to match.
    if settings.get("strip_urls", True):
        processed = URL_PATTERN.sub("", processed)

    if settings.get("strip_mentions", True):
        processed = MENTION_PATTERN.sub("", processed)

    if settings.get("strip_hashtags", False):
        processed = HASHTAG_PATTERN.sub("", processed)

    if settings.get("remove_punctuation", True):
        # Characters are deleted, not replaced by a space: "a,b" -> "ab".
        processed = processed.translate(_PUNCTUATION_TABLE)

    if settings.get("normalize_whitespace", True):
        processed = _WHITESPACE_PATTERN.sub(" ", processed).strip()

    return processed


def preprocess_dataframe(df: pd.DataFrame, config: Config) -> pd.DataFrame:
    """Clean the text column and optionally make the target an ordered categorical.

    Column names come from ``config.data`` (``text_column`` defaults to
    ``"text"``, ``target_column`` to ``"sentiment"``). The class order is read
    from ``config.data["class_order"]`` and, if that is missing or empty, from
    ``config.model["class_order"]``. When a class order is found, the target
    column is converted to an ordered ``pd.Categorical`` with those categories.

    Args:
        df: Input frame; it is copied, never modified in place.
        config: Object exposing ``data`` (and, as a fallback, ``model``)
            mappings that support ``.get``.

    Returns:
        A new DataFrame with the cleaned text column and, if configured, the
        categorical target column.

    Raises:
        KeyError: If the text column (or the target column, when a class
            order is configured) is not present in ``df``.
    """

    data_settings = config.data
    text_column = data_settings.get("text_column", "text")
    target_column = data_settings.get("target_column", "sentiment")

    df = df.copy()

    # Replace missing entries with "" before stringifying: astype(str) alone
    # would turn NaN/None/pd.NA into the literal tokens "nan"/"None"/"<NA>",
    # which would then be cleaned and kept as if they were real text.
    column = df[text_column]
    raw_text = column.astype(object).where(column.notna(), "").astype(str)
    df[text_column] = raw_text.apply(lambda text: clean_text(text, config))

    class_order: Iterable[str] | None = data_settings.get("class_order") or config.model.get("class_order")
    if class_order:
        categories = list(class_order)
        # pd.Categorical silently maps labels outside `categories` to NaN;
        # surface that so unexpected labels do not vanish unnoticed.
        unmapped = df[target_column].notna() & ~df[target_column].isin(categories)
        if unmapped.any():
            LOGGER.warning(
                "%d target values are not in class_order and will become NaN",
                int(unmapped.sum()),
            )
        df[target_column] = pd.Categorical(df[target_column], categories=categories, ordered=True)

    LOGGER.info("Completed preprocessing for %d records", len(df))
    return df


# Names exported by a star-import of this module.
__all__ = ["clean_text", "preprocess_dataframe"]