Spaces:
Sleeping
Sleeping
File size: 1,916 Bytes
eab2256 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
"""Reusable text cleaning utilities."""
from __future__ import annotations
import re
import string
from typing import Iterable
import pandas as pd
from .config import Config
from .logging_utils import get_logger
# Module-level logger, obtained from the project's get_logger helper and named after this module.
LOGGER = get_logger(__name__)
# Character run allowed in a mention or hashtag name: ASCII letters, digits, underscore.
_NAME_CHARS = r"[A-Za-z0-9_]+"

# Web links: an http:// or https:// scheme, or a bare "www." prefix, followed by
# any run of non-whitespace characters.
URL_PATTERN = re.compile(
    r"https?://\S+"
    r"|www\.\S+"
)
# "@name" style mentions.
MENTION_PATTERN = re.compile("@" + _NAME_CHARS)
# "#name" style hashtags.
HASHTAG_PATTERN = re.compile("#" + _NAME_CHARS)
def clean_text(text: str, config: Config) -> str:
    """Run *text* through the cleaning steps enabled in ``config.preprocessing``.

    The steps always run in the order below. Each one is switched by an
    option looked up with ``.get`` (default shown in parentheses):

    1. ``lowercase`` (on): lower-case the text.
    2. ``strip_urls`` (on): delete ``URL_PATTERN`` matches.
    3. ``strip_mentions`` (on): delete ``MENTION_PATTERN`` matches.
    4. ``strip_hashtags`` (off): delete ``HASHTAG_PATTERN`` matches.
    5. ``remove_punctuation`` (on): drop every ``string.punctuation`` character.
    6. ``normalize_whitespace`` (on): collapse each whitespace run to a single
       space and trim both ends.

    Returns the text after all enabled steps have been applied.
    """
    # (option name, default, transformation) in execution order. The lambdas
    # defer the lookup of the module-level patterns until a step actually runs.
    pipeline = (
        ("lowercase", True, lambda s: s.lower()),
        ("strip_urls", True, lambda s: URL_PATTERN.sub("", s)),
        ("strip_mentions", True, lambda s: MENTION_PATTERN.sub("", s)),
        ("strip_hashtags", False, lambda s: HASHTAG_PATTERN.sub("", s)),
        (
            "remove_punctuation",
            True,
            lambda s: s.translate(str.maketrans("", "", string.punctuation)),
        ),
        ("normalize_whitespace", True, lambda s: re.sub(r"\s+", " ", s).strip()),
    )

    options = config.preprocessing
    cleaned = text
    for option, default, step in pipeline:
        if options.get(option, default):
            cleaned = step(cleaned)
    return cleaned
def preprocess_dataframe(df: pd.DataFrame, config: Config) -> pd.DataFrame:
    """Return a cleaned copy of *df* with optionally ordered target labels.

    Every value of the text column is converted to ``str`` and passed through
    :func:`clean_text`. Missing text values (``None``/``NaN``) become empty
    strings instead of the literal token ``"nan"`` that a plain
    ``astype(str)`` would produce.

    When a class order is configured (the ``class_order`` entry of
    ``config.data``, falling back to the one in ``config.model``), the target
    column is converted to an ordered categorical with exactly those
    categories. Labels outside the configured order become missing values in
    that conversion; a warning listing them is logged first.

    Args:
        df: Input frame. It is copied, never modified in place.
        config: Project configuration. ``config.data`` supplies
            ``text_column`` (default ``"text"``), ``target_column`` (default
            ``"sentiment"``) and optionally ``class_order``.

    Returns:
        A new ``DataFrame`` with the cleaned text column and, if a class
        order is configured, a categorical target column.

    Raises:
        KeyError: If the text column is absent, or if a class order is
            configured and the target column is absent.
    """
    data_settings = config.data
    text_column = data_settings.get("text_column", "text")
    target_column = data_settings.get("target_column", "sentiment")

    df = df.copy()

    raw_text = df[text_column]
    # Blank out missing cells before the str cast so they do not turn into the
    # word "nan". Going through object dtype lets "" be stored whatever the
    # column's original dtype is.
    text = raw_text.astype(object)
    text.loc[raw_text.isna().to_numpy()] = ""
    df[text_column] = text.astype(str).apply(lambda value: clean_text(value, config))

    class_order: Iterable[str] | None = (
        data_settings.get("class_order") or config.model.get("class_order")
    )
    if class_order:
        categories = list(class_order)
        labels = df[target_column]
        # pd.Categorical replaces values outside `categories` with NaN without
        # complaint, so surface them before they disappear.
        unknown = labels[labels.notna() & ~labels.isin(categories)].unique()
        if len(unknown):
            LOGGER.warning(
                "Column %r contains %d distinct label(s) outside class_order; "
                "they will be set to missing: %s",
                target_column,
                len(unknown),
                list(unknown),
            )
        df[target_column] = pd.Categorical(labels, categories=categories, ordered=True)

    LOGGER.info("Completed preprocessing for %d records", len(df))
    return df
# Public API of this module; the pattern constants and LOGGER are intentionally not listed here.
__all__ = ["clean_text", "preprocess_dataframe"]
|