"""Text cleaning routines for the review corpus.""" from __future__ import annotations import re from dataclasses import dataclass from typing import Iterable, List import pandas as pd HTML_TAG_RE = re.compile(r"<[^>]+>") NON_ALPHA_RE = re.compile(r"[^a-zA-Z0-9\s]") MULTISPACE_RE = re.compile(r"\s+") @dataclass class ReviewCleaner: lowercase: bool = True def clean(self, text: str) -> str: if not isinstance(text, str): text = "" if self.lowercase: text = text.lower() text = HTML_TAG_RE.sub(" ", text) text = NON_ALPHA_RE.sub(" ", text) text = MULTISPACE_RE.sub(" ", text) return text.strip() def clean_series(self, series: pd.Series) -> pd.Series: return series.fillna("").map(self.clean) def remove_short_reviews(self, df: pd.DataFrame, min_chars: int = 20) -> pd.DataFrame: mask = df["reviewText"].str.len() >= min_chars return df.loc[mask].copy() def __call__(self, df: pd.DataFrame, min_chars: int = 20) -> pd.DataFrame: df = df.copy() df["clean_text"] = self.clean_series(df["reviewText"]) df = self.remove_short_reviews(df, min_chars=min_chars) df = df.drop_duplicates(subset=["clean_text"]) return df.reset_index(drop=True) __all__ = ["ReviewCleaner"]