File size: 1,331 Bytes
"""Text cleaning routines for the review corpus."""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Iterable, List
import pandas as pd
# Matches one HTML-like tag: "<", one or more characters other than ">", then ">".
HTML_TAG_RE = re.compile(r"<[^>]+>")
# Matches any single character that is not an ASCII letter, an ASCII digit, or whitespace.
NON_ALPHA_RE = re.compile(r"[^a-zA-Z0-9\s]")
# Matches a run of one or more whitespace characters.
MULTISPACE_RE = re.compile(r"\s+")
@dataclass
class ReviewCleaner:
    """Normalise raw review text and filter a DataFrame of reviews.

    Calling an instance on a DataFrame adds a ``clean_text`` column derived
    from ``reviewText``, drops rows whose raw review is too short, and removes
    rows whose cleaned text duplicates an earlier row.

    Attributes:
        lowercase: When true, text is lower-cased before any other step.
    """

    lowercase: bool = True

    def clean(self, text: str) -> str:
        """Return *text* stripped of tags and punctuation.

        Steps, in order: optional lower-casing, tag removal, replacement of
        every character that is not an ASCII letter/digit or whitespace,
        whitespace collapsing, and trimming.

        Args:
            text: Raw review text. Any non-string value (``None``, ``NaN``,
                numbers, ...) is treated as an empty string.

        Returns:
            The cleaned text; possibly the empty string.
        """
        if not isinstance(text, str):
            text = ""
        if self.lowercase:
            text = text.lower()
        # Each removal substitutes a space so that neighbouring words do not
        # fuse together; the extra spaces are collapsed afterwards.
        text = HTML_TAG_RE.sub(" ", text)
        text = NON_ALPHA_RE.sub(" ", text)
        text = MULTISPACE_RE.sub(" ", text)
        return text.strip()

    def clean_series(self, series: pd.Series) -> pd.Series:
        """Apply :meth:`clean` to every element of *series*.

        Missing values are replaced by ``""`` before cleaning.
        """
        return series.fillna("").map(self.clean)

    def remove_short_reviews(self, df: pd.DataFrame, min_chars: int = 20) -> pd.DataFrame:
        """Return a copy of *df* without rows whose raw review is too short.

        Args:
            df: Frame containing a ``reviewText`` column.
            min_chars: Minimum length, in characters, of the raw
                ``reviewText`` value for a row to be kept.

        Returns:
            A new DataFrame with the surviving rows; the index is preserved.
        """
        # The ``.str`` accessor raises AttributeError on columns that are not
        # string-like (for example an all-NaN float column), which would make
        # the pipeline crash even though ``clean`` accepts such values.
        # Measure only real strings; anything else never meets the threshold,
        # so missing values are dropped exactly as before.
        keep = [
            isinstance(value, str) and len(value) >= min_chars
            for value in df["reviewText"]
        ]
        mask = pd.Series(keep, index=df.index, dtype=bool)
        return df.loc[mask].copy()

    def __call__(self, df: pd.DataFrame, min_chars: int = 20) -> pd.DataFrame:
        """Run the full cleaning pipeline on *df* without mutating it.

        Args:
            df: Frame containing a ``reviewText`` column.
            min_chars: Forwarded to :meth:`remove_short_reviews`.

        Returns:
            A new DataFrame with an added ``clean_text`` column, short reviews
            removed, duplicate ``clean_text`` rows dropped (first occurrence
            kept), and a fresh 0..n-1 index.
        """
        df = df.copy()
        df["clean_text"] = self.clean_series(df["reviewText"])
        # The length filter looks at the raw ``reviewText``, not ``clean_text``.
        df = self.remove_short_reviews(df, min_chars=min_chars)
        df = df.drop_duplicates(subset=["clean_text"])
        return df.reset_index(drop=True)
# Names exported by a star-import of this module.
__all__ = ["ReviewCleaner"]
|