opinion-summarizer/src/components/data_cleaning.py
Anshrathore01's picture
Implement core pipelines and web UI
0116d50
"""Text cleaning routines for the review corpus."""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Iterable, List
import pandas as pd
# Matches one tag-like span: "<", one or more characters other than ">", then ">".
HTML_TAG_RE = re.compile(r"<[^>]+>")
# Matches any single character that is not an ASCII letter (a-z, A-Z),
# an ASCII digit (0-9), or whitespace.
NON_ALPHA_RE = re.compile(r"[^a-zA-Z0-9\s]")
# Matches a run of one or more whitespace characters.
MULTISPACE_RE = re.compile(r"\s+")
@dataclass
class ReviewCleaner:
    """Normalize review text and filter a review DataFrame.

    Calling an instance on a DataFrame adds a ``clean_text`` column derived
    from ``reviewText``, drops rows whose raw review is too short, removes
    rows with duplicate cleaned text, and renumbers the index.

    Attributes:
        lowercase: When true (the default), text is lower-cased before the
            other cleaning steps run.
    """

    lowercase: bool = True

    def clean(self, text: str) -> str:
        """Return a normalized copy of *text*.

        Non-string input is treated as the empty string. The text is
        optionally lower-cased, tag-like ``<...>`` spans are replaced by a
        space, every character that is not an ASCII letter, digit, or
        whitespace is replaced by a space, whitespace runs are collapsed to
        a single space, and leading/trailing whitespace is stripped.
        """
        if not isinstance(text, str):
            text = ""
        if self.lowercase:
            text = text.lower()
        text = HTML_TAG_RE.sub(" ", text)
        text = NON_ALPHA_RE.sub(" ", text)
        text = MULTISPACE_RE.sub(" ", text)
        return text.strip()

    def clean_series(self, series: pd.Series) -> pd.Series:
        """Apply :meth:`clean` to every element of *series*.

        Missing values are replaced by the empty string before cleaning.
        """
        return series.fillna("").map(self.clean)

    def remove_short_reviews(
        self,
        df: pd.DataFrame,
        min_chars: int = 20,
        column: str = "reviewText",
    ) -> pd.DataFrame:
        """Return a copy of *df* keeping rows whose text has enough characters.

        Args:
            df: Frame containing the text column.
            min_chars: Minimum length (inclusive) a value must have to be kept.
            column: Name of the text column to measure; defaults to
                ``"reviewText"``.

        Returns:
            A new DataFrame with the original index labels of the kept rows.
            Rows whose value is missing or is not a ``str`` are dropped.
        """

        def _long_enough(value: object) -> bool:
            return isinstance(value, str) and len(value) >= min_chars

        # A per-value check is used instead of the ``.str`` accessor, which
        # raises AttributeError on non-string columns (for example an all-NaN
        # float column). ``astype(bool)`` guarantees a boolean mask even when
        # the column is empty.
        mask = df[column].map(_long_enough).astype(bool)
        return df.loc[mask].copy()

    def __call__(self, df: pd.DataFrame, min_chars: int = 20) -> pd.DataFrame:
        """Run the full cleaning pipeline on *df* and return a new frame.

        The input frame is not modified. Note that the length filter is
        applied to the raw ``reviewText`` column, not to ``clean_text``.

        Args:
            df: Frame with a ``reviewText`` column.
            min_chars: Minimum raw review length (inclusive) to keep a row.

        Returns:
            A DataFrame with an added ``clean_text`` column, short reviews
            and duplicate ``clean_text`` rows removed, and a fresh 0..n-1
            index.
        """
        df = df.copy()
        df["clean_text"] = self.clean_series(df["reviewText"])
        df = self.remove_short_reviews(df, min_chars=min_chars)
        df = df.drop_duplicates(subset=["clean_text"])
        return df.reset_index(drop=True)
# Names exported by ``from <this module> import *``.
__all__ = ["ReviewCleaner"]