# Scraped page header - Spaces: Sleeping; File size: 2,466 Bytes; 4e7a5c8
from __future__ import annotations
from pathlib import Path
from typing import Iterable, List, Optional, Tuple
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
class DataPreprocessor:
    """Preprocess a DataFrame: drop duplicates, clip outliers, scale features.

    The preprocessor follows a fit/transform protocol. ``fit`` learns the
    per-column clip thresholds and (optionally) a ``StandardScaler`` from one
    frame; ``transform`` applies those learned parameters to any frame, so
    held-out data is processed with the statistics of the fitted data rather
    than its own.

    Args:
        feature_columns: Columns to clip and scale. When ``None``, every
            column of the frame except ``target_column`` is used.
        target_column: Column excluded from the features when
            ``feature_columns`` is ``None``.
        outlier_clip_quantiles: ``(low, high)`` quantiles passed to
            ``Series.quantile`` to derive the clip thresholds per column.
        scale_features: Whether ``fit`` should fit a ``StandardScaler`` and
            ``transform`` should apply it.
    """

    def __init__(
        self,
        feature_columns: Optional[Iterable[str]] = None,
        target_column: str = "quality",
        outlier_clip_quantiles: Tuple[float, float] = (0.01, 0.99),
        scale_features: bool = True,
    ) -> None:
        self.feature_columns = list(feature_columns) if feature_columns is not None else None
        self.target_column = target_column
        self.outlier_clip_quantiles = outlier_clip_quantiles
        self.scale_features = scale_features
        self.scaler: Optional[StandardScaler] = None
        # Per-column (low, high) clip thresholds learned by fit(); None until fitted.
        self._clip_bounds: Optional[dict] = None

    def fit(self, df: pd.DataFrame) -> "DataPreprocessor":
        """Learn clip thresholds and, if enabled, the scaler from ``df``.

        The scaler is fitted on the de-duplicated, clipped features, i.e. on
        the same representation that ``transform`` later feeds into it.

        Returns:
            This instance, to allow call chaining.
        """
        features = self._get_feature_columns(df)
        deduped = self._drop_duplicates(df)
        self._clip_bounds = self._compute_clip_bounds(deduped, features)
        if self.scale_features:
            prepared = self._clip_outliers(deduped)
            self.scaler = StandardScaler().fit(prepared[features])
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a de-duplicated, clipped and (if fitted) scaled copy of ``df``.

        ``df`` itself is not modified. Scaling is applied only when
        ``scale_features`` is true and a scaler has been fitted.
        """
        df_proc = self._clip_outliers(self._drop_duplicates(df))
        if self.scale_features and self.scaler is not None:
            features = self._get_feature_columns(df_proc)
            df_proc[features] = self.scaler.transform(df_proc[features])
        return df_proc

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit on ``df`` and return its transformed copy."""
        return self.fit(df).transform(df)

    def _get_feature_columns(self, df: pd.DataFrame) -> List[str]:
        """Return the feature column names to process for ``df``."""
        if self.feature_columns is not None:
            # Hand out a copy so callers cannot mutate the configured list.
            return list(self.feature_columns)
        return [c for c in df.columns if c != self.target_column]

    def _drop_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` without duplicate rows and with a fresh 0..n-1 index."""
        return df.drop_duplicates().reset_index(drop=True)

    def _compute_clip_bounds(self, df: pd.DataFrame, features: Iterable[str]) -> dict:
        """Map each column in ``features`` to its ``(low, high)`` quantile values."""
        q_low, q_high = self.outlier_clip_quantiles
        return {
            col: (df[col].quantile(q_low), df[col].quantile(q_high))
            for col in features
        }

    def _clip_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with each feature clipped to its thresholds.

        Thresholds learned by ``fit`` are used when available. For columns
        without fitted thresholds (including the case where ``fit`` was never
        called) they are computed from ``df`` itself, so ``transform`` remains
        usable on an unfitted instance.
        """
        features = self._get_feature_columns(df)
        # getattr keeps instances created before _clip_bounds existed working.
        fitted = getattr(self, "_clip_bounds", None) or {}
        missing = [col for col in features if col not in fitted]
        bounds = {**self._compute_clip_bounds(df, missing), **fitted}
        clipped = df.copy()
        for col in features:
            low, high = bounds[col]
            clipped[col] = clipped[col].clip(lower=low, upper=high)
        return clipped
def save_processed(df: pd.DataFrame, output_path: "Path | str") -> Path:
    """Write ``df`` to ``output_path`` as CSV without the index column.

    Missing parent directories are created first.

    Args:
        df: Frame to serialize.
        output_path: Destination file; a ``Path`` or a plain string path.

    Returns:
        The destination as a ``Path``.
    """
    # Coerce so that string paths work too; a Path passes through unchanged.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(destination, index=False)
    return destination
|