Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.preprocessing import PowerTransformer, OrdinalEncoder | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.compose import ColumnTransformer | |
| # ===== Custom Transformer for Skewness and Capping ===== | |
class SkewnessCapper(BaseEstimator, TransformerMixin):
    """Cap outliers at mean +/- k*std and log-transform skewed columns.

    Everything needed by ``transform`` (caps, the list of skewed columns and
    the shift used before the log) is learned in ``fit``, so a given value is
    always mapped to the same output regardless of which batch it arrives in.

    Parameters
    ----------
    skew_threshold : float, default=0.5
        Columns whose absolute sample skewness exceeds this value are
        log-transformed.
    cap_factor : float, default=3.0
        Number of standard deviations around the mean used as clipping bounds.

    Attributes
    ----------
    skewed_features_ : list
        Column labels selected for the log transform during ``fit``.
    feature_caps_ : dict
        ``{column: (lower, upper)}`` clipping bounds learned during ``fit``.
    feature_shifts_ : dict
        ``{column: minimum of the capped training column}`` for every skewed
        column; subtracted before the log transform.
    """

    def __init__(self, skew_threshold=0.5, cap_factor=3.0):
        # Only hyper-parameters are stored here; learned state (trailing
        # underscore) is created in fit() so an unfitted instance is
        # recognisable as such and clone()/get_params() stay well-behaved.
        self.skew_threshold = skew_threshold
        self.cap_factor = cap_factor

    def fit(self, X, y=None):
        """Learn clipping bounds, skewed columns and log shifts from ``X``.

        Parameters
        ----------
        X : array-like or DataFrame
            Numeric training data; converted with ``pd.DataFrame(X)``.
        y : ignored
            Present for pipeline compatibility.

        Returns
        -------
        self
        """
        frame = pd.DataFrame(X)

        skewness = frame.skew(skipna=True)
        self.skewed_features_ = skewness[
            skewness.abs() > self.skew_threshold
        ].index.tolist()

        # Build a fresh dict on every fit so that refitting on a different
        # set of columns does not leave stale entries behind.
        caps = {}
        for col in frame.columns:
            mean, std = frame[col].mean(), frame[col].std()
            caps[col] = (
                mean - self.cap_factor * std,
                mean + self.cap_factor * std,
            )
        self.feature_caps_ = caps

        # The shift is the minimum of the *capped* training column, which is
        # exactly what a fit-then-transform on the same data would subtract.
        self.feature_shifts_ = {
            col: frame[col].clip(lower=caps[col][0], upper=caps[col][1]).min()
            for col in self.skewed_features_
        }
        return self

    def transform(self, X):
        """Clip every column to its learned bounds and log-transform skewed ones.

        Parameters
        ----------
        X : array-like or DataFrame
            Data with the same column labels that were seen in ``fit``.

        Returns
        -------
        pandas.DataFrame
            A transformed copy of ``X``; the input is not modified.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(
            self, ["feature_caps_", "skewed_features_", "feature_shifts_"]
        )

        frame = pd.DataFrame(X).copy()

        # Cap values using the bounds learned from the training data.
        for col, (lower, upper) in self.feature_caps_.items():
            frame[col] = frame[col].clip(lower=lower, upper=upper)

        # Log-transform skewed columns using the shift learned in fit().
        # Values below the training minimum are floored at zero offset so the
        # argument of log1p never drops below 1 (no -inf / NaN on new data).
        for col in self.skewed_features_:
            offset = (frame[col] - self.feature_shifts_[col]).clip(lower=0)
            frame[col] = np.log1p(offset + 1)
        return frame
# ===== Demo data: two numeric columns with one outlier each, two categoricals =====
df = pd.DataFrame(
    {
        "num1": [1, 2, 3, 100, 5],
        "num2": [10, 15, 14, 13, 1000],
        "cat1": ["A", "B", "A", "C", "B"],
        "cat2": ["X", "X", "Y", "Z", "Y"],
    }
)

# Split the column labels by dtype: numeric vs. everything else.
num_features = list(df.select_dtypes(include="number").columns)
cat_features = list(df.select_dtypes(exclude="number").columns)

# ===== Per-type preprocessing =====
# Numeric columns: outlier capping followed by a log transform of skewed ones.
numeric_pipeline = Pipeline([("skew_cap", SkewnessCapper())])

# Categorical columns: integer codes, with unseen categories encoded as -1.
categorical_pipeline = Pipeline(
    [
        (
            "encode",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

# ===== Route each column group to its pipeline =====
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, num_features),
        ("cat", categorical_pipeline, cat_features),
    ]
)

# ===== End-to-end pipeline (preprocessing only) =====
full_pipeline = Pipeline([("preprocessor", preprocessor)])

# ===== Fit on the demo frame and show the transformed result =====
df_transformed = full_pipeline.fit_transform(df)
print(pd.DataFrame(df_transformed))