SuperKart_Frontend / custom_transformers.py
surnellas's picture
Upload folder using huggingface_hub
b10edef verified
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# ===== Custom Transformer for Skewness and Capping =====
class SkewnessCapper(BaseEstimator, TransformerMixin):
    """Cap numeric outliers and log-compress skewed columns.

    Every column is clipped to ``mean ± cap_factor * std`` using statistics
    learned in ``fit``. Columns whose absolute sample skewness exceeds
    ``skew_threshold`` are additionally mapped through
    ``log1p(x - shift + 1)``, where ``shift`` is the column's minimum on the
    (capped) training data.

    Parameters
    ----------
    skew_threshold : float, default=0.5
        A column is treated as skewed when ``abs(skew) > skew_threshold``.
    cap_factor : float, default=3.0
        Number of standard deviations around the mean kept by the cap.

    Attributes
    ----------
    skewed_features_ : list
        Labels of the columns found to be skewed during ``fit``.
    feature_caps_ : dict
        ``{column: (lower, upper)}`` clipping bounds learned during ``fit``.
    feature_shifts_ : dict
        ``{column: shift}`` for each skewed column: the minimum of the
        capped training values, subtracted before the log transform.
    """

    def __init__(self, skew_threshold=0.5, cap_factor=3.0):
        # Only hyper-parameters live here; fitted state (trailing underscore)
        # is created in fit() so an unfitted instance does not look fitted.
        self.skew_threshold = skew_threshold
        self.cap_factor = cap_factor

    def fit(self, X, y=None):
        """Learn clipping bounds, the skewed columns and their log shifts.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        frame = pd.DataFrame(X)

        skewness = frame.apply(lambda col: col.skew(skipna=True))
        self.skewed_features_ = skewness[
            skewness.abs() > self.skew_threshold
        ].index.tolist()

        # Build the caps in a fresh dict so a refit never keeps bounds for
        # columns that belonged to a previous fit.
        caps = {}
        for col in frame.columns:
            mean, std = frame[col].mean(), frame[col].std()
            caps[col] = (mean - self.cap_factor * std, mean + self.cap_factor * std)
        self.feature_caps_ = caps

        # Learn the log shift from the capped training data. Using a fitted
        # value (instead of the minimum of whatever batch is transformed)
        # makes transform() deterministic per value, including for
        # single-row inference.
        self.feature_shifts_ = {
            col: np.clip(frame[col], *caps[col]).min()
            for col in self.skewed_features_
        }
        return self

    def transform(self, X):
        """Return a capped, log-transformed copy of ``X`` as a DataFrame."""
        frame = pd.DataFrame(X).copy()

        # Cap values
        for col, (lower, upper) in self.feature_caps_.items():
            frame[col] = np.clip(frame[col], lower, upper)

        # Instances unpickled from before feature_shifts_ existed do not have
        # the attribute; fall back to the old per-batch minimum for those.
        shifts = getattr(self, "feature_shifts_", None)

        # Transform skewed columns
        for col in self.skewed_features_:
            shift = frame[col].min() if shifts is None else shifts[col]
            # Values below the training minimum are floored at it so the
            # argument of log1p never drops below 1.
            frame[col] = np.log1p((frame[col] - shift).clip(lower=0) + 1)
        return frame
# ===== Example data: two numeric and two categorical columns =====
df = pd.DataFrame(
    dict(
        num1=[1, 2, 3, 100, 5],
        num2=[10, 15, 14, 13, 1000],
        cat1=['A', 'B', 'A', 'C', 'B'],
        cat2=['X', 'X', 'Y', 'Z', 'Y'],
    )
)

# Split the column names by dtype: numeric vs. everything else.
num_features = list(df.select_dtypes(include=[np.number]).columns)
cat_features = list(df.select_dtypes(exclude=[np.number]).columns)

# ===== Per-type pipelines =====
# Numeric columns: outlier capping + log transform of skewed columns.
numeric_pipeline = Pipeline([('skew_cap', SkewnessCapper())])

# Categorical columns: integer codes, with categories unseen at fit time
# encoded as -1.
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
categorical_pipeline = Pipeline([('encode', ordinal_encoder)])

# ===== Route each column group to its pipeline =====
preprocessor = ColumnTransformer(
    [
        ('num', numeric_pipeline, num_features),
        ('cat', categorical_pipeline, cat_features),
    ]
)

# ===== Full pipeline (preprocessing only) =====
full_pipeline = Pipeline([('preprocessor', preprocessor)])

# ===== Fit on the example frame and show the transformed result =====
df_transformed = full_pipeline.fit_transform(df)
print(pd.DataFrame(df_transformed))