|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin |
|
|
from verstack import NaNImputer |
|
|
|
|
|
|
|
|
|
|
|
class DataCleaner(BaseEstimator, TransformerMixin):
    """Stateless cleaning step that drops columns and blanks out invalid values.

    Parameters
    ----------
    cols_to_drop : list-like of column labels
        Columns removed from the frame. Labels that are not present are
        silently ignored.
    nonnegative_cols : iterable of column labels
        Columns in which every value below zero is replaced with NaN.
        Labels that are not present are skipped.
    """

    def __init__(self, cols_to_drop, nonnegative_cols):
        self.cols_to_drop = cols_to_drop
        self.nonnegative_cols = nonnegative_cols

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Steps, in order: drop ``cols_to_drop``, replace the placeholder
        strings ``'?'`` and ``'Error'`` with NaN, cast
        ``avg_frequency_login_days`` (when present) to float, and replace
        negative values in ``nonnegative_cols`` with NaN.
        """
        X_copy = X.copy()
        X_copy.drop(columns=self.cols_to_drop, errors='ignore', inplace=True)

        # '?' and 'Error' are treated as missing-value markers everywhere.
        X_copy.replace(['?', 'Error'], np.nan, inplace=True)

        # Cast only after the placeholders are gone, otherwise astype(float)
        # would fail on the placeholder strings.
        if 'avg_frequency_login_days' in X_copy.columns:
            X_copy['avg_frequency_login_days'] = (
                X_copy['avg_frequency_login_days'].astype(float)
            )

        for col in self.nonnegative_cols:
            if col in X_copy.columns:
                values = X_copy[col]
                # Series.mask builds a new column instead of writing NaN into
                # the existing one, so an integer column is upcast explicitly
                # rather than through the deprecated incompatible-dtype
                # setitem path of ``.loc[...] = np.nan``.
                X_copy[col] = values.mask(values < 0)

        return X_copy
|
|
|
|
|
|
|
|
|
|
|
class NaNImputerWrapper(BaseEstimator, TransformerMixin):
    """Expose ``NaNImputer`` through the fit/transform interface.

    Parameters
    ----------
    train_sample_size : int, default=30_000
        Forwarded as the first positional argument of ``NaNImputer``.
    verbose : bool, default=True
        Forwarded as the second positional argument of ``NaNImputer``.

    Attributes
    ----------
    imputer : NaNImputer
        The wrapped imputer, built from the current parameter values.
    """

    def __init__(self, train_sample_size=30_000, verbose=True):
        self.train_sample_size = train_sample_size
        self.verbose = verbose
        self.imputer = self._build_imputer()

    def _build_imputer(self):
        """Create a ``NaNImputer`` from the current parameter values."""
        return NaNImputer(self.train_sample_size, self.verbose)

    def set_params(self, **params):
        """Set parameters and rebuild the wrapped imputer to match them.

        Without the rebuild, ``set_params`` would change
        ``train_sample_size`` / ``verbose`` on the wrapper while
        ``self.imputer`` kept the values it was constructed with.
        """
        super().set_params(**params)
        if params:
            self.imputer = self._build_imputer()
        return self

    def fit(self, X, y=None):
        """Return ``self`` unchanged; all work happens in ``transform``."""
        return self

    def transform(self, X):
        """Return the result of ``self.imputer.impute(X)``."""
        return self.imputer.impute(X)
|
|
|
|
|
|
|
|
|
|
|
class FeatureEng(BaseEstimator, TransformerMixin):
    """Build three numeric features from membership, feedback and wallet points.

    The output frame contains exactly the columns ``membership_category``,
    ``feedback`` and ``points_in_wallet``:

    * ``membership_category`` becomes the position of the value in
      ``membership_order`` (values not in the list, including missing ones,
      get the code ``-1``).
    * ``feedback`` becomes ``1`` / ``-1`` / ``0`` via ``get_sentiment``.
    * ``points_in_wallet`` is standardised as ``(x - mean) / std``.

    Attributes
    ----------
    points_mean_, points_std_ : float
        Mean and standard deviation of ``points_in_wallet`` seen in ``fit``.
    feature_names_out_ : pandas.Index
        Names of the output columns; available after ``fit`` or
        ``fit_transform``.
    """

    # Columns kept by ``transform``, in output order.
    _FEATURE_COLUMNS = ('membership_category', 'feedback', 'points_in_wallet')

    def __init__(self):
        self.membership_order = ['No Membership', 'Basic Membership', 'Silver Membership',
                                 'Gold Membership', 'Platinum Membership', 'Premium Membership']
        self.positive_feedback = ['Products always in Stock', 'Quality Customer Care',
                                  'Reasonable Price', 'User Friendly Website']
        self.negative_feedback = ['Poor Website', 'Poor Customer Service',
                                  'Poor Product Quality', 'Too many ads']

    def get_sentiment(self, feedback):
        """Map a feedback string to 1 (positive), -1 (negative) or 0 (anything else)."""
        if feedback in self.positive_feedback:
            return 1
        if feedback in self.negative_feedback:
            return -1
        return 0

    def fit(self, X, y=None):
        """Learn the ``points_in_wallet`` mean and standard deviation from ``X``.

        Storing the statistics here lets ``transform`` scale later data with
        the same values instead of re-deriving them from each batch.
        """
        wallet = X['points_in_wallet']
        self.points_mean_ = wallet.mean()
        self.points_std_ = wallet.std()
        self.feature_names_out_ = pd.Index(self._FEATURE_COLUMNS)
        return self

    def transform(self, X):
        """Return a new frame with the three engineered columns.

        The input frame is not modified. If the transformer has not been
        fitted, the statistics of ``X`` itself are used for scaling, which
        matches the behaviour of calling ``transform`` directly on
        training data.
        """
        # Explicit copy: assigning into a bare column selection would write
        # into a slice of the caller's frame (SettingWithCopy).
        X = X[list(self._FEATURE_COLUMNS)].copy()

        X['membership_category'] = pd.Categorical(
            X['membership_category'],
            categories=self.membership_order,
            ordered=True,
        ).codes

        X['feedback'] = X['feedback'].apply(self.get_sentiment)

        wallet = X['points_in_wallet']
        mean = getattr(self, 'points_mean_', None)
        std = getattr(self, 'points_std_', None)
        if mean is None or std is None:
            # Not fitted (e.g. an instance created before the statistics were
            # stored): fall back to the statistics of the current batch.
            mean, std = wallet.mean(), wallet.std()
        X['points_in_wallet'] = (wallet - mean) / std

        return X

    def fit_transform(self, X, y=None):
        """Fit on ``X``, transform it, and record the output column names."""
        X_transformed = self.fit(X, y).transform(X)
        self.feature_names_out_ = X_transformed.columns
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        """Return the output column names recorded during fitting."""
        return self.feature_names_out_
|
|
|
|
|
|
|
|
|
|
|
class AdjustedProbClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that rescales class probabilities before picking a class.

    Parameters
    ----------
    model : estimator
        Object providing ``fit(X, y)`` and ``predict_proba(X)``.
    thresholds : array-like
        Per-class divisors applied to the predicted probabilities; it must
        broadcast against one row of ``predict_proba`` output.
    """

    def __init__(self, model, thresholds):
        self.model = model
        self.thresholds = thresholds

    def fit(self, X, y):
        """Fit the wrapped model on ``X``, ``y`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities, unadjusted."""
        return self.model.predict_proba(X)

    def predict(self, X):
        """Return, per sample, the 1-based column index with the largest
        ``probability / threshold`` ratio.
        """
        y_proba = np.asarray(self.predict_proba(X))
        if len(y_proba) == 0:
            # No samples: nothing to take an argmax over.
            return np.array([])

        # One vectorised divide/argmax over all rows; the thresholds array is
        # built once rather than once per sample.
        scaled = y_proba / np.asarray(self.thresholds)
        # NOTE: the "+ 1" turns a 0-based column position into a 1-based
        # label; it is not derived from the wrapped model's own class labels.
        return np.argmax(scaled, axis=1) + 1

    def score(self, X, y):
        """Return the fraction of samples whose prediction equals ``y``."""
        return np.mean(self.predict(X) == y)
|
|
|
|
|
|
|
|
|
|
|
|