File size: 3,594 Bytes
bc5f977
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from verstack import NaNImputer

############################################################

class DataCleaner(BaseEstimator, TransformerMixin):
	def __init__(self, cols_to_drop, nonnegative_cols):
		self.cols_to_drop = cols_to_drop
		self.nonnegative_cols = nonnegative_cols
   
	def fit(self, X, y=None):
		return self

	# X is pd.DataFrame
	def transform(self, X):
		X_copy = X.copy()
		X_copy.drop(columns=self.cols_to_drop, errors='ignore', inplace=True)	
			
		X_copy.replace(['?', 'Error'], np.nan, inplace=True)
		
		if 'avg_frequency_login_days' in X_copy.columns:
			X_copy['avg_frequency_login_days'] = X_copy['avg_frequency_login_days'].astype(float)

		for col in self.nonnegative_cols:
			if col in X_copy.columns:
				X_copy.loc[X_copy[col] < 0, col] = np.nan

		return X_copy
	
############################################################

class NaNImputerWrapper(BaseEstimator, TransformerMixin):
	def __init__(self, train_sample_size=30_000, verbose=True):
		self.train_sample_size = train_sample_size
		self.verbose = verbose
		self.imputer = NaNImputer(self.train_sample_size, self.verbose)

	def fit(self, X, y=None):
		return self

	def transform(self, X):
		return self.imputer.impute(X)

############################################################

class FeatureEng(BaseEstimator, TransformerMixin):
	def __init__(self):
		self.membership_order = ['No Membership', 'Basic Membership', 'Silver Membership',
								 'Gold Membership', 'Platinum Membership', 'Premium Membership']
		self.positive_feedback = ['Products always in Stock', 'Quality Customer Care', 'Reasonable Price', 'User Friendly Website']
		self.negative_feedback = ['Poor Website', 'Poor Customer Service', 'Poor Product Quality', 'Too many ads']

	def get_sentiment(self, feedback):
		if feedback in self.positive_feedback:
			return 1
		elif feedback in self.negative_feedback:
			return -1
		else:
			return 0

	def fit(self, X, y=None):
		return self

	def transform(self, X):
        # feature selection
		X = X[['membership_category', 'feedback', 'points_in_wallet']]

        # encoding
		X['membership_category'] = pd.Categorical( X['membership_category'], 
												  categories=self.membership_order, 
												  ordered=True).codes
		
		X['feedback'] = X['feedback'].apply(self.get_sentiment)

		# standardization
		X['points_in_wallet'] = (X['points_in_wallet'] - X['points_in_wallet'].mean()) / X['points_in_wallet'].std()
		
		return X  
	
	def fit_transform(self, X, y=None):
		X_transformed = self.transform(X)
		self.feature_names_out_ = X_transformed.columns
		return X_transformed
	
	def get_feature_names_out(self, input_features=None):
		return self.feature_names_out_

############################################################

class AdjustedProbClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model, thresholds):
        self.model = model
        self.thresholds = thresholds

    def fit(self, X, y):
        self.model.fit(X, y)
        return self
    
    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def predict(self, X):
        y_proba = self.predict_proba(X)
        preds = []
        for probs in y_proba:
            predicted_class = np.argmax(probs / np.array(self.thresholds)) + 1
            preds.append(predicted_class)
        return np.array(preds)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

############################################################