File size: 7,137 Bytes
537db6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
import logging
import pandas as pd
from typing import Optional, Tuple, Dict, Literal, Any
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from src.enums import Feature
from src.repository.transaction_repo import get_multiple_rows
from src.entity.model import Model

logger = logging.getLogger(__name__)

def preprocess_data(df: pd.DataFrame, test_size: float = 0.2) -> Tuple:
    """
    Split the dataframe into features (X) and target (y), then into train/test sets.

    Args:
        df (pd.DataFrame): Input dataframe; must contain every feature column
            plus the 'is_fraud' target column.
        test_size (float): Fraction of rows held out for testing. When 0 or
            less, no split is performed and empty test placeholders are returned.

    Returns:
        Tuple: (X_train, X_test, y_train, y_test). When a split is performed it
        is stratified on the target so the fraud ratio is preserved in both sets.
    """
    features = [f.name for f in Feature.get_all_features()]
    X = df[features]
    y = df['is_fraud']

    if test_size > 0:
        # Stratify on the target itself (same values as df['is_fraud']) so both
        # splits keep the original class balance; fixed seed for reproducibility.
        return train_test_split(X, y, test_size=test_size, stratify=y, random_state=42)
    # No hold-out requested: return empty placeholders. The y placeholder is a
    # Series (matching y's type), not a DataFrame as before.
    return X, pd.DataFrame(), y, pd.Series(dtype=y.dtype)

def train_model(
        pipeline: Pipeline,
        X_train: pd.DataFrame,
        y_train: pd.DataFrame) -> Pipeline:
    """
    Fit the pipeline on the training data and log how long the fit took.

    Args:
        pipeline (Pipeline): Unfitted scikit-learn pipeline.
        X_train (pd.DataFrame): Training features.
        y_train (pd.DataFrame): Training target values.

    Returns:
        Pipeline: The same pipeline instance, now fitted.
    """
    # Local import kept deliberately so the module-level import block stays unchanged.
    import time
    # perf_counter is the right clock for measuring durations (monotonic,
    # highest available resolution); time.time can jump with wall-clock changes.
    start_time = time.perf_counter()
    logger.info("Training the model...")
    pipeline.fit(X_train, y_train)
    # Use the module logger instead of print so output honors logging config.
    logger.info("Model trained in %.2f seconds", time.perf_counter() - start_time)
    return pipeline

# Closed set of supported classifier names, used as the type of every `algo`
# parameter in this module. Must stay in sync with the dispatch branches in
# create_pipeline() — adding a name here without a branch there raises ValueError.
all_algorithms = Literal[
    'XGBoost',
    'RandomForest',
    'SVM',
    'GradientBoosting',
    'MLP',
    'LightGBM',
    'XGBRF',
    'DecisionTree',
    'ExtraTrees',
    'Bagging',
]

def create_and_train_model(data: pd.DataFrame,
                           evaluate: bool = False,
                           algo: all_algorithms = 'MLP') -> Pipeline:
    """
    Create a pipeline for `algo` and train it on `data`.

    Args:
        data (pd.DataFrame): Raw training data, including the 'is_fraud' target.
        evaluate (bool): When True, hold out 20% of the rows and log the F1
            score and confusion matrix after training.
        algo (all_algorithms): Which classifier to put at the end of the pipeline.

    Returns:
        Pipeline: The fitted pipeline.
    """
    # Only hold out a test split when evaluation was requested.
    test_size = 0.2 if evaluate else 0.0

    # Split the data (test parts are empty placeholders when test_size is 0).
    X_train, X_test, y_train, y_test = preprocess_data(df=data, test_size=test_size)

    # Build and fit the pipeline.
    pipeline = create_pipeline(algo)
    pipeline = train_model(pipeline, X_train, y_train)

    if evaluate:
        evaluation_results = evaluate_model(pipeline, X_test, y_test)
        # Use the module logger (not the root logger) so messages carry this
        # module's name and honor its configuration.
        logger.info(f"Evaluation results for {algo}:")
        logger.info(f"F1 Score: {evaluation_results['f1_score']}\n")
        logger.info(f"Confusion Matrix:\n{evaluation_results['confusion_matrix']}\n")

    return pipeline

def evaluate_model(pipeline: Pipeline, X_test: pd.DataFrame, y_test: pd.Series) -> Dict:
    """
    Score a fitted pipeline on held-out data.

    Args:
        pipeline (Pipeline): Fitted pipeline to evaluate.
        X_test (pd.DataFrame): Held-out features.
        y_test (pd.Series): Held-out target values.

    Returns:
        Dict: 'accuracy', 'f1_score' and 'confusion_matrix' for the predictions.
    """
    predictions = pipeline.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "f1_score": f1_score(y_test, predictions),
        "confusion_matrix": confusion_matrix(y_test, predictions),
    }

def train_model_from_scratch(limit: Optional[int] = None,
                             evaluate: bool = False,
                             algo: all_algorithms = 'MLP',
                             output_path: str = './data/model.pkl') -> Pipeline:
    """
    Load transaction data, train a pipeline on it, and persist it with metadata.

    Args:
        limit (Optional[int]): Maximum number of rows to load; None loads all.
        evaluate (bool): Forwarded to create_and_train_model; when True a test
            split is held out and metrics are logged.
        algo (all_algorithms): Classifier to train.
        output_path (str): Destination path for the serialized model.

    Returns:
        Pipeline: The fitted pipeline.
    """
    # Load the training rows from the repository.
    rows = get_multiple_rows(limit=limit)

    # Build and fit the pipeline on the loaded rows.
    fitted = create_and_train_model(data=rows, evaluate=evaluate, algo=algo)

    # Persist the model alongside basic provenance metadata.
    Model.save_model(
        fitted,
        {
            "model_name": algo,
            "version": "1.0",
            "training_data_size": len(rows),
            "training_datetime": pd.Timestamp.now().isoformat(),
        },
        output_path,
    )

    return fitted

def create_pipeline(algo: all_algorithms = 'XGBoost') -> Pipeline:
    """
    Build the full preprocessing + classification pipeline.

    Args:
        algo (all_algorithms): Name of the classifier to place at the end of
            the pipeline.

    Returns:
        Pipeline: A scikit-learn pipeline (preprocessor + classifier).

    Raises:
        ValueError: If `algo` is not one of the supported names.
    """

    def _build_classifier(name: str):
        # Selects and instantiates the classifier for `name`. Imports stay
        # inside each branch so heavy optional dependencies (xgboost, lightgbm)
        # are only loaded when actually selected — same behavior as before.
        if name == 'XGBoost':
            from xgboost import XGBClassifier
            return XGBClassifier(eval_metric='logloss')
        if name == 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            return RandomForestClassifier()
        if name == 'SVM':
            from sklearn.svm import SVC
            return SVC(probability=False)
        if name == 'GradientBoosting':
            from sklearn.ensemble import GradientBoostingClassifier
            return GradientBoostingClassifier()
        if name == 'MLP':
            from sklearn.neural_network import MLPClassifier
            return MLPClassifier(max_iter=1000, verbose=True)
        if name == 'LightGBM':
            from lightgbm import LGBMClassifier
            return LGBMClassifier()
        if name == 'XGBRF':
            from xgboost import XGBRFClassifier
            return XGBRFClassifier(eval_metric='logloss')
        if name == 'DecisionTree':
            from sklearn.tree import DecisionTreeClassifier
            return DecisionTreeClassifier()
        if name == 'ExtraTrees':
            from sklearn.ensemble import ExtraTreesClassifier
            return ExtraTreesClassifier()
        if name == 'Bagging':
            from sklearn.ensemble import BaggingClassifier
            return BaggingClassifier()
        raise ValueError(f"Unknown algorithm: {name}")

    # Feature lists drive which columns each transformer receives.
    categorical_cols = [f.name for f in Feature.get_features_by_type('category')]
    numerical_cols = [f.name for f in Feature.get_features_by_type('number')]

    # Numeric columns: mean-impute missing values, then standardize.
    numeric_steps = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: one-hot encode, ignoring categories unseen at fit time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_steps, numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ]
    )

    # Full pipeline: preprocessing followed by the selected classifier.
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', _build_classifier(algo))
    ])

def predict(
    pipeline: Pipeline,
    job: str,
    city: str,
    state: str,
    category: str,
    amt: float,
    city_pop: int
) -> Dict[str, Any]:
    """
    Score a single transaction with the fitted pipeline.

    Args:
        pipeline (Pipeline): Fitted pipeline (see create_pipeline / train_model).
        job (str): Customer job.
        city (str): Customer city.
        state (str): Customer state.
        category (str): Transaction category.
        amt (float): Transaction amount.
        city_pop (int): Population of the customer's city.

    Returns:
        Dict[str, Any]: 'result' (predicted class as a plain int) and
        'fraud_probability' (probability of the positive class).
    """
    # Build a one-row DataFrame with exactly the feature columns the pipeline expects.
    transaction = pd.DataFrame([{
        Feature.CUSTOMER_CITY.name: city,
        Feature.CUSTOMER_CITY_POP.name: city_pop,
        Feature.CUSTOMER_JOB.name: job,
        Feature.CUSTOMER_STATE.name: state,
        Feature.TRANSACTION_AMOUNT.name: amt,
        Feature.TRANSACTION_CATEGORY.name: category
    }])

    # Use the pipeline to make a prediction.
    prediction = pipeline.predict(transaction)[0]
    proba = pipeline.predict_proba(transaction)[0]

    # Log via the module logger instead of the root logger / print so output
    # carries this module's name and honors logging configuration.
    logger.info(f"Is fraud: {'True' if prediction == 1 else 'False'}")
    logger.info(f"Probability of fraud: {proba}")

    # .item() converts numpy scalars to native Python types for serialization.
    return {"result": prediction.item(), "fraud_probability": proba[1].item()}