File size: 15,299 Bytes
3908f31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate
from sklearn.metrics import accuracy_score
from flask import Flask, request, jsonify
from flask_cors import CORS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from io import StringIO
import os

# ===============================================================================
# Input Validation Functions
# ===============================================================================
def validate_input(data, trips_col='Number of Trips', earnings_col='Earnings', min_trips=0, max_trips=1000, min_earnings=0, max_earnings=100000):
    """
    Validate trip counts and earnings against inclusive bounds.

    ``data`` may be a single record (``dict``) or a batch
    (``pandas.DataFrame``). A field that is absent (missing key / missing
    column) or ``None`` / NaN is not checked, and any other type of ``data``
    is accepted without checks.

    Values that cannot be compared with the numeric bounds (for example the
    string ``"ten"``) are reported as invalid instead of letting a
    ``TypeError`` escape to the caller.

    Args:
        data: The record or batch to validate.
        trips_col: Key / column name holding the trip count.
        earnings_col: Key / column name holding the earnings.
        min_trips: Smallest accepted trip count (inclusive).
        max_trips: Largest accepted trip count (inclusive).
        min_earnings: Smallest accepted earnings (inclusive).
        max_earnings: Largest accepted earnings (inclusive).

    Returns:
        ``(True, None)`` when every checked value is within bounds, otherwise
        ``(False, error_message)`` for the first failing field. Trips are
        checked before earnings.
    """
    # (field name, lower bound, upper bound, label used in error messages)
    checks = (
        (trips_col, min_trips, max_trips, 'number of trips'),
        (earnings_col, min_earnings, max_earnings, 'earnings'),
    )

    if isinstance(data, dict):
        for field, low, high, label in checks:
            value = data.get(field)
            if value is None:
                continue
            try:
                out_of_range = bool(value < low or value > high)
            except (TypeError, ValueError):
                # Not comparable with a number (str, list, dict, ...).
                out_of_range = True
            if out_of_range:
                return False, f"Invalid {label}: {value}. Must be between {low} and {high}."
    elif isinstance(data, pd.DataFrame):
        for field, low, high, label in checks:
            if field not in data.columns:
                continue
            column = data[field]
            numeric = pd.to_numeric(column, errors='coerce')
            # Cells that were present but could not be parsed as a number.
            not_a_number = numeric.isna() & column.notna()
            out_of_range = (numeric < low) | (numeric > high)
            bad_rows = column[not_a_number | out_of_range].index.tolist()
            if bad_rows:
                return False, f"Invalid {label} in rows: {bad_rows}"
    return True, None

# ==============================================================================
# Step 1: Initialize Flask App and Model Variables
# ==============================================================================
# The Flask application object; the @app.route decorators below register
# their handlers on it and main() starts it with app.run().
app = Flask(__name__)
CORS(app)  # Enable CORS to allow the frontend to access this API

# Global variables to hold the trained model and features.
# main() rebinds all three (via `global`) after training; the /predict
# handler returns an error while `model` or `train_features_columns` is None.
model = None
train_features_columns = None
# NOTE: starts as an empty dict, not None, so an `is None` check on this
# name does not detect the "not yet trained" state.
evaluation_metrics = {}

# ==============================================================================
# Step 2: Core ML Functions (data loading, training, evaluation, preprocessing)
# ==============================================================================
def load_and_preprocess_data(csv_path):
    """
    Load the training CSV and turn it into an all-numeric feature table.

    Steps: read the file, drop the ``'Partner ID'`` column if present,
    one-hot encode every object-dtype feature column (``drop_first=True``),
    coerce the remaining feature columns to numeric and drop rows that
    contain NaN afterwards.

    The target column is never one-hot encoded, even when it holds strings,
    so it is still present under its own name in the returned frame.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        ``(df, target_column)`` where ``target_column`` is ``'Creditworthy'``,
        or ``(None, None)`` when the file does not exist.
    """
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: The file {csv_path} was not found.")
        return None, None

    target_column = 'Creditworthy'

    # Drop columns that are not features for the model
    df = df.drop(columns=['Partner ID'], errors='ignore')

    # Identify non-numeric feature columns. The target is excluded: encoding
    # it would replace 'Creditworthy' with dummy columns and break training.
    categorical_cols = [
        col for col in df.select_dtypes(include=['object']).columns
        if col != target_column
    ]

    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Ensure all remaining feature columns are numeric
    for col in df.columns:
        if col != target_column:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop any rows that now have NaN values after the coercion
    df = df.dropna()

    return df, target_column

def train_model(df, target_column):
    """
    Fit an XGBoost classifier on an 80/20 split of ``df``.

    Args:
        df: Preprocessed table containing the features and the target.
        target_column: Name of the label column in ``df``.

    Returns:
        ``(classifier, X_test, y_test)``: the fitted classifier together
        with the held-out 20% of features and labels.
    """
    labels = df[target_column]
    features = df.drop(target_column, axis=1)

    # Fixed random_state keeps the split reproducible between runs.
    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    classifier = XGBClassifier(eval_metric='logloss')
    classifier.fit(X_train, y_train)

    return classifier, X_test, y_test

def evaluate_model(model, X_test, y_test):
    """
    Score ``model`` on the held-out data and print per-group fairness.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out feature table.
        y_test: Held-out true labels.

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1_score'``. Fairness results are only printed, not returned.
    """
    predicted = model.predict(X_test)

    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    scores = {name: scorer(y_test, predicted) for name, scorer in scorers}

    # Use the first commonly used sensitive-attribute name that is a column.
    candidate_names = ('gender', 'Gender', 'partner_gender', 'Partner Gender')
    matched_name = next(
        (name for name in candidate_names if name in X_test.columns),
        None,
    )
    if matched_name is None:
        print("No sensitive attribute found for group fairness metrics.")
        return scores

    group_frame = MetricFrame(
        metrics={'accuracy': accuracy_score, 'selection_rate': selection_rate},
        y_true=y_test,
        y_pred=predicted,
        sensitive_features=X_test[matched_name],
    )
    print("\nFairness metrics by group (Fairlearn):")
    print(group_frame.by_group)
    return scores

def preprocess_user_data(user_df, train_columns):
    """
    Encode user data and align it with the training feature columns.

    Object-dtype columns are one-hot encoded WITHOUT ``drop_first``. The
    dropped baseline must be the one chosen at training time, not whichever
    category sorts first in this request. With ``drop_first=True`` a
    single-row input would lose its only category and always be scored as
    the baseline. The training baseline's dummy column is not in
    ``train_columns``, so the final alignment removes it.

    Args:
        user_df: Raw user input, one row per record.
        train_columns: Feature column names, in training order.

    Returns:
        A new DataFrame with exactly ``train_columns`` in that order. Columns
        missing from the input are filled with 0, extra columns are dropped.
    """
    categorical_cols = user_df.select_dtypes(include=['object']).columns.tolist()
    encoded = pd.get_dummies(user_df, columns=categorical_cols, drop_first=False)

    # One step: add missing columns as 0, drop extras, and apply the
    # training column order.
    return encoded.reindex(columns=list(train_columns), fill_value=0)

# ==============================================================================
# Step 2.5: New Function to Save Data to CSV
# ==============================================================================
def save_to_csv(data_df, filename='online_testcases.csv'):
    """
    Append ``data_df`` to a CSV file, leaving out columns with no data.

    A ``'Creditworthy'`` column that is entirely null is removed, as is any
    other all-NaN column. The header row is written only when ``filename``
    does not exist yet; later calls append data rows only.

    Args:
        data_df: The rows to append.
        filename: Destination CSV path.
    """
    label = 'Creditworthy'
    label_is_blank = label in data_df.columns and data_df[label].isnull().all()
    if label_is_blank:
        data_df = data_df.drop(columns=[label])

    # Remove every remaining column that holds nothing but NaN.
    cleaned = data_df.dropna(axis=1, how='all')

    write_header = not os.path.isfile(filename)
    cleaned.to_csv(filename, mode='a', header=write_header, index=False)
    print(f"Data successfully saved to {filename}")

# ==============================================================================
# Step 3: API Endpoint for Prediction (Single Input)
# ==============================================================================
@app.route('/predict', methods=['POST'])
def predict():
    """
    Score a single JSON record and return the prediction with model metrics.

    The request body must be a JSON object whose keys are the raw feature
    names. The record plus its prediction is appended to the log CSV via
    ``save_to_csv``.

    Returns:
        200 with ``{'prediction': 'Eligible' | 'Not Eligible', 'metrics': ...}``
        400 when the body is not a JSON object or fails ``validate_input``
        500 when the model is not loaded or processing fails unexpectedly
    """
    # The globals are populated by main(); refuse to serve until they are.
    if model is None or train_features_columns is None or evaluation_metrics is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

    # silent=True returns None for a missing, malformed or non-JSON body
    # instead of raising, so a bad request becomes a 400 here rather than
    # being turned into a 500 by the broad handler below.
    user_input = request.get_json(silent=True)
    if not isinstance(user_input, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    try:
        # Input validation
        valid, error_msg = validate_input(user_input)
        if not valid:
            return jsonify({'error': error_msg}), 400

        user_df = pd.DataFrame([user_input])
        # Preprocess the user's data to match the training data format
        user_features_processed = preprocess_user_data(user_df.copy(), train_features_columns)
        # Make the prediction
        prediction = model.predict(user_features_processed)
        result = "Eligible" if prediction[0] == 1 else "Not Eligible"
        # Add prediction to the original DataFrame for logging
        user_df['Creditworthy_Prediction'] = result
        # Save the original user input plus prediction to the CSV file
        save_to_csv(user_df)
        # Return the prediction and evaluation metrics
        return jsonify({
            'prediction': result,
            'metrics': evaluation_metrics
        })

    except Exception as e:
        # Top-level boundary: log the traceback (as predict_csv does) and
        # report the failure to the client.
        import traceback
        print(traceback.format_exc())
        return jsonify({'error': str(e)}), 500

# ==============================================================================
# Step 4: API Endpoint for Bulk Prediction (CSV Upload)
# ==============================================================================
@app.route('/predict_csv', methods=['POST'])
def predict_csv():
    """
    Score every row of an uploaded CSV and return predictions and fairness.

    The CSV is expected in the multipart field ``'file'`` (UTF-8). When the
    upload has a ``'Creditworthy'`` column it is treated as ground truth:
    it is excluded from the features and, if a ``'Partner Type'`` column is
    also present, per-group selection rate and true-positive rate are
    computed with Fairlearn. The upload plus predictions is appended to the
    log CSV via ``save_to_csv``.

    Returns:
        200 with ``predictions``, ``metrics``, ``fairness_metrics`` and
        ``fairness_observation``
        400 when no file is supplied or ``validate_input`` rejects the data
        500 when the model is not loaded or processing fails unexpectedly
    """
    # Same readiness check as /predict: the globals are populated by main().
    if model is None or train_features_columns is None or evaluation_metrics is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file:
        try:
            # Read the CSV file from the request
            csv_data = StringIO(file.read().decode('utf-8'))
            input_df = pd.read_csv(csv_data)

            # Check if ground truth is present
            has_ground_truth = 'Creditworthy' in input_df.columns

            # Remove 'Creditworthy' column from features for prediction
            if has_ground_truth:
                y_true = input_df['Creditworthy']
                input_df_features = input_df.drop(columns=['Creditworthy'])
            else:
                input_df_features = input_df

            # Remove any other empty columns
            input_df_features = input_df_features.dropna(axis=1, how='all')

            # Input validation for all rows
            valid, error_msg = validate_input(input_df_features)
            if not valid:
                return jsonify({'error': error_msg}), 400

            # Preprocess the entire DataFrame
            user_features_processed = preprocess_user_data(input_df_features.copy(), train_features_columns)
            # Make the predictions
            predictions = model.predict(user_features_processed)
            # Add the predictions to the original DataFrame
            input_df['Creditworthy_Prediction'] = np.where(predictions == 1, 'Eligible', 'Not Eligible')

            # Remove any empty columns again before saving/returning
            input_df = input_df.dropna(axis=1, how='all')

            # Save the entire DataFrame to the CSV file
            save_to_csv(input_df)

            # --- Fairness & Bias Reporting ---
            fairness_metrics = {}
            fairness_observation = "Fairness metrics require ground truth labels and are not available for this upload."
            if has_ground_truth:
                # Only compute fairness if ground truth is present
                sensitive_col = 'Partner Type'
                if sensitive_col in input_df.columns:
                    y_pred = (input_df['Creditworthy_Prediction'] == 'Eligible').astype(int)
                    # If Creditworthy is string, convert to binary
                    if y_true.dtype == object:
                        y_true_bin = y_true.map(lambda x: 1 if str(x).lower() in ['eligible', '1', 'true', 'yes'] else 0)
                    else:
                        y_true_bin = y_true
                    sensitive_features = input_df[sensitive_col]
                    mf = MetricFrame(
                        metrics={
                            'selection_rate': selection_rate,
                            'equal_opportunity': true_positive_rate
                        },
                        y_true=y_true_bin,
                        y_pred=y_pred,
                        sensitive_features=sensitive_features
                    )
                    fairness_metrics = {
                        'selection_rate': mf.by_group['selection_rate'].to_dict(),
                        'equal_opportunity': mf.by_group['equal_opportunity'].to_dict()
                    }
                    # Observations: compare the groups with the highest and
                    # lowest share of 'Eligible' predictions.
                    rates = mf.by_group['selection_rate']
                    max_group = rates.idxmax()
                    min_group = rates.idxmin()
                    diff = rates[max_group] - rates[min_group]
                    fairness_observation = f"{max_group} group approval rate is {diff:.2%} higher than {min_group} group."
                    if abs(diff) > 0.1:
                        fairness_observation += " Mitigation recommended: Consider reweighting or post-processing."
                else:
                    # Labels are present here; the grouping column is what
                    # is missing, so say that instead of blaming the labels.
                    fairness_observation = f"Fairness metrics require a '{sensitive_col}' column and are not available for this upload."

            # Convert DataFrame to a list of dictionaries for JSON response
            results = input_df.to_dict('records')
            return jsonify({
                'predictions': results,
                'metrics': evaluation_metrics,
                'fairness_metrics': fairness_metrics,
                'fairness_observation': fairness_observation
            })
        except Exception as e:
            # Top-level boundary: log the traceback and report the failure.
            import traceback
            print(traceback.format_exc())
            return jsonify({'error': f"Error processing file: {str(e)}"}), 500

    return jsonify({'error': 'An unknown error occurred.'}), 500


# ==============================================================================
# Step 5: Main function to train the model once and run the server
# ==============================================================================
def main():
    """
    Train the model once, publish it through the module globals, then serve.

    Reads ``catalyst_train.csv``, fits and evaluates the classifier, stores
    the model, its feature columns and its metrics in the module-level
    globals used by the request handlers, and starts the Flask server on
    port 5000. Returns early, without starting the server, when the
    training file cannot be loaded.
    """
    global model, train_features_columns, evaluation_metrics

    print("--- Starting the Nova Backend ---")
    print("Step 1: Loading and preprocessing data...")
    training_frame, label_name = load_and_preprocess_data('catalyst_train.csv')

    if training_frame is None:
        print("Please ensure 'catalyst_train.csv' exists. Exiting.")
        return

    print("Step 2: Training the model and evaluating performance...")
    model, held_out_features, held_out_labels = train_model(training_frame, label_name)
    feature_frame = training_frame.drop(columns=[label_name])
    train_features_columns = feature_frame.columns
    evaluation_metrics = evaluate_model(model, held_out_features, held_out_labels)

    print("\nModel trained successfully! Metrics:")
    for metric_name, metric_value in evaluation_metrics.items():
        print(f"- {metric_name.capitalize()}: {metric_value:.4f}")

    print("\n--- Starting Flask server on http://127.0.0.1:5000 ---")
    # Serve the API for the frontend. The reloader is disabled, so the
    # process (and the training above) runs only once.
    app.run(debug=True, port=5000, use_reloader=False)

# Train and start the server only when this file is executed as a script;
# importing the module defines the app and routes but does not call main().
if __name__ == "__main__":
    main()