|
|
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate
|
|
|
from sklearn.metrics import accuracy_score
|
|
|
from flask import Flask, request, jsonify
|
|
|
from flask_cors import CORS
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
from xgboost import XGBClassifier
|
|
|
from sklearn.metrics import precision_score, recall_score, f1_score
|
|
|
from io import StringIO
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def validate_input(data, trips_col='Number of Trips', earnings_col='Earnings', min_trips=0, max_trips=1000, min_earnings=0, max_earnings=100000):
    """
    Validates input data for negative trips and unrealistic earnings.

    Accepts either a single record (dict) or a batch (pandas DataFrame);
    any other type is passed through as valid.

    Args:
        data: dict mapping column names to values, or a pandas DataFrame.
        trips_col: key/column holding the trip count.
        earnings_col: key/column holding the earnings amount.
        min_trips, max_trips: inclusive bounds for the trip count.
        min_earnings, max_earnings: inclusive bounds for earnings.

    Returns (True, None) if valid, else (False, error_message).
    """
    if isinstance(data, dict):
        trips = data.get(trips_col, None)
        earnings = data.get(earnings_col, None)
        # Bug fix: reject non-numeric values explicitly instead of letting
        # the range comparison below raise a TypeError out of the endpoint.
        if trips is not None and not isinstance(trips, (int, float)):
            return False, f"Invalid number of trips: {trips}. Must be a number."
        if earnings is not None and not isinstance(earnings, (int, float)):
            return False, f"Invalid earnings: {earnings}. Must be a number."
        if trips is not None and (trips < min_trips or trips > max_trips):
            return False, f"Invalid number of trips: {trips}. Must be between {min_trips} and {max_trips}."
        if earnings is not None and (earnings < min_earnings or earnings > max_earnings):
            return False, f"Invalid earnings: {earnings}. Must be between {min_earnings} and {max_earnings}."
    elif isinstance(data, pd.DataFrame):
        # Batch path: report the offending row indices so the caller can fix
        # the upload.
        if trips_col in data.columns:
            invalid_trips = data[(data[trips_col] < min_trips) | (data[trips_col] > max_trips)]
            if not invalid_trips.empty:
                return False, f"Invalid number of trips in rows: {invalid_trips.index.tolist()}"
        if earnings_col in data.columns:
            invalid_earnings = data[(data[earnings_col] < min_earnings) | (data[earnings_col] > max_earnings)]
            if not invalid_earnings.empty:
                return False, f"Invalid earnings in rows: {invalid_earnings.index.tolist()}"
    return True, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app = Flask(__name__)

# Allow cross-origin requests so a separately hosted frontend can call the API.
CORS(app)

# Module-level state populated by main() at startup:
# model — the fitted XGBClassifier used by the /predict endpoints.
model = None
# train_features_columns — column layout of the training feature matrix,
# used to align incoming data before prediction.
train_features_columns = None
# evaluation_metrics — accuracy/precision/recall/f1 computed on the test split.
evaluation_metrics = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_and_preprocess_data(csv_path):
    """
    Loads and preprocesses the dataset.

    Drops the 'Partner ID' identifier, one-hot encodes categorical feature
    columns, coerces features to numeric, and drops rows with missing values.

    Args:
        csv_path: path (or file-like object) readable by pandas.read_csv.

    Returns:
        (df, target_column) on success, or (None, None) if the file is missing.
    """
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: The file {csv_path} was not found.")
        return None, None

    target_column = 'Creditworthy'

    # The partner identifier carries no predictive signal.
    df = df.drop(columns=['Partner ID'], errors='ignore')

    # Bug fix: exclude the target from one-hot encoding. Previously a
    # string-valued target (e.g. 'Yes'/'No') was dummy-encoded and renamed
    # (e.g. 'Creditworthy_Yes'), so train_model's drop(target_column) raised
    # a KeyError.
    categorical_cols = [
        c for c in df.select_dtypes(include=['object']).columns
        if c != target_column
    ]
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # If the target survived as strings, encode it as integer class labels
    # so the classifier can fit on it.
    if target_column in df.columns and df[target_column].dtype == object:
        df[target_column] = pd.factorize(df[target_column])[0]

    # Force every feature to numeric; unparsable values become NaN and are
    # removed by the dropna below.
    for col in df.columns:
        if col != target_column:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.dropna()

    return df, target_column
|
|
|
|
|
|
def train_model(df, target_column):
    """
    Splits data and trains an XGBoost classifier.

    Args:
        df: preprocessed, fully-numeric DataFrame including the target.
        target_column: name of the label column inside df.

    Returns:
        (model, X_test, y_test) — the fitted classifier plus the 20% hold-out
        split for evaluation.
    """
    features = df.drop(target_column, axis=1)
    labels = df[target_column]

    # Fixed random_state keeps the train/test split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    classifier = XGBClassifier(eval_metric='logloss')
    classifier.fit(X_train, y_train)

    return classifier, X_test, y_test
|
|
|
|
|
|
def evaluate_model(model, X_test, y_test):
    """
    Evaluates the trained model using key metrics.

    Also prints Fairlearn group-fairness metrics when a gender-like column is
    present in the test features; otherwise prints a notice.

    Returns the metrics as a dictionary (accuracy, precision, recall, f1_score).
    """
    y_pred = model.predict(X_test)

    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
    }

    # Pick the first gender-like column present, if any, as the sensitive
    # attribute for group fairness reporting.
    sensitive_attr = next(
        (X_test[name] for name in ('gender', 'Gender', 'partner_gender', 'Partner Gender')
         if name in X_test.columns),
        None,
    )

    if sensitive_attr is None:
        print("No sensitive attribute found for group fairness metrics.")
    else:
        frame = MetricFrame(
            metrics={'accuracy': accuracy_score, 'selection_rate': selection_rate},
            y_true=y_test,
            y_pred=y_pred,
            sensitive_features=sensitive_attr,
        )
        print("\nFairness metrics by group (Fairlearn):")
        print(frame.by_group)

    return metrics
|
|
|
|
|
|
def preprocess_user_data(user_df, train_columns):
    """
    Prepares the user's data to match the format of the training data.

    One-hot encodes string columns the same way training did, fills in any
    training columns missing from the upload with 0, discards columns the
    model has never seen, and reorders to the training column order.

    Returns the aligned DataFrame with exactly the train_columns layout.
    """
    # Dummy-encode string columns consistently with training preprocessing.
    object_cols = list(user_df.select_dtypes(include=['object']).columns)
    user_df = pd.get_dummies(user_df, columns=object_cols, drop_first=True)

    # Any training feature absent from the upload defaults to 0.
    for absent in set(train_columns) - set(user_df.columns):
        user_df[absent] = 0

    # Drop columns the model was never trained on.
    unexpected = set(user_df.columns) - set(train_columns)
    user_df = user_df.drop(columns=list(unexpected), errors='ignore')

    # Column order must match training exactly for the model to predict.
    return user_df[train_columns]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_to_csv(data_df, filename='online_testcases.csv'):
    """
    Saves a DataFrame to a CSV file.

    Removes any empty columns (like 'Creditworthy') before saving, then
    appends to the file, writing the header only when the file is created.

    Args:
        data_df: rows to persist.
        filename: destination CSV path.
    """
    # Drop the ground-truth column when it carries no values at all.
    if 'Creditworthy' in data_df.columns and data_df['Creditworthy'].isnull().all():
        data_df = data_df.drop(columns=['Creditworthy'])

    # Remove any remaining fully-empty columns so the CSV stays tidy.
    data_df = data_df.dropna(axis=1, how='all')

    # Append mode: write the header only on first creation.
    file_exists = os.path.isfile(filename)
    data_df.to_csv(filename, mode='a', header=not file_exists, index=False)
    # Bug fix: this previously printed the literal text "(unknown)" instead
    # of the destination filename.
    print(f"Data successfully saved to {filename}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/predict', methods=['POST'])
def predict():
    """
    Endpoint to receive a single user input, make a prediction, and return metrics.

    Validates the JSON payload, aligns it with the training feature layout,
    predicts creditworthiness, persists the record, and returns the verdict
    alongside the cached model evaluation metrics.
    """
    # Refuse to serve until main() has trained the model and cached metrics.
    if model is None or train_features_columns is None or evaluation_metrics is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

    try:
        user_input = request.json

        valid, error_msg = validate_input(user_input)
        if not valid:
            return jsonify({'error': error_msg}), 400

        user_df = pd.DataFrame([user_input])

        # Align the single record with the training feature matrix.
        aligned = preprocess_user_data(user_df.copy(), train_features_columns)

        verdict = model.predict(aligned)
        result = "Eligible" if verdict[0] == 1 else "Not Eligible"

        # Persist the request together with its prediction for auditing.
        user_df['Creditworthy_Prediction'] = result
        save_to_csv(user_df)

        return jsonify({'prediction': result, 'metrics': evaluation_metrics})

    except Exception as e:
        return jsonify({'error': str(e)}), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/predict_csv', methods=['POST'])
def predict_csv():
    """
    Endpoint to receive a CSV file, make bulk predictions, and return results.

    Expects a multipart upload under the 'file' key. If the CSV contains a
    'Creditworthy' column it is treated as ground truth; combined with a
    'Partner Type' column it enables group fairness metrics in the response.

    Returns JSON with per-row predictions, the cached model evaluation
    metrics, fairness metrics (possibly empty), and a fairness observation.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file:
        try:
            # Decode the uploaded bytes and parse them as CSV.
            csv_data = StringIO(file.read().decode('utf-8'))
            input_df = pd.read_csv(csv_data)

            # Ground-truth labels are optional; remember whether we have them.
            has_ground_truth = 'Creditworthy' in input_df.columns

            # Keep labels aside and predict on the feature columns only.
            if has_ground_truth:
                y_true = input_df['Creditworthy']
                input_df_features = input_df.drop(columns=['Creditworthy'])
            else:
                input_df_features = input_df

            # Drop fully-empty columns before validation and prediction.
            input_df_features = input_df_features.dropna(axis=1, how='all')

            # Range-check trips/earnings before running the model.
            valid, error_msg = validate_input(input_df_features)
            if not valid:
                return jsonify({'error': error_msg}), 400

            # Align the upload with the training feature layout.
            user_features_processed = preprocess_user_data(input_df_features.copy(), train_features_columns)

            predictions = model.predict(user_features_processed)

            # Attach human-readable predictions to the original rows.
            input_df['Creditworthy_Prediction'] = np.where(predictions == 1, 'Eligible', 'Not Eligible')

            # Tidy up again before persisting/returning.
            input_df = input_df.dropna(axis=1, how='all')

            # Persist the upload plus predictions for auditing.
            save_to_csv(input_df)

            # Fairness metrics need both ground truth and a sensitive column.
            fairness_metrics = {}
            fairness_observation = "Fairness metrics require ground truth labels and are not available for this upload."
            if has_ground_truth:
                # NOTE(review): 'Partner Type' is assumed to be the sensitive
                # attribute in uploads — confirm against the data schema.
                sensitive_col = 'Partner Type'
                if sensitive_col in input_df.columns:
                    y_pred = (input_df['Creditworthy_Prediction'] == 'Eligible').astype(int)

                    # Normalize string truth labels to 0/1 for the metrics.
                    if y_true.dtype == object:
                        y_true_bin = y_true.map(lambda x: 1 if str(x).lower() in ['eligible', '1', 'true', 'yes'] else 0)
                    else:
                        y_true_bin = y_true
                    sensitive_features = input_df[sensitive_col]
                    mf = MetricFrame(
                        metrics={
                            'selection_rate': selection_rate,
                            'equal_opportunity': true_positive_rate
                        },
                        y_true=y_true_bin,
                        y_pred=y_pred,
                        sensitive_features=sensitive_features
                    )
                    fairness_metrics = {
                        'selection_rate': mf.by_group['selection_rate'].to_dict(),
                        'equal_opportunity': mf.by_group['equal_opportunity'].to_dict()
                    }

                    # Summarize the largest selection-rate gap between groups.
                    rates = mf.by_group['selection_rate']
                    max_group = rates.idxmax()
                    min_group = rates.idxmin()
                    diff = rates[max_group] - rates[min_group]
                    fairness_observation = f"{max_group} group approval rate is {diff:.2%} higher than {min_group} group."
                    # Flag gaps above 10 percentage points for mitigation.
                    if abs(diff) > 0.1:
                        fairness_observation += " Mitigation recommended: Consider reweighting or post-processing."

            results = input_df.to_dict('records')
            return jsonify({
                'predictions': results,
                'metrics': evaluation_metrics,
                'fairness_metrics': fairness_metrics,
                'fairness_observation': fairness_observation
            })

        except Exception as e:
            import traceback
            print(traceback.format_exc())
            return jsonify({'error': f"Error processing file: {str(e)}"}), 500

    return jsonify({'error': 'An unknown error occurred.'}), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """
    Initializes the model and runs the Flask server.

    Trains on 'catalyst_train.csv', stores the fitted model, its feature
    layout, and its evaluation metrics in module globals, then serves HTTP.
    """
    global model, train_features_columns, evaluation_metrics

    print("--- Starting the Nova Backend ---")
    print("Step 1: Loading and preprocessing data...")
    train_df, target_column = load_and_preprocess_data('catalyst_train.csv')

    # Guard clause: without training data there is nothing to serve.
    if train_df is None:
        print("Please ensure 'catalyst_train.csv' exists. Exiting.")
        return

    print("Step 2: Training the model and evaluating performance...")
    model, X_test, y_test = train_model(train_df, target_column)
    train_features_columns = train_df.drop(columns=[target_column]).columns
    evaluation_metrics = evaluate_model(model, X_test, y_test)

    print("\nModel trained successfully! Metrics:")
    for metric_name, metric_value in evaluation_metrics.items():
        print(f"- {metric_name.capitalize()}: {metric_value:.4f}")

    print("\n--- Starting Flask server on http://127.0.0.1:5000 ---")
    # use_reloader=False keeps Flask from re-running the expensive training
    # step in the reloader's child process.
    app.run(debug=True, port=5000, use_reloader=False)


if __name__ == "__main__":
    main()
|
|
|
|