Project_Nova / main.py

Upload main.py

3908f31 verified 3 months ago

15.3 kB

	from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate
	from sklearn.metrics import accuracy_score
	from flask import Flask, request, jsonify
	from flask_cors import CORS
	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from xgboost import XGBClassifier
	from sklearn.metrics import precision_score, recall_score, f1_score
	from io import StringIO
	import os

	# ===============================================================================
	# Input Validation Functions
	# ===============================================================================
	def validate_input(data, trips_col='Number of Trips', earnings_col='Earnings', min_trips=0, max_trips=1000, min_earnings=0, max_earnings=100000):
	    """
	    Validates trip counts and earnings against inclusive bounds.

	    Args:
	        data: A single record (dict) or many records (pandas DataFrame).
	            Any other type is not inspected and is reported as valid.
	        trips_col: Key/column holding the number of trips.
	        earnings_col: Key/column holding the earnings.
	        min_trips, max_trips: Inclusive bounds for the trips value.
	        min_earnings, max_earnings: Inclusive bounds for the earnings value.

	    Returns:
	        (True, None) if valid, else (False, error_message). Absent keys or
	        columns and missing values (None / NaN) are not flagged. Values that
	        are present but not numeric are reported as invalid instead of
	        raising TypeError during the range comparison.
	    """
	    checks = (
	        (trips_col, min_trips, max_trips, 'number of trips'),
	        (earnings_col, min_earnings, max_earnings, 'earnings'),
	    )

	    if isinstance(data, dict):
	        for column, low, high, label in checks:
	            value = data.get(column)
	            if value is None:
	                continue
	            # bool is a subclass of int, but True/False is not a meaningful count or amount.
	            is_number = (
	                isinstance(value, (int, float, np.integer, np.floating))
	                and not isinstance(value, bool)
	            )
	            if not is_number:
	                return False, f"Invalid {label}: {value!r}. Must be a number between {low} and {high}."
	            if value < low or value > high:
	                return False, f"Invalid {label}: {value}. Must be between {low} and {high}."
	    elif isinstance(data, pd.DataFrame):
	        for column, low, high, label in checks:
	            if column not in data.columns:
	                continue
	            raw = data[column]
	            # Coerce so a text cell becomes NaN rather than breaking the comparison.
	            numeric = pd.to_numeric(raw, errors='coerce')
	            # A cell that had a value but could not be parsed is invalid; a truly
	            # missing cell is left alone.
	            unparseable = numeric.isna() & raw.notna()
	            bad = (numeric < low) | (numeric > high) | unparseable
	            if bad.any():
	                return False, f"Invalid {label} in rows: {data.index[bad.to_numpy()].tolist()}"
	    return True, None

	# ==============================================================================
	# Step 1: Initialize Flask App and Model Variables
	# ==============================================================================
	# Flask application object; CORS is enabled so the frontend can call this API.
	app = Flask(__name__)
	CORS(app)

	# Module-level state that main() fills in after training.
	# Until then the model and its feature columns are None and the metrics are empty.
	model = train_features_columns = None
	evaluation_metrics = {}

	# ==============================================================================
	# Step 2: Core ML Functions (from your original script)
	# ==============================================================================
	def load_and_preprocess_data(csv_path):
	    """
	    Loads the training CSV and turns its feature columns into numeric data.

	    Steps: drop the 'Partner ID' column if present, one-hot encode every
	    non-numeric feature column (first category dropped), coerce the
	    remaining feature columns to numeric, and drop rows containing NaN.
	    The target column 'Creditworthy' is never encoded or coerced.

	    Args:
	        csv_path: Path of the CSV file to read.

	    Returns:
	        (DataFrame, target_column_name) on success, or (None, None) when
	        the file does not exist or has no 'Creditworthy' column (an error
	        line is printed in both cases).
	    """
	    try:
	        df = pd.read_csv(csv_path)
	    except FileNotFoundError:
	        print(f"Error: The file {csv_path} was not found.")
	        return None, None

	    target_column = 'Creditworthy'

	    # Fail here with a clear message instead of a KeyError later when the
	    # target is split off for training.
	    if target_column not in df.columns:
	        print(f"Error: The file {csv_path} has no '{target_column}' column.")
	        return None, None

	    # Drop columns that are not features for the model
	    df = df.drop(columns=['Partner ID'], errors='ignore')

	    # Non-numeric feature columns, whatever dtype pandas stores text in.
	    # The target is excluded so a text-valued label is not replaced by
	    # dummy columns.
	    categorical_cols = [
	        col for col, dtype in df.dtypes.items()
	        if col != target_column and not pd.api.types.is_numeric_dtype(dtype)
	    ]

	    # One-hot encode categorical features
	    if categorical_cols:
	        df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

	    # Ensure all remaining feature columns are numeric
	    for col in df.columns:
	        if col != target_column:
	            df[col] = pd.to_numeric(df[col], errors='coerce')

	    # Drop any rows that now have NaN values after the coercion
	    df = df.dropna()

	    return df, target_column

	def train_model(df, target_column):
	    """
	    Fits an XGBoost classifier on an 80/20 train/test split of the data.

	    Args:
	        df: Preprocessed DataFrame containing the features and the target.
	        target_column: Name of the label column in df.

	    Returns:
	        (fitted classifier, held-out features, held-out labels).
	    """
	    features = df.drop(columns=[target_column])
	    labels = df[target_column]

	    # Fixed random_state keeps the split reproducible between runs.
	    X_train, X_test, y_train, y_test = train_test_split(
	        features, labels, test_size=0.2, random_state=42
	    )

	    classifier = XGBClassifier(eval_metric='logloss')
	    classifier.fit(X_train, y_train)

	    return classifier, X_test, y_test

	def evaluate_model(model, X_test, y_test):
	    """
	    Scores the model on the held-out data.

	    Also prints Fairlearn per-group accuracy and selection rate when one of
	    the known gender column names is present in X_test; otherwise prints a
	    notice that no sensitive attribute was found.

	    Returns:
	        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1_score'.
	    """
	    y_pred = model.predict(X_test)

	    scorers = {
	        'accuracy': accuracy_score,
	        'precision': precision_score,
	        'recall': recall_score,
	        'f1_score': f1_score,
	    }
	    scores = {name: scorer(y_test, y_pred) for name, scorer in scorers.items()}

	    # The first matching candidate column is used as the sensitive attribute.
	    candidates = ['gender', 'Gender', 'partner_gender', 'Partner Gender']
	    sensitive_name = next((name for name in candidates if name in X_test.columns), None)

	    if sensitive_name is None:
	        print("No sensitive attribute found for group fairness metrics.")
	        return scores

	    mf = MetricFrame(
	        metrics={'accuracy': accuracy_score, 'selection_rate': selection_rate},
	        y_true=y_test,
	        y_pred=y_pred,
	        sensitive_features=X_test[sensitive_name],
	    )
	    print("\nFairness metrics by group (Fairlearn):")
	    print(mf.by_group)
	    return scores

	def preprocess_user_data(user_df, train_columns):
	    """
	    Prepares the user's data to match the format of the training data.

	    Args:
	        user_df: Raw request data, one row per record. Not modified.
	        train_columns: Feature column names the model was trained on, in
	            training order (any iterable of names, e.g. a pandas Index).

	    Returns:
	        A new DataFrame with exactly the columns in train_columns, in that
	        order. Dummy columns for categories absent from the request are
	        filled with 0; request columns unknown to the model are dropped.
	    """
	    train_columns = list(train_columns)
	    known = set(train_columns)
	    user_df = user_df.copy()

	    to_encode = []
	    for col, dtype in user_df.dtypes.items():
	        if pd.api.types.is_numeric_dtype(dtype):
	            continue
	        if col in known:
	            # The model expects this raw column as a number, so the request
	            # sent it as text (e.g. "12"). Parse it instead of one-hot
	            # encoding it; unparseable cells become NaN.
	            user_df[col] = pd.to_numeric(user_df[col], errors='coerce')
	        else:
	            to_encode.append(col)

	    if to_encode:
	        # drop_first must stay False here. With drop_first=True the first
	        # category found in the request is removed, so a single-row input
	        # loses its only category and every dummy ends up 0. The reindex
	        # below discards any dummy column the model was not trained on.
	        user_df = pd.get_dummies(user_df, columns=to_encode, drop_first=False)

	    # One step: add missing training columns as 0, drop extras, and restore
	    # the training column order.
	    return user_df.reindex(columns=train_columns, fill_value=0)

	# ==============================================================================
	# Step 2.5: New Function to Save Data to CSV
	# ==============================================================================
	def save_to_csv(data_df, filename='online_testcases.csv'):
	    """
	    Appends a DataFrame to a CSV file, skipping columns that hold no data.

	    The header row is written only when the file does not exist yet.

	    Args:
	        data_df: Rows to append.
	        filename: Destination CSV path.
	    """
	    label_col = 'Creditworthy'

	    # An entirely empty label column carries no information, so leave it out.
	    if label_col in data_df.columns and data_df[label_col].isna().all():
	        data_df = data_df.drop(columns=[label_col])

	    # The same applies to every other column that is completely NaN.
	    cleaned = data_df.dropna(axis=1, how='all')

	    write_header = not os.path.isfile(filename)
	    cleaned.to_csv(filename, mode='a', header=write_header, index=False)
	    print(f"Data successfully saved to {filename}")

	# ==============================================================================
	# Step 3: API Endpoint for Prediction (Single Input)
	# ==============================================================================
	@app.route('/predict', methods=['POST'])
	def predict():
	    """
	    Endpoint to receive a single user input, make a prediction, and return metrics.

	    Expects a JSON object of feature values in the request body.

	    Responses:
	        200: {'prediction': 'Eligible' | 'Not Eligible', 'metrics': {...}}
	        400: the body is not a non-empty JSON object, or validation failed.
	        500: the model is not loaded, or an unexpected error occurred.
	    """
	    # Only the model and its feature columns signal readiness: the metrics
	    # global is initialised to {} and is never None.
	    if model is None or train_features_columns is None:
	        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

	    # silent=True returns None for a missing or malformed JSON body, so it
	    # can be answered with a 400 below instead of surfacing as a 500.
	    user_input = request.get_json(silent=True)
	    if not isinstance(user_input, dict) or not user_input:
	        return jsonify({'error': 'Request body must be a non-empty JSON object.'}), 400

	    try:
	        # Input validation
	        valid, error_msg = validate_input(user_input)
	        if not valid:
	            return jsonify({'error': error_msg}), 400

	        user_df = pd.DataFrame([user_input])
	        # Preprocess a copy so the raw input can still be logged unchanged
	        user_features_processed = preprocess_user_data(user_df.copy(), train_features_columns)
	        # Make the prediction
	        prediction = model.predict(user_features_processed)
	        result = "Eligible" if prediction[0] == 1 else "Not Eligible"
	        # Add prediction to the original DataFrame for logging
	        user_df['Creditworthy_Prediction'] = result
	        # Save the original user input plus prediction to the CSV file
	        save_to_csv(user_df)
	        # Return the prediction and evaluation metrics
	        return jsonify({
	            'prediction': result,
	            'metrics': evaluation_metrics
	        })

	    except Exception as e:
	        # Log the full traceback server-side (as /predict_csv does) and
	        # return the message to the client.
	        import traceback
	        print(traceback.format_exc())
	        return jsonify({'error': str(e)}), 500

	# ==============================================================================
	# Step 4: API Endpoint for Bulk Prediction (CSV Upload)
	# ==============================================================================
	@app.route('/predict_csv', methods=['POST'])
	def predict_csv():
	    """
	    Endpoint to receive a CSV file, make bulk predictions, and return results.

	    Expects a multipart upload with the CSV under the form field 'file'.
	    A 'Creditworthy' column containing at least one value is treated as
	    ground truth and enables the fairness report.

	    Responses:
	        200: {'predictions': [...], 'metrics': {...},
	              'fairness_metrics': {...}, 'fairness_observation': str}
	        400: no file, unreadable CSV, or validation failed.
	        500: the model is not loaded, or an unexpected error occurred.
	    """
	    # Same readiness guard as /predict, so an untrained backend gives a
	    # clear message rather than an AttributeError on None.
	    if model is None or train_features_columns is None:
	        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

	    if 'file' not in request.files:
	        return jsonify({'error': 'No file part in the request'}), 400

	    upload = request.files['file']
	    if upload.filename == '':
	        return jsonify({'error': 'No selected file'}), 400

	    try:
	        # Read the CSV file from the request
	        csv_data = StringIO(upload.read().decode('utf-8'))
	        input_df = pd.read_csv(csv_data)

	        # An empty 'Creditworthy' column is not ground truth; treating it
	        # as such would feed NaN labels into the fairness metrics.
	        has_ground_truth = (
	            'Creditworthy' in input_df.columns
	            and bool(input_df['Creditworthy'].notna().any())
	        )
	        y_true = input_df['Creditworthy'] if has_ground_truth else None

	        # The label is never a feature, whether or not it holds values
	        input_df_features = input_df.drop(columns=['Creditworthy'], errors='ignore')

	        # Remove any other empty columns
	        input_df_features = input_df_features.dropna(axis=1, how='all')

	        # Input validation for all rows
	        valid, error_msg = validate_input(input_df_features)
	        if not valid:
	            return jsonify({'error': error_msg}), 400

	        # Preprocess the entire DataFrame
	        user_features_processed = preprocess_user_data(input_df_features.copy(), train_features_columns)
	        # Make the predictions
	        predictions = model.predict(user_features_processed)
	        # Add the predictions to the original DataFrame
	        input_df['Creditworthy_Prediction'] = np.where(predictions == 1, 'Eligible', 'Not Eligible')

	        # Remove any empty columns again before saving/returning
	        input_df = input_df.dropna(axis=1, how='all')

	        # Save the entire DataFrame to the CSV file
	        save_to_csv(input_df)

	        # --- Fairness & Bias Reporting ---
	        fairness_metrics = {}
	        fairness_observation = "Fairness metrics require ground truth labels and are not available for this upload."
	        if has_ground_truth:
	            if 'Partner Type' in input_df.columns:
	                fairness_metrics, fairness_observation = _fairness_report(input_df, y_true)
	            else:
	                # Labels are present here, so name the actual missing input.
	                fairness_observation = "Fairness metrics require a 'Partner Type' column and are not available for this upload."

	        # NaN is not valid JSON; emit null for missing cells instead.
	        results = input_df.astype(object).where(input_df.notna(), None).to_dict('records')
	        return jsonify({
	            'predictions': results,
	            'metrics': evaluation_metrics,
	            'fairness_metrics': fairness_metrics,
	            'fairness_observation': fairness_observation
	        })
	    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
	        # The upload itself is unreadable: a client error, not a server fault.
	        return jsonify({'error': f"Error processing file: {str(e)}"}), 400
	    except Exception as e:
	        import traceback
	        print(traceback.format_exc())
	        return jsonify({'error': f"Error processing file: {str(e)}"}), 500


	def _fairness_report(input_df, y_true, sensitive_col='Partner Type'):
	    """
	    Builds per-group fairness metrics for a batch of predictions.

	    Args:
	        input_df: Frame holding 'Creditworthy_Prediction' and sensitive_col.
	        y_true: Ground-truth labels sharing input_df's index; rows without
	            a label are left out of the report.
	        sensitive_col: Column whose values define the groups.

	    Returns:
	        (metrics dict, observation string). The dict maps 'selection_rate'
	        and 'equal_opportunity' to {group: value}.
	    """
	    labelled = y_true.notna()
	    y_true = y_true[labelled]
	    scored = input_df.loc[labelled]
	    y_pred = (scored['Creditworthy_Prediction'] == 'Eligible').astype(int)

	    # Text labels are mapped to 1/0 using the accepted positive spellings;
	    # numeric labels are used as they are.
	    if pd.api.types.is_numeric_dtype(y_true):
	        y_true_bin = y_true
	    else:
	        y_true_bin = y_true.map(lambda x: 1 if str(x).lower() in ['eligible', '1', 'true', 'yes'] else 0)

	    mf = MetricFrame(
	        metrics={
	            'selection_rate': selection_rate,
	            'equal_opportunity': true_positive_rate
	        },
	        y_true=y_true_bin,
	        y_pred=y_pred,
	        sensitive_features=scored[sensitive_col]
	    )
	    fairness_metrics = {
	        'selection_rate': mf.by_group['selection_rate'].to_dict(),
	        'equal_opportunity': mf.by_group['equal_opportunity'].to_dict()
	    }

	    # Compare the groups with the highest and lowest approval rate.
	    rates = mf.by_group['selection_rate']
	    max_group = rates.idxmax()
	    min_group = rates.idxmin()
	    diff = rates[max_group] - rates[min_group]
	    fairness_observation = f"{max_group} group approval rate is {diff:.2%} higher than {min_group} group."
	    if abs(diff) > 0.1:
	        fairness_observation += " Mitigation recommended: Consider reweighting or post-processing."
	    return fairness_metrics, fairness_observation


	# ==============================================================================
	# Step 5: Main function to train the model once and run the server
	# ==============================================================================
	def main():
	    """
	    Trains the model once, stores it in the module globals, and starts Flask.

	    Returns early, without starting the server, when the training data
	    could not be loaded.
	    """
	    global model, train_features_columns, evaluation_metrics

	    training_csv = 'catalyst_train.csv'

	    print("--- Starting the Nova Backend ---")
	    print("Step 1: Loading and preprocessing data...")
	    train_df, target_column = load_and_preprocess_data(training_csv)
	    if train_df is None:
	        print(f"Please ensure '{training_csv}' exists. Exiting.")
	        return

	    print("Step 2: Training the model and evaluating performance...")
	    model, X_test, y_test = train_model(train_df, target_column)
	    # Every column except the target, in training order.
	    train_features_columns = train_df.columns.drop(target_column)
	    evaluation_metrics = evaluate_model(model, X_test, y_test)

	    print("\nModel trained successfully! Metrics:")
	    for metric_name, score in evaluation_metrics.items():
	        print(f"- {metric_name.capitalize()}: {score:.4f}")

	    print("\n--- Starting Flask server on http://127.0.0.1:5000 ---")
	    # Serves the API for the frontend. The reloader is disabled.
	    # NOTE(review): debug=True turns on Flask's debug mode - confirm this is
	    # intended wherever the server is reachable by others.
	    app.run(debug=True, port=5000, use_reloader=False)


	if __name__ == "__main__":
	    main()