# Project_Nova / main.py
# Uploaded by AyushMann29 ("Upload main.py", revision 3908f31, verified).
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate
from sklearn.metrics import accuracy_score
from flask import Flask, request, jsonify
from flask_cors import CORS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from io import StringIO
import os
# ===============================================================================
# Input Validation Functions
# ===============================================================================
def validate_input(data, trips_col='Number of Trips', earnings_col='Earnings', min_trips=0, max_trips=1000, min_earnings=0, max_earnings=100000):
    """
    Validate trip counts and earnings against inclusive bounds.

    ``data`` may be a single record (``dict``) or a batch (``pandas.DataFrame``).
    A field that is absent (or ``None`` in a dict) is not checked. A value that
    cannot be compared with the numeric bounds (a string, a list, ...) is
    reported as invalid rather than raising. Any other type of ``data`` is
    accepted without checks.

    Returns (True, None) if valid, else (False, error_message). Trips are
    checked before earnings, and only the first failure is reported.
    """
    checks = (
        (trips_col, min_trips, max_trips, 'number of trips'),
        (earnings_col, min_earnings, max_earnings, 'earnings'),
    )

    if isinstance(data, dict):
        for column, low, high, label in checks:
            value = data.get(column)
            if value is None:
                continue
            try:
                out_of_range = value < low or value > high
            except (TypeError, ValueError):
                # e.g. a JSON string or list: not comparable with the bounds.
                return False, f"Invalid {label}: {value!r}. Must be a number between {low} and {high}."
            if out_of_range:
                return False, f"Invalid {label}: {value}. Must be between {low} and {high}."
    elif isinstance(data, pd.DataFrame):
        for column, low, high, label in checks:
            if column not in data.columns:
                continue
            # Coerce so that an object-dtype column (one stray text cell is
            # enough) is validated instead of raising on comparison.
            numeric = pd.to_numeric(data[column], errors='coerce')
            not_a_number = data[column].notna() & numeric.isna()
            out_of_range = (numeric < low) | (numeric > high)
            bad_rows = data[not_a_number | out_of_range].index.tolist()
            if bad_rows:
                return False, f"Invalid {label} in rows: {bad_rows}"
    return True, None
# ==============================================================================
# Step 1: Initialize Flask App and Model Variables
# ==============================================================================
# WSGI application object; the route decorators further down register on it.
app = Flask(__name__)
CORS(app) # Enable CORS to allow the frontend to access this API
# Module-level state shared by the request handlers. main() rebinds all three
# after training; until then `model` and `train_features_columns` are None and
# `evaluation_metrics` is an empty dict (not None).
model = None
train_features_columns = None
evaluation_metrics = {}
# ==============================================================================
# Step 2: Core ML Functions (from your original script)
# ==============================================================================
def load_and_preprocess_data(csv_path):
    """
    Read the training CSV and convert it into an all-numeric frame.

    Returns (df, target_column), or (None, None) when the file is missing.
    """
    label_name = 'Creditworthy'

    try:
        frame = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: The file {csv_path} was not found.")
        return None, None

    # 'Partner ID' is not a model feature; it is fine if the column is absent.
    frame = frame.drop(columns=['Partner ID'], errors='ignore')

    # One-hot encode every object-dtype column, dropping the first level.
    text_columns = list(frame.select_dtypes(include=['object']).columns)
    frame = pd.get_dummies(frame, columns=text_columns, drop_first=True)

    # Coerce every feature to numeric; unparseable cells become NaN.
    feature_names = [name for name in frame.columns if name != label_name]
    for name in feature_names:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')

    # Rows containing any NaN are discarded.
    return frame.dropna(), label_name
def train_model(df, target_column):
    """
    Fit an XGBoost classifier on an 80/20 split of ``df``.

    Returns (model, X_test, y_test); the held-out part is returned so the
    caller can evaluate the fitted model.
    """
    labels = df[target_column]
    features = df.drop(columns=[target_column])

    # Fixed random_state so the split is the same on every run.
    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    classifier = XGBClassifier(eval_metric='logloss')
    classifier.fit(X_train, y_train)
    return classifier, X_test, y_test
def evaluate_model(model, X_test, y_test):
    """
    Score the model on the held-out split.

    Returns a dict with 'accuracy', 'precision', 'recall' and 'f1_score'.
    Per-group fairness metrics are only printed, and only when one of the
    known sensitive-attribute columns is present in ``X_test``.
    """
    predicted = model.predict(X_test)

    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    results = {name: scorer(y_test, predicted) for name, scorer in scorers}

    # The first matching column name wins.
    candidate_names = ('gender', 'Gender', 'partner_gender', 'Partner Gender')
    sensitive_name = next(
        (name for name in candidate_names if name in X_test.columns), None
    )

    if sensitive_name is None:
        print("No sensitive attribute found for group fairness metrics.")
        return results

    group_frame = MetricFrame(
        metrics={'accuracy': accuracy_score, 'selection_rate': selection_rate},
        y_true=y_test,
        y_pred=predicted,
        sensitive_features=X_test[sensitive_name],
    )
    print("\nFairness metrics by group (Fairlearn):")
    print(group_frame.by_group)
    return results
def preprocess_user_data(user_df, train_columns):
    """
    Shape user data into exactly the feature columns the model was trained on.

    Object-dtype columns are one-hot encoded, columns unknown to the model are
    discarded, training columns absent from the input are added as 0, and the
    result is ordered like ``train_columns``.

    Returns a new DataFrame; ``user_df`` itself is not modified.
    """
    categorical_cols = user_df.select_dtypes(include=['object']).columns.tolist()

    # Encode EVERY level here. With drop_first=True the level that gets dropped
    # is the first one present in *this* input, not the training baseline: a
    # single-row request would lose its only level and always be scored as the
    # baseline category. The training baseline level produces a column that is
    # not in train_columns, so the reindex below removes it again.
    encoded = pd.get_dummies(user_df, columns=categorical_cols, drop_first=False)

    # One step: drop extras, add missing training columns as 0, fix the order.
    return encoded.reindex(columns=list(train_columns), fill_value=0)
# ==============================================================================
# Step 2.5: New Function to Save Data to CSV
# ==============================================================================
def save_to_csv(data_df, filename='online_testcases.csv'):
    """
    Append a DataFrame to a CSV log, keeping every row aligned with the header.

    Columns that are entirely NaN (such as an empty 'Creditworthy') are removed
    before saving. Three cases are handled:

    * the file is missing or has no header yet: write header + rows;
    * every column of ``data_df`` already exists in the file: append the rows
      in the file's column order (absent columns become empty cells);
    * ``data_df`` has columns the file lacks: rewrite the file with the union
      of columns so earlier rows are not shifted under the wrong header.
    """
    # how='all' also covers an all-NaN 'Creditworthy' column.
    data_df = data_df.dropna(axis=1, how='all')

    existing_columns = []
    if os.path.isfile(filename) and os.path.getsize(filename) > 0:
        try:
            existing_columns = pd.read_csv(filename, nrows=0).columns.tolist()
        except pd.errors.EmptyDataError:
            # File holds only whitespace: treat it as having no header.
            existing_columns = []

    if not existing_columns:
        data_df.to_csv(filename, mode='w', header=True, index=False)
    elif set(data_df.columns) <= set(existing_columns):
        aligned = data_df.reindex(columns=existing_columns)
        aligned.to_csv(filename, mode='a', header=False, index=False)
    else:
        # Read the old rows as plain text so they are written back unchanged.
        existing = pd.read_csv(filename, dtype=str, keep_default_na=False)
        combined = pd.concat(
            [existing, data_df.astype(object)], ignore_index=True, sort=False
        )
        combined.to_csv(filename, mode='w', header=True, index=False)

    print(f"Data successfully saved to {filename}")
# ==============================================================================
# Step 3: API Endpoint for Prediction (Single Input)
# ==============================================================================
@app.route('/predict', methods=['POST'])
def predict():
    """
    Score a single JSON record and return the prediction with model metrics.

    Responses:
        200: {'prediction': 'Eligible' | 'Not Eligible', 'metrics': {...}}
        400: body is not a JSON object, or a value fails validate_input
        500: model not trained, or an unexpected error while scoring/logging
    """
    # `evaluation_metrics` starts as {} (never None), so at startup only the
    # first two checks can trigger; the third guards against an explicit reset.
    if model is None or train_features_columns is None or evaluation_metrics is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

    # silent=True returns None for a missing/malformed body or wrong content
    # type instead of raising, so bad requests get a 400 rather than a 500.
    user_input = request.get_json(silent=True)
    if not isinstance(user_input, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    try:
        # Input validation
        valid, error_msg = validate_input(user_input)
        if not valid:
            return jsonify({'error': error_msg}), 400

        user_df = pd.DataFrame([user_input])
        # Preprocess a copy so the logged frame keeps the raw user values
        user_features_processed = preprocess_user_data(user_df.copy(), train_features_columns)

        prediction = model.predict(user_features_processed)
        result = "Eligible" if prediction[0] == 1 else "Not Eligible"

        # Log the original input together with the prediction
        user_df['Creditworthy_Prediction'] = result
        save_to_csv(user_df)

        return jsonify({
            'prediction': result,
            'metrics': evaluation_metrics
        })
    except Exception as e:
        # Boundary handler: report the failure to the client instead of crashing
        return jsonify({'error': str(e)}), 500
# ==============================================================================
# Step 4: API Endpoint for Bulk Prediction (CSV Upload)
# ==============================================================================
def _json_safe_records(frame):
    """Return ``frame`` as a list of row dicts with missing values as None."""
    # NaN would be serialised as the bare token NaN, which is not valid JSON
    # and makes browser-side parsing of the whole response fail.
    return frame.astype(object).where(frame.notna(), None).to_dict('records')


def _fairness_report(scored_df, y_true, sensitive_col='Partner Type'):
    """
    Compute per-group selection rate and true-positive rate for a scored upload.

    ``scored_df`` must contain 'Creditworthy_Prediction'; ``y_true`` holds the
    ground-truth labels. Returns (fairness_metrics, fairness_observation).
    """
    if sensitive_col not in scored_df.columns:
        return {}, (
            f"Fairness metrics require a '{sensitive_col}' column and are not "
            "available for this upload."
        )

    y_pred = (scored_df['Creditworthy_Prediction'] == 'Eligible').astype(int)
    # If Creditworthy is stored as text, convert it to binary
    if y_true.dtype == object:
        y_true_bin = y_true.map(lambda x: 1 if str(x).lower() in ['eligible', '1', 'true', 'yes'] else 0)
    else:
        y_true_bin = y_true

    mf = MetricFrame(
        metrics={
            'selection_rate': selection_rate,
            'equal_opportunity': true_positive_rate
        },
        y_true=y_true_bin,
        y_pred=y_pred,
        sensitive_features=scored_df[sensitive_col]
    )
    by_group = mf.by_group
    fairness_metrics = {
        'selection_rate': by_group['selection_rate'].to_dict(),
        'equal_opportunity': by_group['equal_opportunity'].to_dict()
    }

    # Compare the groups with the highest and lowest approval rate
    rates = by_group['selection_rate']
    max_group = rates.idxmax()
    min_group = rates.idxmin()
    diff = rates[max_group] - rates[min_group]
    observation = f"{max_group} group approval rate is {diff:.2%} higher than {min_group} group."
    if abs(diff) > 0.1:
        observation += " Mitigation recommended: Consider reweighting or post-processing."
    return fairness_metrics, observation


@app.route('/predict_csv', methods=['POST'])
def predict_csv():
    """
    Score every row of an uploaded CSV (multipart field 'file').

    Responses:
        200: {'predictions': [...], 'metrics': {...}, 'fairness_metrics': {...},
              'fairness_observation': str}
        400: no file, unreadable/empty/malformed CSV, or validation failure
        500: model not trained, or an unexpected error while scoring
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    file = request.files['file']
    if not file.filename:
        return jsonify({'error': 'No selected file'}), 400
    # Same guard as /predict: without it an untrained model surfaces as an
    # opaque AttributeError from model.predict.
    if model is None or train_features_columns is None:
        return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500

    # A file that cannot be decoded or parsed is a client error, not a server one.
    try:
        # utf-8-sig also accepts plain UTF-8 and strips a leading BOM if present
        csv_data = StringIO(file.read().decode('utf-8-sig'))
        input_df = pd.read_csv(csv_data)
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        return jsonify({'error': f"Error processing file: {str(e)}"}), 400

    try:
        # Ground truth, when present, is kept aside and never used as a feature
        has_ground_truth = 'Creditworthy' in input_df.columns
        if has_ground_truth:
            y_true = input_df['Creditworthy']
            input_df_features = input_df.drop(columns=['Creditworthy'])
        else:
            input_df_features = input_df
        # Remove any other empty columns
        input_df_features = input_df_features.dropna(axis=1, how='all')

        # Input validation for all rows
        valid, error_msg = validate_input(input_df_features)
        if not valid:
            return jsonify({'error': error_msg}), 400

        user_features_processed = preprocess_user_data(input_df_features.copy(), train_features_columns)
        predictions = model.predict(user_features_processed)

        # Attach predictions to the uploaded rows, then drop empty columns
        input_df['Creditworthy_Prediction'] = np.where(predictions == 1, 'Eligible', 'Not Eligible')
        input_df = input_df.dropna(axis=1, how='all')
        save_to_csv(input_df)

        # --- Fairness & Bias Reporting ---
        fairness_metrics = {}
        fairness_observation = "Fairness metrics require ground truth labels and are not available for this upload."
        if has_ground_truth:
            fairness_metrics, fairness_observation = _fairness_report(input_df, y_true)

        return jsonify({
            'predictions': _json_safe_records(input_df),
            'metrics': evaluation_metrics,
            'fairness_metrics': fairness_metrics,
            'fairness_observation': fairness_observation
        })
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        return jsonify({'error': f"Error processing file: {str(e)}"}), 500
# ==============================================================================
# Step 5: Main function to train the model once and run the server
# ==============================================================================
def main(csv_path='catalyst_train.csv', port=5000, debug=None):
    """
    Train the model once, publish it in the module globals, and run Flask.

    Args:
        csv_path: training CSV passed to load_and_preprocess_data.
        port: TCP port the development server listens on.
        debug: enable Flask debug mode. When None (default) it is read from
            the FLASK_DEBUG environment variable and is off if unset. Debug
            mode enables the Werkzeug interactive debugger, which allows
            arbitrary code execution, so it must be an explicit opt-in.

    Returns None; returns early (without starting the server) if the training
    file cannot be loaded.
    """
    global model, train_features_columns, evaluation_metrics

    if debug is None:
        debug = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')

    print("--- Starting the Nova Backend ---")
    print("Step 1: Loading and preprocessing data...")
    train_df, target_column = load_and_preprocess_data(csv_path)
    if train_df is None:
        print(f"Please ensure '{csv_path}' exists. Exiting.")
        return

    print("Step 2: Training the model and evaluating performance...")
    model, X_test, y_test = train_model(train_df, target_column)
    # Remember the exact feature layout so request data can be aligned to it
    train_features_columns = train_df.drop(columns=[target_column]).columns
    evaluation_metrics = evaluate_model(model, X_test, y_test)

    print("\nModel trained successfully! Metrics:")
    for key, value in evaluation_metrics.items():
        print(f"- {key.capitalize()}: {value:.4f}")

    print(f"\n--- Starting Flask server on http://127.0.0.1:{port} ---")
    # The reloader is disabled so the model is not trained twice
    app.run(debug=debug, port=port, use_reloader=False)


if __name__ == "__main__":
    main()