Update my_pages/multiverse.py
Browse files- my_pages/multiverse.py +79 -0
my_pages/multiverse.py
CHANGED
|
@@ -3,6 +3,16 @@ import streamlit as st
|
|
| 3 |
import plotly.graph_objects as go
|
| 4 |
from utils import add_navigation
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
choices_list = [
|
| 7 |
{"label": "Data Scaling", "options": [
|
| 8 |
"MinMax Scaler",
|
|
@@ -203,3 +213,72 @@ def render():
|
|
| 203 |
)
|
| 204 |
|
| 205 |
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import plotly.graph_objects as go
|
| 4 |
from utils import add_navigation
|
| 5 |
|
| 6 |
+
import random
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from sklearn.model_selection import train_test_split
|
| 11 |
+
from sklearn.preprocessing import MinMaxScaler, StandardScaler
|
| 12 |
+
from sklearn.neural_network import MLPClassifier
|
| 13 |
+
from sklearn.linear_model import SGDClassifier
|
| 14 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 15 |
+
|
| 16 |
choices_list = [
|
| 17 |
{"label": "Data Scaling", "options": [
|
| 18 |
"MinMax Scaler",
|
|
|
|
| 213 |
)
|
| 214 |
|
| 215 |
st.plotly_chart(fig, use_container_width=True)
|
| 216 |
+
|
| 217 |
+
##########################
|
| 218 |
+
##########################
|
| 219 |
+
##########################
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def split_and_scale(features_raw, features, label, group, test_split=0.2, preprocess_scale=True):
    """Partition all parallel arrays into train/test splits, optionally min-max scaling features.

    Args:
        features_raw: un-encoded feature table (kept for display purposes).
        features: numeric/encoded feature matrix used for model training.
        label: target vector, aligned row-for-row with `features`.
        group: protected-group vector, aligned row-for-row with `features`.
        test_split: fraction of rows held out for the test partition.
        preprocess_scale: when True, fit a MinMaxScaler on the training
            features only and apply it to both partitions.

    Returns:
        8-tuple: (X_raw_train, X_raw_test, X_train, X_test,
                  y_train, y_test, group_train, group_test).
    """
    # One call splits all four aligned arrays with the same row permutation;
    # random_state=0 keeps the partition reproducible across reruns.
    split = train_test_split(
        features_raw, features, label, group,
        test_size=test_split, random_state=0,
    )
    X_raw_train, X_raw_test, X_train, X_test, y_train, y_test, group_train, group_test = split

    if preprocess_scale:
        # Fit scaling parameters on the training partition only, so no
        # information from the test set leaks into preprocessing.
        scaler = MinMaxScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    return X_raw_train, X_raw_test, X_train, X_test, y_train, y_test, group_train, group_test
|
| 232 |
+
|
| 233 |
+
def get_stackoverflow_dataset(test_split=0.2, preprocess_scale=True):
    """Load the Stack Overflow survey CSV (Canada rows only) and return train/test arrays.

    Args:
        test_split: fraction of rows held out for testing (forwarded to split_and_scale).
        preprocess_scale: whether to min-max scale features (forwarded to split_and_scale).

    Returns:
        The 8-tuple produced by split_and_scale: (X_raw_train, X_raw_test,
        X_train, X_test, y_train, y_test, group_train, group_test).
    """
    # NOTE(review): assumes 'stackoverflow_full.csv' sits in the working
    # directory of the Streamlit process — confirm deployment layout.
    raw_data = pd.read_csv('stackoverflow_full.csv')
    raw_data = raw_data[raw_data['Country'] == 'Canada']

    feature_cols = [
        'Age', 'EdLevel', 'Employment', 'Gender', 'MainBranch',
        'YearsCode', 'YearsCodePro', 'PreviousSalary', 'ComputerSkills',
    ]
    features = raw_data[feature_cols]
    # Keep an un-encoded copy of the features before one-hot encoding.
    features_raw = features.copy()

    # One-hot encode the non-numeric columns.
    categorical_cols = ['Age', 'EdLevel', 'Employment', 'Gender', 'MainBranch']
    if categorical_cols:
        features = pd.get_dummies(features, columns=categorical_cols)

    label = np.array(raw_data['Employed'].astype(int))
    # Protected-group indicator derived from the one-hot 'Gender_Man' dummy
    # (cat.codes: 1 for man, 0 otherwise).
    group = features['Gender_Man'].astype('category').cat.codes

    features = np.array(features)
    label = np.array(label)
    group = np.array(group)

    return split_and_scale(features_raw, features, label, group, test_split, preprocess_scale)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
### Main Code Starts Here
# selected_path encodes the user's "multiverse" choices:
# [scaler name, model architecture label, training iterations, random seed].
# NOTE(review): `scaler` is unpacked but the dataset below is always loaded
# with the default MinMax scaling — confirm whether the choice should be applied.
scaler, arch, iterations, seed = selected_path[0], selected_path[1], int(selected_path[2]), int(selected_path[3])
random_seed = seed

# (Removed dead code: an unused `modellist` and a broken `modeltype` assignment
# that produced a 1-tuple followed by two no-op expression statements.)

X_raw_train, X_raw_test, X_train, X_test, y_train, y_test, group_train, group_test = get_stackoverflow_dataset()

placeholder = st.empty()
# Map the user-facing architecture label to a configured (untrained) estimator.
# BUG FIX: RandomForestClassifier has no `max_iter` parameter — passing it raised
# TypeError for every path, since this dict is built eagerly. The iteration count
# maps to the number of trees (`n_estimators`) for the forest instead.
modelclass_dict = {
    'Neural Network (Small)': MLPClassifier([10], random_state=random_seed, max_iter=iterations),
    'Logistic Regression': SGDClassifier(random_state=random_seed, max_iter=iterations),
    'Random Forest': RandomForestClassifier(random_state=random_seed, n_estimators=iterations),
}
model = modelclass_dict[arch]

placeholder.write("Training your model.")
model.fit(X_train, y_train)
placeholder.empty()

# Overall held-out accuracy, plus per-group accuracy
# (group_test == 1 is the 'Gender_Man' dummy; 0 otherwise).
acc = model.score(X_test, y_test)
acc_men = model.score(X_test[group_test == 1], y_test[group_test == 1])
acc_women = model.score(X_test[group_test == 0], y_test[group_test == 0])
# Disparity = absolute accuracy gap between the two groups.
disp = abs(acc_men - acc_women)

st.subheader("📊 Model Performance Metrics")
st.markdown(f"""
Your model was tested on a separate test dataset, and you achieved the following overall model accuracy as well as disparity in accuracy across men and women in the dataset.
""")
st.metric(label="Model Accuracy", value=f"{acc * 100:.1f}%")
st.metric(label="Gender Disparity in Accuracy", value=f"{disp * 100:.1f}%")
|