| |
| import streamlit as st |
| import plotly.graph_objects as go |
| from utils import add_navigation, add_instruction_text, add_red_text |
|
|
| import random |
| import pandas as pd |
| import numpy as np |
|
|
| from sklearn.model_selection import train_test_split |
| from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler |
| from sklearn.neural_network import MLPClassifier |
| from sklearn.linear_model import SGDClassifier |
| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.tree import DecisionTreeClassifier |
| from sklearn.preprocessing import LabelEncoder |
| from sklearn.feature_selection import SelectKBest, f_classif |
| from sklearn.decomposition import PCA |
|
|
# Stages of the model "multiverse" shown to the user.  Each stage offers
# mutually exclusive options; picking one option per stage defines a single
# model-training pipeline (a path through the decision tree).
choices_list = [
    {
        "label": "Data Scaling",
        "options": ["MinMax Scaler", "Standard Scaler", "Robust Scaler"],
    },
    {
        "label": "Feature Selection",
        "options": ["Select K Best (k=5)", "PCA (n=5)", "All Features"],
    },
    {
        "label": "Model Architecture",
        "options": ["Logistic Regression", "Decision Tree", "Neural Network (Small)"],
    },
    {
        # Seeds are kept as strings because they feed a selectbox widget.
        "label": "Random Seed",
        "options": [str(seed) for seed in range(1, 11)],
    },
]
|
|
def build_tree_and_trace_path(selected_path, spread_factor=10000, choices=None):
    """
    Build the full decision-tree layout for the choice stages and trace one path.

    The tree has a single "Start" root; every stage in ``choices`` adds one
    child per option under every node of the previous stage.  ``selected_path``
    (one chosen option label per stage) is then walked down from the root,
    collecting the nodes and edges to highlight.  Tracing stops early if a
    label is not found among the current node's children.

    Parameters
    ----------
    selected_path : list of str
        One option label per stage; the path to highlight.
    spread_factor : float, default 10000
        Controls vertical spread: larger values give more separation at the
        shallow stages, with separation shrinking as depth increases.
        (The previous docstring incorrectly stated a default of 5.0.)
    choices : list of dict, optional
        Stage definitions of the form {"label": ..., "options": [...]}.
        Defaults to the module-level ``choices_list``.

    Returns
    -------
    tuple
        (node_labels, node_positions, edges, highlight_edges, highlight_nodes)
        where positions are (x, y) float pairs, edges are
        (parent_index, child_index) pairs, and the highlight_* values are sets.
    """
    if choices is None:
        choices = choices_list

    node_labels = ["Start"]
    node_positions = [(0.0, 0.0)]
    edges = []

    prev_nodes = [0]          # node indices produced by the previous stage
    y_spacing_base = 1.0

    # ---- build all nodes, stage by stage --------------------------------
    for stage_idx, stage in enumerate(choices, start=1):
        options = stage["options"]
        next_nodes = []

        # Vertical scale shrinks with depth so deep stages stay compact.
        scale = spread_factor ** (1.0 / stage_idx ** 0.2)

        for parent_order, parent_idx in enumerate(prev_nodes):
            _, parent_y = node_positions[parent_idx]

            # Center each parent's block of children around its own row so
            # sibling blocks do not overlap.
            parent_block_size = len(options) * y_spacing_base * scale
            base_y = parent_y + (parent_order - (len(prev_nodes) - 1) / 2.0) * parent_block_size

            for opt_idx, opt in enumerate(options):
                # Children are spread symmetrically around base_y.
                offset = (opt_idx - (len(options) - 1) / 2.0) * (y_spacing_base * scale)
                node_index = len(node_labels)
                node_labels.append(opt)
                node_positions.append((float(stage_idx), base_y + offset))
                edges.append((parent_idx, node_index))
                next_nodes.append(node_index)

        prev_nodes = next_nodes

    # ---- trace the selected path from the root --------------------------
    highlight_edges = set()
    highlight_nodes = {0}
    current_node = 0

    for chosen_label in selected_path:
        children = [dst for (src, dst) in edges if src == current_node]
        found_child = next((c for c in children if node_labels[c] == chosen_label), None)
        if found_child is None:
            break  # label not present under this node; stop highlighting
        highlight_edges.add((current_node, found_child))
        highlight_nodes.add(found_child)
        current_node = found_child

    return node_labels, node_positions, edges, highlight_edges, highlight_nodes
|
|
|
|
|
|
def render():
    """Render the multiverse page: per-stage choice widgets, the decision-tree
    visualization with the chosen path highlighted, and (below) the trained
    model's comparison against the rest of the multiverse."""
    add_navigation("txt_multiverse", "txt_conclusion")

    # Intro copy shown at the top of the page (HTML line breaks).
    add_instruction_text(
        """
        Visually explore the multiverse of AI models to judge loan applications.<br>
        We are using a publicly available loan approval dataset.<br>
        Make a choice, and scroll down to see the properties of the trained model.<br>
        Not sure what choice to make? Just pick something and see what happens.
        """
    )

    # One selectbox per pipeline stage, alternating between two equal columns.
    cols_list = st.columns([1, 1])
    selected_path = []
    for ite, stage in enumerate(choices_list):
        with cols_list[ite%2]:
            # Widget key is derived from the stage label so Streamlit keeps
            # each stage's selection stable across reruns.
            key = f"multiverse_choice_{stage['label']}"
            choice = st.selectbox(f"{stage['label']}", stage["options"], key=key)
            selected_path.append(choice)

    # Build the full choice tree plus the highlight info for the current path.
    labels, positions, edges, highlight_edges, highlight_nodes = build_tree_and_trace_path(selected_path)

    x_vals = [pos[0] for pos in positions]
    y_vals = [pos[1] for pos in positions]

    # Node colors: green = on the selected path, blue = root ("Start"),
    # translucent light blue = every other node.
    node_colors = []
    for idx in range(len(labels)):
        if idx in highlight_nodes:
            node_colors.append("rgba(34,139,34,0.95)")
        elif idx == 0:
            node_colors.append("rgba(30,144,255,0.9)")
        else:
            node_colors.append("rgba(135,206,250,0.6)")

    # All nodes in a single scatter trace; labels appear on hover only.
    node_trace = go.Scatter(
        x=x_vals, y=y_vals,
        mode='markers',
        text=labels,
        marker=dict(size=18, color=node_colors, line=dict(width=1, color='black')),
        hoverinfo="text"
    )

    # One line trace per edge; edges on the selected path are green and bold.
    edge_traces = []
    for src, dst in edges:
        if (src, dst) in highlight_edges:
            color = "rgba(0,128,0,0.9)"
            width = 4
        else:
            color = "rgba(120,120,120,0.4)"
            width = 1.5
        edge_traces.append(go.Scatter(
            x=[positions[src][0], positions[dst][0]],
            y=[positions[src][1], positions[dst][1]],
            mode='lines',
            line=dict(width=width, color=color),
            hoverinfo='none'
        ))

    # Stage-name labels, placed above the topmost node of each stage column.
    stage_label_traces = []
    for stage_idx, stage in enumerate(choices_list, start=1):
        # Nodes belonging to this stage share x == stage_idx by construction.
        stage_nodes = [i for i, (x, y) in enumerate(positions) if x == float(stage_idx)]
        if not stage_nodes:
            continue
        max_y = max(positions[i][1] for i in stage_nodes)
        x = float(stage_idx)
        # NOTE(review): fixed 20000 offset appears tuned to the default
        # spread_factor of build_tree_and_trace_path — confirm it still
        # clears the nodes if that default ever changes.
        y = max_y + 20000
        stage_label_traces.append(go.Scatter(
            x=[x], y=[y],
            text=[stage["label"]],
            mode="text",
            textfont=dict(size=16, color="white"),
            hoverinfo="none",
            showlegend=False
        ))

    # Compose figure: edges first (bottom), stage labels, then nodes on top;
    # dark theme with hidden axes so only the tree is visible.
    fig = go.Figure(data=edge_traces + stage_label_traces + [node_trace])
    fig.update_layout(
        showlegend=False,
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        paper_bgcolor='black',
        plot_bgcolor='black',
        font=dict(color='white'),
        margin=dict(l=10, r=10, t=10, b=10),
        hovermode="closest"
    )

    st.plotly_chart(fig, use_container_width=True)
|
| def split_and_scale(features, label, test_split=0.2, preprocess_scale=None): |
| X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_split, random_state=0) |
| |
| if preprocess_scale is not None: |
| if preprocess_scale=="MinMax Scaler": |
| scaler = MinMaxScaler() |
| elif preprocess_scale=="Standard Scaler": |
| scaler = StandardScaler() |
| elif preprocess_scale=="Robust Scaler": |
| scaler = RobustScaler() |
| scaler.fit(X_train) |
| X_train, X_test = scaler.transform(X_train), scaler.transform(X_test) |
| |
| return X_train, X_test, y_train, y_test |
| |
    def get_stackoverflow_dataset(test_split=0.2, preprocess_scale=None):
        """
        Load the loan-approval CSV and return a train/test split via
        ``split_and_scale``.

        NOTE(review): despite the name, this reads ``loan_approval_dataset.csv``
        (loan approvals), not Stack Overflow data — name kept to avoid breaking
        callers, but consider renaming.

        Parameters
        ----------
        test_split : float, default 0.2
            Fraction of samples held out for testing.
        preprocess_scale : str or None
            Scaler name forwarded to ``split_and_scale``.

        Returns
        -------
        X_train, X_test, y_train, y_test
        """
        data = pd.read_csv('loan_approval_dataset.csv')

        # Column names in this CSV carry a leading space (" loan_status" etc.)
        # — the strings below must match them exactly.
        features = data.drop(columns=["loan_id", " loan_status"])
        # One-hot encode the two categorical columns, dropping one level each
        # to avoid redundant (collinear) dummy columns.
        features = pd.get_dummies(features, columns=[" education", " self_employed"], drop_first=True).values

        # Encode the target strings (loan status) as integer class labels.
        le = LabelEncoder()
        label = le.fit_transform(data[" loan_status"])

        # .values already yields arrays; np.array is a cheap re-wrap here.
        features, label = np.array(features), np.array(label)

        return split_and_scale(features, label, test_split, preprocess_scale)
| |
| def model_train_and_pred(scaler, feature_sel, arch, seed): |
| X_train, X_test, y_train, y_test = get_stackoverflow_dataset(preprocess_scale=scaler) |
|
|
| if feature_sel=="Select K Best (k=5)": |
| selector = SelectKBest(score_func=f_classif, k=5) |
| X_train = selector.fit_transform(X_train, y_train) |
| X_test = selector.transform(X_test) |
| elif feature_sel=="PCA (n=5)": |
| pca = PCA(n_components=2) |
| X_train = pca.fit_transform(X_train, y_train) |
| X_test = pca.transform(X_test) |
| |
| modelclass_dict = {'Neural Network (Small)': MLPClassifier([10], random_state=seed, max_iter=500), |
| 'Logistic Regression': SGDClassifier(random_state=seed, max_iter=500), |
| 'Decision Tree': DecisionTreeClassifier(random_state=seed)} |
| model = modelclass_dict[arch] |
| model.fit(X_train, y_train) |
| |
| y_pred = model.predict(X_test) |
| return y_pred |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| scaler, feature_sel, arch, seed = selected_path[0], selected_path[1], selected_path[2], int(selected_path[3]) |
| y_pred = model_train_and_pred(scaler, feature_sel, arch, seed) |
| all_preds_numpy = np.load("all_predictions.npy") |
|
|
| prop_ones = np.mean(all_preds_numpy == 1, axis=0) |
| condition_rej = (y_pred == 0) & (prop_ones > 0.5) |
| |
| uniq_count_rej = np.sum(condition_rej) |
|
|
| |
| condition_acc = (y_pred == 1) & (prop_ones < 0.5) |
| uniq_count_acc = np.sum(condition_acc) |
| |
| add_red_text(f""" |
| <b>Based on your choices:</b><br> |
| Number of loans accepted by the majority, but rejected by you: {uniq_count_rej}<br> |
| Number of loans rejected by the majority, but accepted by you: {uniq_count_acc}<br><br> |
| <b>Reasons you might want to conform:</b><br> |
| To take lower risks and to avoid facing a justification crisis, i.e., |
| not able to explain why you rejected an applicant who would have been accepted by most other models.<br><br> |
| <b>Reasons you might want to be unique:</b><br> |
| To avoid competiting for the same loan applicants with others.<br> |
| To give a chance to unique applicants and deal with the concerns of homogenization.<br><br> |
| """) |