File size: 12,143 Bytes
ce9a7cc
67606c8
 
f0db6f0
67606c8
754f471
 
 
 
 
ea5b2dd
754f471
 
 
ea5b2dd
cf1521b
ea5b2dd
 
754f471
147708f
930fa5e
 
ea5b2dd
 
 
 
 
 
 
147708f
930fa5e
 
ea5b2dd
930fa5e
147708f
930fa5e
 
147708f
 
 
0be3098
ce9a7cc
 
 
5be171f
 
 
 
 
 
 
 
 
 
 
 
ce9a7cc
0be3098
1bb56b0
ce9a7cc
2a198b1
ce9a7cc
1bb56b0
ce9a7cc
 
1bb56b0
ce9a7cc
147708f
930fa5e
ce9a7cc
 
5be171f
0be3098
4f4f1a5
9826556
1bb56b0
9826556
4f4f1a5
 
9826556
 
1bb56b0
ce9a7cc
9826556
4f4f1a5
9826556
 
1bb56b0
 
 
 
 
ce9a7cc
1bb56b0
ce9a7cc
 
 
2a198b1
ce9a7cc
2a198b1
ce9a7cc
 
 
 
 
 
 
 
 
 
 
 
 
2a198b1
 
1bb56b0
ce9a7cc
5be171f
1bb56b0
0adaec1
9d8a2eb
 
 
f0db6f0
5d12644
cd4d1bc
70e9e71
147708f
 
 
ce9a7cc
0adaec1
67606c8
0adaec1
 
 
 
 
 
67606c8
ce9a7cc
 
1bb56b0
ce9a7cc
1bb56b0
 
ce9a7cc
 
 
 
 
 
 
 
 
1bb56b0
 
 
4f4f1a5
1bb56b0
4f4f1a5
ce9a7cc
 
1bb56b0
 
 
 
ce9a7cc
 
 
 
 
 
1bb56b0
 
 
 
 
 
 
 
d9ec769
 
 
95e63f4
 
d9ec769
 
95e63f4
d9ec769
 
cd6e32f
d9ec769
 
 
 
 
 
 
 
 
95e63f4
ce9a7cc
d9ec769
 
1bb56b0
 
 
 
f80acf5
 
 
 
dd9ea9c
ce9a7cc
 
1bb56b0
67606c8
 
754f471
 
 
 
 
 
ea5b2dd
cf1521b
754f471
ea5b2dd
 
 
 
 
 
 
754f471
 
 
cf1521b
754f471
ea5b2dd
cf1521b
 
 
 
754f471
cf1521b
 
754f471
cf1521b
754f471
cf1521b
754f471
4c9cd31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754f471
4c9cd31
 
f0db6f0
58c7241
 
 
 
 
 
 
 
 
 
 
 
fb21fe1
58c7241
 
 
 
fb21fe1
58c7241
 
 
 
 
 
 
 
 
35bdbcb
58c7241
35bdbcb
2a64223
dd765d7
 
 
 
 
 
 
4c9cd31
f0db6f0
ff83251
5d12644
 
cd4d1bc
5d12644
 
cd4d1bc
5d12644
 
f0db6f0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
# pages/multiverse.py
import streamlit as st
import plotly.graph_objects as go
from utils import add_navigation, add_instruction_text, add_red_text

import random
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

# The four decision stages of the modeling "multiverse".  Each dict is one
# stage with a display label and its mutually exclusive options; the list
# order defines both the left-to-right layer order of the tree drawn by
# build_tree_and_trace_path() and the element order of selected_path.
choices_list = [
    {"label": "Data Scaling", "options": [
        "MinMax Scaler",
        "Standard Scaler",
        "Robust Scaler"
    ]},
    {"label": "Feature Selection", "options": [
        "Select K Best (k=5)",
        "PCA (n=5)",
        "All Features"
    ]},
    {"label": "Model Architecture", "options": [
        "Logistic Regression",
        "Decision Tree",
        "Neural Network (Small)"
    ]},
    # Seeds are kept as strings because they come straight from a selectbox;
    # render() converts the chosen seed to int before training.
    {"label": "Random Seed", "options": [
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"
    ]}
]

def build_tree_and_trace_path(selected_path, spread_factor=10000, choices=None):
    """
    Build tree nodes and edges for the multiverse of choices, then trace
    ``selected_path`` (one choice per stage) by walking children of the
    current node to find the matching label at each stage.

    Parameters
    ----------
    selected_path : list of str
        Option labels to highlight, one per stage, in stage order.
    spread_factor : float (default=10000)
        Controls vertical spread. Larger values => more separation early,
        less separation as depth increases.
    choices : list of dict, optional
        Stages to build the tree from; each dict needs a "label" and an
        "options" list. Defaults to the module-level ``choices_list``.

    Returns
    -------
    node_labels : list of str
        Label per node; node 0 is "Start".
    node_positions : list of (float, float)
        (x, y) per node; x equals the 1-based stage index (0.0 for Start).
    edges : list of (int, int)
        (parent_index, child_index) pairs in creation order.
    highlight_edges : set of (int, int)
        Edges along the selected path.
    highlight_nodes : set of int
        Nodes along the selected path (always contains node 0).
    """
    stages = choices_list if choices is None else choices

    node_labels = ["Start"]
    node_positions = [(0.0, 0.0)]
    edges = []

    prev_nodes = [0]  # nodes at previous stage (just the start node)
    y_spacing_base = 1.0

    # Build nodes and edges stage by stage (left to right).
    for stage_idx, stage in enumerate(stages, start=1):
        options = stage["options"]
        next_nodes = []

        # scaling: huge spread at stage 1, tapering off deeper
        scale = spread_factor ** (1.0 / stage_idx**(0.2))

        for parent_order, parent_idx in enumerate(prev_nodes):
            _, parent_y = node_positions[parent_idx]

            # each parent gets its own block of vertical space, centered
            # around the middle of the previous layer
            parent_block_size = len(options) * y_spacing_base * scale
            base_y = parent_y + (parent_order - (len(prev_nodes) - 1) / 2.0) * parent_block_size

            for opt_idx, opt in enumerate(options):
                child_x = float(stage_idx)
                offset = (opt_idx - (len(options) - 1) / 2.0) * (y_spacing_base * scale)
                child_y = base_y + offset

                node_index = len(node_labels)
                node_labels.append(opt)
                node_positions.append((child_x, child_y))
                edges.append((parent_idx, node_index))
                next_nodes.append(node_index)

        prev_nodes = next_nodes

    # Trace the single chosen path by walking children; stop early if a
    # chosen label is not found among the current node's children.
    highlight_edges = set()
    highlight_nodes = {0}
    current_node = 0

    for chosen_label in selected_path:
        found_child = next(
            (dst for (src, dst) in edges
             if src == current_node and node_labels[dst] == chosen_label),
            None,
        )
        if found_child is None:
            break
        highlight_edges.add((current_node, found_child))
        highlight_nodes.add(found_child)
        current_node = found_child

    return node_labels, node_positions, edges, highlight_edges, highlight_nodes



def render():
    """Render the multiverse page.

    Draws a Plotly decision tree of the pipeline choices, trains the model
    corresponding to the user's selections, and compares its test-set
    predictions against precomputed predictions of all other pipelines.

    Side effects: Streamlit widgets; reads 'loan_approval_dataset.csv' and
    'all_predictions.npy' from the working directory; trains a scikit-learn
    model on every rerun.
    """
    add_navigation("txt_multiverse", "txt_conclusion")

    add_instruction_text(
        """
        Visually explore the multiverse of AI models to judge loan applications.<br> 
        We are using a publicly available loan approval dataset.<br>
        Make a choice, and scroll down to see the properties of the trained model.<br>
        Not sure what choice to make? Just pick something and see what happens.
        """
    )

    # --- User picks one choice per stage via dropdowns (two columns) ---
    cols_list = st.columns([1, 1])
    selected_path = []
    for ite, stage in enumerate(choices_list):
        with cols_list[ite%2]:
            # use a stable key per stage to avoid conflicts
            key = f"multiverse_choice_{stage['label']}"
            choice = st.selectbox(f"{stage['label']}", stage["options"], key=key)
            selected_path.append(choice)

    # --- Build tree and compute which edges/nodes to highlight ---
    labels, positions, edges, highlight_edges, highlight_nodes = build_tree_and_trace_path(selected_path)

    # --- Prepare node and edge traces for Plotly ---
    x_vals = [pos[0] for pos in positions]
    y_vals = [pos[1] for pos in positions]

    node_colors = []
    for idx in range(len(labels)):
        if idx in highlight_nodes:
            node_colors.append("rgba(34,139,34,0.95)")  # green for selected path nodes
        elif idx == 0:
            node_colors.append("rgba(30,144,255,0.9)")  # start node distinct
        else:
            node_colors.append("rgba(135,206,250,0.6)")  # default skyblue

    node_trace = go.Scatter(
        x=x_vals, y=y_vals,
        mode='markers',
        text=labels,
        marker=dict(size=18, color=node_colors, line=dict(width=1, color='black')),
        hoverinfo="text"
    )

    # One scatter trace per edge so each can carry its own color/width.
    edge_traces = []
    for src, dst in edges:
        if (src, dst) in highlight_edges:
            color = "rgba(0,128,0,0.9)"  # bright green
            width = 4
        else:
            color = "rgba(120,120,120,0.4)"
            width = 1.5
        edge_traces.append(go.Scatter(
            x=[positions[src][0], positions[dst][0]],
            y=[positions[src][1], positions[dst][1]],
            mode='lines',
            line=dict(width=width, color=color),
            hoverinfo='none'
        ))

    # --- Add stage labels at the top of each layer ---
    stage_label_traces = []
    for stage_idx, stage in enumerate(choices_list, start=1):
        # find all nodes belonging to this stage (x == stage_idx)
        stage_nodes = [i for i, (x, y) in enumerate(positions) if x == float(stage_idx)]
        if not stage_nodes:
            continue
        # max y among these nodes
        max_y = max(positions[i][1] for i in stage_nodes)
        x = float(stage_idx)
        y = max_y + 20000  # offset above top node; scale matches spread_factor=10000
        stage_label_traces.append(go.Scatter(
            x=[x], y=[y],
            text=[stage["label"]],
            mode="text",
            textfont=dict(size=16, color="white"),
            hoverinfo="none",
            showlegend=False
        ))

    # --- Render figure ---
    fig = go.Figure(data=edge_traces + stage_label_traces + [node_trace])
    fig.update_layout(
        showlegend=False,
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        paper_bgcolor='black',
        plot_bgcolor='black',
        font=dict(color='white'),
        margin=dict(l=10, r=10, t=10, b=10),
        hovermode="closest"
    )

    st.plotly_chart(fig, use_container_width=True)

    # ------------------------------------------------------------------
    # Model training for the selected pipeline
    # ------------------------------------------------------------------

    def split_and_scale(features, label, test_split=0.2, preprocess_scale=None):
        """Split into train/test and optionally scale.

        The scaler is fit on the training split only, so no test-set
        statistics leak into preprocessing.
        """
        X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_split, random_state=0)

        if preprocess_scale is not None:
            if preprocess_scale=="MinMax Scaler":
                scaler = MinMaxScaler()
            elif preprocess_scale=="Standard Scaler":
                scaler = StandardScaler()
            elif preprocess_scale=="Robust Scaler":
                scaler = RobustScaler()
            else:
                # An unknown value previously crashed with NameError on the
                # unbound `scaler`; fail loudly and clearly instead.
                raise ValueError(f"Unknown scaler choice: {preprocess_scale!r}")
            scaler.fit(X_train)
            X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

        return X_train, X_test, y_train, y_test

    def get_loan_dataset(test_split=0.2, preprocess_scale=None):
        """Load the loan-approval CSV and return scaled train/test arrays.

        NOTE: the CSV's column names carry a leading space (e.g.
        " loan_status"); they must be kept verbatim.
        """
        data = pd.read_csv('loan_approval_dataset.csv')

        features = data.drop(columns=["loan_id", " loan_status"])
        features = pd.get_dummies(features, columns=[" education", " self_employed"], drop_first=True).values

        le = LabelEncoder()
        label = le.fit_transform(data[" loan_status"])

        features, label = np.array(features), np.array(label)

        return split_and_scale(features, label, test_split, preprocess_scale)

    def model_train_and_pred(scaler, feature_sel, arch, seed):
        """Train one pipeline variant and return its test-set predictions."""
        X_train, X_test, y_train, y_test = get_loan_dataset(preprocess_scale=scaler)

        if feature_sel=="Select K Best (k=5)":
            selector = SelectKBest(score_func=f_classif, k=5)
            X_train = selector.fit_transform(X_train, y_train)
            X_test = selector.transform(X_test)
        elif feature_sel=="PCA (n=5)":
            # BUG FIX: the UI label promises 5 components but the code used
            # n_components=2. NOTE(review): if 'all_predictions.npy' was
            # generated with the old value, regenerate it (see the commented
            # generator below) so the stored multiverse stays consistent.
            pca = PCA(n_components=5)
            X_train = pca.fit_transform(X_train, y_train)
            X_test = pca.transform(X_test)

        modelclass_dict = {'Neural Network (Small)': MLPClassifier([10], random_state=seed, max_iter=500),
                           'Logistic Regression': SGDClassifier(random_state=seed, max_iter=500),
                           'Decision Tree': DecisionTreeClassifier(random_state=seed)}
        model = modelclass_dict[arch]
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        return y_pred

    # --- Offline generator for 'all_predictions.npy' (kept for reference) ---
    # all_preds = []
    # for scaler in choices_list[0]["options"]:
    #     for feature_sel in choices_list[1]["options"]:
    #         for arch in choices_list[2]["options"]:
    #             for seed in choices_list[3]["options"]:
    #                 seed = int(seed)
    #                 y_pred_local = model_train_and_pred(scaler, feature_sel, arch, seed)
    #                 all_preds.append(y_pred_local)
    #             st.markdown(scaler + feature_sel + arch)

    # all_preds_numpy = np.array(all_preds)
    # from io import BytesIO
    
    # # Create a BytesIO object
    # buffer = BytesIO()
    # np.save(buffer, all_preds_numpy)
    # buffer.seek(0)  # move to start
    
    # st.download_button(
    #     label="Download predictions",
    #     data=buffer,
    #     file_name="all_predictions.npy",
    #     mime="application/octet-stream"
    # )

    ### Main Code Starts Here
    scaler, feature_sel, arch, seed = selected_path[0], selected_path[1], selected_path[2], int(selected_path[3])
    y_pred = model_train_and_pred(scaler, feature_sel, arch, seed)
    # Precomputed predictions of every pipeline combination; per the
    # generator above, rows are pipelines and columns are test samples.
    all_preds_numpy = np.load("all_predictions.npy")

    # Fraction of all precomputed pipelines that accept (predict 1) each applicant.
    prop_ones = np.mean(all_preds_numpy == 1, axis=0)

    # Applicants you reject but the majority of pipelines accept.
    condition_rej = (y_pred == 0) & (prop_ones > 0.5)
    uniq_count_rej = np.sum(condition_rej)

    # Applicants you accept but the majority of pipelines reject.
    condition_acc = (y_pred == 1) & (prop_ones < 0.5)
    uniq_count_acc = np.sum(condition_acc)

    add_red_text(f"""
        <b>Based on your choices:</b><br>
        Number of loans accepted by the majority, but rejected by you: {uniq_count_rej}<br>
        Number of loans rejected by the majority, but accepted by you: {uniq_count_acc}<br><br>
        <b>Reasons you might want to conform:</b><br>
        To take lower risks and to avoid facing a justification crisis, i.e., 
        not able to explain why you rejected an applicant who would have been accepted by most other models.<br><br>
        <b>Reasons you might want to be unique:</b><br>
        To avoid competing for the same loan applicants with others.<br> 
        To give a chance to unique applicants and deal with the concerns of homogenization.<br><br>
    """)