File size: 12,143 Bytes
ce9a7cc
67606c8
 
f0db6f0
67606c8
754f471
 
 
 
 
ea5b2dd
754f471
 
 
ea5b2dd
cf1521b
ea5b2dd
 
754f471
147708f
930fa5e
 
ea5b2dd
 
 
 
 
 
 
147708f
930fa5e
 
ea5b2dd
930fa5e
147708f
930fa5e
 
147708f
 
 
0be3098
ce9a7cc
 
 
5be171f
 
 
 
 
 
 
 
 
 
 
 
ce9a7cc
0be3098
1bb56b0
ce9a7cc
2a198b1
ce9a7cc
1bb56b0
ce9a7cc
 
1bb56b0
ce9a7cc
147708f
930fa5e
ce9a7cc
 
5be171f
0be3098
4f4f1a5
9826556
1bb56b0
9826556
4f4f1a5
 
9826556
 
1bb56b0
ce9a7cc
9826556
4f4f1a5
9826556
 
1bb56b0
 
 
 
 
ce9a7cc
1bb56b0
ce9a7cc
 
 
2a198b1
ce9a7cc
2a198b1
ce9a7cc
 
 
 
 
 
 
 
 
 
 
 
 
2a198b1
 
1bb56b0
ce9a7cc
5be171f
1bb56b0
0adaec1
9d8a2eb
 
 
f0db6f0
5d12644
cd4d1bc
70e9e71
147708f
 
 
ce9a7cc
0adaec1
67606c8
0adaec1
 
 
 
 
 
67606c8
ce9a7cc
 
1bb56b0
ce9a7cc
1bb56b0
 
ce9a7cc
 
 
 
 
 
 
 
 
1bb56b0
 
 
4f4f1a5
1bb56b0
4f4f1a5
ce9a7cc
 
1bb56b0
 
 
 
ce9a7cc
 
 
 
 
 
1bb56b0
 
 
 
 
 
 
 
d9ec769
 
 
95e63f4
 
d9ec769
 
95e63f4
d9ec769
 
cd6e32f
d9ec769
 
 
 
 
 
 
 
 
95e63f4
ce9a7cc
d9ec769
 
1bb56b0
 
 
 
f80acf5
 
 
 
dd9ea9c
ce9a7cc
 
1bb56b0
67606c8
 
754f471
 
 
 
 
 
ea5b2dd
cf1521b
754f471
ea5b2dd
 
 
 
 
 
 
754f471
 
 
cf1521b
754f471
ea5b2dd
cf1521b
 
 
 
754f471
cf1521b
 
754f471
cf1521b
754f471
cf1521b
754f471
4c9cd31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754f471
4c9cd31
 
f0db6f0
58c7241
 
 
 
 
 
 
 
 
 
 
 
fb21fe1
58c7241
 
 
 
fb21fe1
58c7241
 
 
 
 
 
 
 
 
35bdbcb
58c7241
35bdbcb
2a64223
dd765d7
 
 
 
 
 
 
4c9cd31
f0db6f0
ff83251
5d12644
 
cd4d1bc
5d12644
 
cd4d1bc
5d12644
 
f0db6f0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
# pages/multiverse.py
import streamlit as st
import plotly.graph_objects as go
from utils import add_navigation, add_instruction_text, add_red_text

import random
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

# The four decision stages of the modeling "multiverse".  Each dict is one
# stage with a display label and its mutually exclusive options; the list
# order defines both the left-to-right layer order of the tree drawn by
# build_tree_and_trace_path() and the element order of selected_path.
choices_list = [
    {"label": "Data Scaling", "options": [
        "MinMax Scaler",
        "Standard Scaler",
        "Robust Scaler"
    ]},
    {"label": "Feature Selection", "options": [
        "Select K Best (k=5)",
        "PCA (n=5)",
        "All Features"
    ]},
    {"label": "Model Architecture", "options": [
        "Logistic Regression",
        "Decision Tree",
        "Neural Network (Small)"
    ]},
    # Seeds are kept as strings because they come straight from a selectbox;
    # render() converts the chosen seed to int before training.
    {"label": "Random Seed", "options": [
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"
    ]}
]

def build_tree_and_trace_path(selected_path, spread_factor=10000, choices=None):
    """
    Build tree nodes and edges for the multiverse of choices, then trace
    ``selected_path`` (one choice per stage) by walking children of the
    current node to find the matching label at each stage.

    Parameters
    ----------
    selected_path : list of str
        Option labels to highlight, one per stage, in stage order.
    spread_factor : float (default=10000)
        Controls vertical spread. Larger values => more separation early,
        less separation as depth increases.
    choices : list of dict, optional
        Stages to build the tree from; each dict needs a "label" and an
        "options" list. Defaults to the module-level ``choices_list``.

    Returns
    -------
    node_labels : list of str
        Label per node; node 0 is "Start".
    node_positions : list of (float, float)
        (x, y) per node; x equals the 1-based stage index (0.0 for Start).
    edges : list of (int, int)
        (parent_index, child_index) pairs in creation order.
    highlight_edges : set of (int, int)
        Edges along the selected path.
    highlight_nodes : set of int
        Nodes along the selected path (always contains node 0).
    """
    stages = choices_list if choices is None else choices

    node_labels = ["Start"]
    node_positions = [(0.0, 0.0)]
    edges = []

    prev_nodes = [0]  # nodes at previous stage (just the start node)
    y_spacing_base = 1.0

    # Build nodes and edges stage by stage (left to right).
    for stage_idx, stage in enumerate(stages, start=1):
        options = stage["options"]
        next_nodes = []

        # scaling: huge spread at stage 1, tapering off deeper
        scale = spread_factor ** (1.0 / stage_idx**(0.2))

        for parent_order, parent_idx in enumerate(prev_nodes):
            _, parent_y = node_positions[parent_idx]

            # each parent gets its own block of vertical space, centered
            # around the middle of the previous layer
            parent_block_size = len(options) * y_spacing_base * scale
            base_y = parent_y + (parent_order - (len(prev_nodes) - 1) / 2.0) * parent_block_size

            for opt_idx, opt in enumerate(options):
                child_x = float(stage_idx)
                offset = (opt_idx - (len(options) - 1) / 2.0) * (y_spacing_base * scale)
                child_y = base_y + offset

                node_index = len(node_labels)
                node_labels.append(opt)
                node_positions.append((child_x, child_y))
                edges.append((parent_idx, node_index))
                next_nodes.append(node_index)

        prev_nodes = next_nodes

    # Trace the single chosen path by walking children; stop early if a
    # chosen label is not found among the current node's children.
    highlight_edges = set()
    highlight_nodes = {0}
    current_node = 0

    for chosen_label in selected_path:
        found_child = next(
            (dst for (src, dst) in edges
             if src == current_node and node_labels[dst] == chosen_label),
            None,
        )
        if found_child is None:
            break
        highlight_edges.add((current_node, found_child))
        highlight_nodes.add(found_child)
        current_node = found_child

    return node_labels, node_positions, edges, highlight_edges, highlight_nodes



def render():
    """Render the multiverse page.

    Draws a Plotly decision tree of the pipeline choices, trains the model
    corresponding to the user's selections, and compares its test-set
    predictions against precomputed predictions of all other pipelines.

    Side effects: Streamlit widgets; reads 'loan_approval_dataset.csv' and
    'all_predictions.npy' from the working directory; trains a scikit-learn
    model on every rerun.
    """
    add_navigation("txt_multiverse", "txt_conclusion")

    add_instruction_text(
        """
        Visually explore the multiverse of AI models to judge loan applications.<br> 
        We are using a publicly available loan approval dataset.<br>
        Make a choice, and scroll down to see the properties of the trained model.<br>
        Not sure what choice to make? Just pick something and see what happens.
        """
    )

    # --- User picks one choice per stage via dropdowns (two columns) ---
    cols_list = st.columns([1, 1])
    selected_path = []
    for ite, stage in enumerate(choices_list):
        with cols_list[ite%2]:
            # use a stable key per stage to avoid conflicts
            key = f"multiverse_choice_{stage['label']}"
            choice = st.selectbox(f"{stage['label']}", stage["options"], key=key)
            selected_path.append(choice)

    # --- Build tree and compute which edges/nodes to highlight ---
    labels, positions, edges, highlight_edges, highlight_nodes = build_tree_and_trace_path(selected_path)

    # --- Prepare node and edge traces for Plotly ---
    x_vals = [pos[0] for pos in positions]
    y_vals = [pos[1] for pos in positions]

    node_colors = []
    for idx in range(len(labels)):
        if idx in highlight_nodes:
            node_colors.append("rgba(34,139,34,0.95)")  # green for selected path nodes
        elif idx == 0:
            node_colors.append("rgba(30,144,255,0.9)")  # start node distinct
        else:
            node_colors.append("rgba(135,206,250,0.6)")  # default skyblue

    node_trace = go.Scatter(
        x=x_vals, y=y_vals,
        mode='markers',
        text=labels,
        marker=dict(size=18, color=node_colors, line=dict(width=1, color='black')),
        hoverinfo="text"
    )

    # One scatter trace per edge so each can carry its own color/width.
    edge_traces = []
    for src, dst in edges:
        if (src, dst) in highlight_edges:
            color = "rgba(0,128,0,0.9)"  # bright green
            width = 4
        else:
            color = "rgba(120,120,120,0.4)"
            width = 1.5
        edge_traces.append(go.Scatter(
            x=[positions[src][0], positions[dst][0]],
            y=[positions[src][1], positions[dst][1]],
            mode='lines',
            line=dict(width=width, color=color),
            hoverinfo='none'
        ))

    # --- Add stage labels at the top of each layer ---
    stage_label_traces = []
    for stage_idx, stage in enumerate(choices_list, start=1):
        # find all nodes belonging to this stage (x == stage_idx)
        stage_nodes = [i for i, (x, y) in enumerate(positions) if x == float(stage_idx)]
        if not stage_nodes:
            continue
        # max y among these nodes
        max_y = max(positions[i][1] for i in stage_nodes)
        x = float(stage_idx)
        y = max_y + 20000  # offset above top node; scale matches spread_factor=10000
        stage_label_traces.append(go.Scatter(
            x=[x], y=[y],
            text=[stage["label"]],
            mode="text",
            textfont=dict(size=16, color="white"),
            hoverinfo="none",
            showlegend=False
        ))

    # --- Render figure ---
    fig = go.Figure(data=edge_traces + stage_label_traces + [node_trace])
    fig.update_layout(
        showlegend=False,
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        paper_bgcolor='black',
        plot_bgcolor='black',
        font=dict(color='white'),
        margin=dict(l=10, r=10, t=10, b=10),
        hovermode="closest"
    )

    st.plotly_chart(fig, use_container_width=True)

    # ------------------------------------------------------------------
    # Model training for the selected pipeline
    # ------------------------------------------------------------------

    def split_and_scale(features, label, test_split=0.2, preprocess_scale=None):
        """Split into train/test and optionally scale.

        The scaler is fit on the training split only, so no test-set
        statistics leak into preprocessing.
        """
        X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_split, random_state=0)

        if preprocess_scale is not None:
            if preprocess_scale=="MinMax Scaler":
                scaler = MinMaxScaler()
            elif preprocess_scale=="Standard Scaler":
                scaler = StandardScaler()
            elif preprocess_scale=="Robust Scaler":
                scaler = RobustScaler()
            else:
                # An unknown value previously crashed with NameError on the
                # unbound `scaler`; fail loudly and clearly instead.
                raise ValueError(f"Unknown scaler choice: {preprocess_scale!r}")
            scaler.fit(X_train)
            X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

        return X_train, X_test, y_train, y_test

    def get_loan_dataset(test_split=0.2, preprocess_scale=None):
        """Load the loan-approval CSV and return scaled train/test arrays.

        NOTE: the CSV's column names carry a leading space (e.g.
        " loan_status"); they must be kept verbatim.
        """
        data = pd.read_csv('loan_approval_dataset.csv')

        features = data.drop(columns=["loan_id", " loan_status"])
        features = pd.get_dummies(features, columns=[" education", " self_employed"], drop_first=True).values

        le = LabelEncoder()
        label = le.fit_transform(data[" loan_status"])

        features, label = np.array(features), np.array(label)

        return split_and_scale(features, label, test_split, preprocess_scale)

    def model_train_and_pred(scaler, feature_sel, arch, seed):
        """Train one pipeline variant and return its test-set predictions."""
        X_train, X_test, y_train, y_test = get_loan_dataset(preprocess_scale=scaler)

        if feature_sel=="Select K Best (k=5)":
            selector = SelectKBest(score_func=f_classif, k=5)
            X_train = selector.fit_transform(X_train, y_train)
            X_test = selector.transform(X_test)
        elif feature_sel=="PCA (n=5)":
            # BUG FIX: the UI label promises 5 components but the code used
            # n_components=2. NOTE(review): if 'all_predictions.npy' was
            # generated with the old value, regenerate it (see the commented
            # generator below) so the stored multiverse stays consistent.
            pca = PCA(n_components=5)
            X_train = pca.fit_transform(X_train, y_train)
            X_test = pca.transform(X_test)

        modelclass_dict = {'Neural Network (Small)': MLPClassifier([10], random_state=seed, max_iter=500),
                           'Logistic Regression': SGDClassifier(random_state=seed, max_iter=500),
                           'Decision Tree': DecisionTreeClassifier(random_state=seed)}
        model = modelclass_dict[arch]
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        return y_pred

    # --- Offline generator for 'all_predictions.npy' (kept for reference) ---
    # all_preds = []
    # for scaler in choices_list[0]["options"]:
    #     for feature_sel in choices_list[1]["options"]:
    #         for arch in choices_list[2]["options"]:
    #             for seed in choices_list[3]["options"]:
    #                 seed = int(seed)
    #                 y_pred_local = model_train_and_pred(scaler, feature_sel, arch, seed)
    #                 all_preds.append(y_pred_local)
    #             st.markdown(scaler + feature_sel + arch)

    # all_preds_numpy = np.array(all_preds)
    # from io import BytesIO
    
    # # Create a BytesIO object
    # buffer = BytesIO()
    # np.save(buffer, all_preds_numpy)
    # buffer.seek(0)  # move to start
    
    # st.download_button(
    #     label="Download predictions",
    #     data=buffer,
    #     file_name="all_predictions.npy",
    #     mime="application/octet-stream"
    # )

    ### Main Code Starts Here
    scaler, feature_sel, arch, seed = selected_path[0], selected_path[1], selected_path[2], int(selected_path[3])
    y_pred = model_train_and_pred(scaler, feature_sel, arch, seed)
    # Precomputed predictions of every pipeline combination; per the
    # generator above, rows are pipelines and columns are test samples.
    all_preds_numpy = np.load("all_predictions.npy")

    # Fraction of all precomputed pipelines that accept (predict 1) each applicant.
    prop_ones = np.mean(all_preds_numpy == 1, axis=0)

    # Applicants you reject but the majority of pipelines accept.
    condition_rej = (y_pred == 0) & (prop_ones > 0.5)
    uniq_count_rej = np.sum(condition_rej)

    # Applicants you accept but the majority of pipelines reject.
    condition_acc = (y_pred == 1) & (prop_ones < 0.5)
    uniq_count_acc = np.sum(condition_acc)

    add_red_text(f"""
        <b>Based on your choices:</b><br>
        Number of loans accepted by the majority, but rejected by you: {uniq_count_rej}<br>
        Number of loans rejected by the majority, but accepted by you: {uniq_count_acc}<br><br>
        <b>Reasons you might want to conform:</b><br>
        To take lower risks and to avoid facing a justification crisis, i.e., 
        not able to explain why you rejected an applicant who would have been accepted by most other models.<br><br>
        <b>Reasons you might want to be unique:</b><br>
        To avoid competing for the same loan applicants with others.<br> 
        To give a chance to unique applicants and deal with the concerns of homogenization.<br><br>
    """)