File size: 15,816 Bytes
9936999
1e8143b
9936999
 
 
1e8143b
9936999
 
1e8143b
9936999
 
 
 
 
 
 
1e8143b
9936999
 
1e8143b
 
9936999
 
 
 
 
1e8143b
9936999
 
1e8143b
9936999
1e8143b
9936999
1e8143b
9936999
1e8143b
9936999
 
1e8143b
9936999
 
 
 
 
 
 
 
 
 
1e8143b
 
9936999
1e8143b
9936999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e8143b
9936999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e8143b
9936999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e8143b
9936999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e8143b
 
 
 
9936999
 
 
 
 
 
 
 
 
 
 
 
1e8143b
9936999
 
 
 
 
 
 
 
 
 
 
1e8143b
9936999
 
 
 
 
 
 
 
1e8143b
 
9936999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
import numpy as np
import io
import sys
import os
import pandas as pd

import gc #garabage collector
from io import BytesIO
import panel as pn
import holoviews as hv
import hvplot.pandas
from warnings import filterwarnings
'''
development env: panel serve script.py --autoreload
prod prep: panel convert script.py --to pyodide-worker --out pyodide
'''

filterwarnings("ignore")
hv.extension('bokeh')


text = """
#  Feature Distribution and Stats
## AUTHOR: [`FIRAS ALI OBEID`](https://www.linkedin.com/in/feras-obeid/) 
###  GNU General Public License v3.0 (GPL-3.0)
#### Developed while working at [OppFi Inc.](https://www.oppfi.com/)

This tool performs feature binning by equal intervals and by equal pouplations in each interval vs bad rate/target binary variable
To get the feature deep dive feature distribution:

1. Upload a CSV (only numerical data)

2. Choose & press on the binary (0 / 1) target column in the `Select Target Variable` section below

3. Press Run Analysis

4. Wait few seconds and analyze the updated charts
"""

file_input = pn.widgets.FileInput(align='center')
selector = pn.widgets.MultiSelect(name='Select Target Variable')
button = pn.widgets.Button(name='Run Analysis')
widgets = pn.WidgetBox(
    pn.panel(text, margin=(0, 10)),
    pn.panel('Upload a CSV containing  (X) features and  (y) binary variable:', margin=(0, 10)),
    file_input,
    selector,
    button
)


def closest(lst, K):
    try:
        return lst[min(range(len(lst)), key = lambda i: abs(lst[i]-K))]
    except:
        return K
control_max = lambda x: x.max() * 1.01 if x.max() > 0 else (x.max() * 0.99 if x.max() < 0 else x.max() + 0.01)
control_min = lambda x: x.min() * 0.99 if x.min() > 0 else (x.min() * 1.01 if x.min() < 0 else x.min() - 0.01)

def get_data():
    global target, New_Refit_routing
    if file_input.value is None:
        New_Refit_routing = pd.DataFrame({"Open_accounts": np.random.randint(1,50,100000),
                                          "Income": np.random.randint(1000,20000,100000),
                                          "Years_of_experience": np.random.randint(0,20,100000),
                                          "default": np.random.random_integers(0,1,100000)})
        target = "default"
    else:
        New_Refit_routing = BytesIO()
        New_Refit_routing.write(file_input.value)
        New_Refit_routing.seek(0)
        try:
            New_Refit_routing = pd.read_csv(New_Refit_routing, error_bad_lines=False).apply(pd.to_numeric, errors='ignore')#.set_index("id")
        except:
            New_Refit_routing = pd.read_csv(New_Refit_routing, error_bad_lines=False)
        target = None
        New_Refit_routing = New_Refit_routing.select_dtypes(exclude=['datetime', "category","object"])
        New_Refit_routing = New_Refit_routing.replace([np.inf, -np.inf], np.nan)
        # New_Refit_routing = New_Refit_routing[[cols for cols in New_Refit_routing.columns if New_Refit_routing[cols].nunique() >= 2]] #remove columns with less then 2 unique values
    return target, New_Refit_routing


def update_target(event):
    _ , New_Refit_routing = get_data()
    target = list(New_Refit_routing.columns)
    selector.set_param(options=target, value=target)

file_input.param.watch(update_target, 'value')
update_target(None)



def stats_():
    global stats
    stats = New_Refit_routing.describe().T
    stats["Missing_Values(%)"] = (New_Refit_routing.isna().sum() / len(New_Refit_routing)) * 100
    stats = pd.concat([stats, New_Refit_routing.quantile(q = [.01, .05, .95, .99]).T.rename(columns = {0.01: '1%', 0.05: '5%', 0.95: '95%', 0.99:'99%'})], axis = 1)
    stats = stats[['count', 'mean', 'std', 'min', '1%', '5%' ,'25%', '50%', '75%', '95%', '99%', 'max','Missing_Values(%)']]
    stats = stats.round(4).astype(str)

def cuts_(target):
    global test, test2, final_df , outlier_removed_stats
    df = New_Refit_routing.copy() 
    neglect = [target] + [cols for cols in df.columns if df[cols].nunique() <= 2] #remove binary and target variable
    cols = df.columns.difference(neglect)  # Getting all columns except the ones in []

    #REMOVE OUTIERS#
    df[cols] = df[cols].apply(lambda col: col.clip(lower = col.quantile(.01), 
                                        upper = closest(col[col < col.quantile(.99)].dropna().values, 
                                        col.quantile(.99))),axis = 0)

    outlier_removed_stats = df.describe().T
    remove_feature = list(outlier_removed_stats[(outlier_removed_stats["mean"]==outlier_removed_stats["max"]) & 
                        (outlier_removed_stats["mean"]==outlier_removed_stats["min"])].index)
    outlier_removed_stats = pd.concat([outlier_removed_stats, df.quantile(q = [.01, .05, .95, .99]).T.rename(columns = {0.01: '1%', 0.05: '5%', 0.95: '95%', 0.99:'99%'})], axis = 1)
    outlier_removed_stats = outlier_removed_stats[['count', 'mean', 'std', 'min', '1%', '5%' ,'25%', '50%', '75%', '95%', '99%', 'max']]                    
    outlier_removed_stats = outlier_removed_stats.round(4).astype(str)

    neglect += remove_feature
    cols = df.columns.difference(neglect)  # Getting all columns except the ones in []

    
    df[cols] = df[cols].apply(lambda col: pd.cut(col.fillna(np.nan),
                                                bins = pd.interval_range(start=float(np.apply_along_axis(control_min , 0,col.dropna())), end = float(np.apply_along_axis(control_max , 0,col.dropna())), 
                                                periods = 10), include_lowest=True).cat.add_categories(pd.Categorical(f"Missing_{col.name}")).fillna(f"Missing_{col.name}"), axis=0)


    test = pd.concat([df[cols].value_counts(normalize = True) for cols in df[cols]], axis = 1)
    cols = test.columns
    test = test.reset_index().melt(id_vars="index", 
                                var_name='column', 
                                value_name='value').dropna().reset_index(drop = True)


    test = test.rename(columns={"index":"IntervalCuts", "column":"feature", "value":"Count_Pct"})
    test.Count_Pct = test.Count_Pct.round(4)
    test.IntervalCuts = test.IntervalCuts.astype(str)
    test.IntervalCuts = test.IntervalCuts.apply(lambda x: "("+str(round(float(x.split(",")[0].strip("(")),4)) +', ' + str(round(float(x.split(",")[-1].strip("]")),4)) +"]" if (x.split(",")[0].strip("(").strip("-")[0]).isdigit() else x)

    test2 = pd.concat([df.groupby(col)[target].mean().fillna(0) for col in df[cols]], axis = 1)
    test2.columns = cols
    test2 = test2.reset_index().melt(id_vars="index", var_name='column', value_name='value').dropna().reset_index(drop = True)
    test2 = test2.rename(columns={"index":"IntervalCuts", "column":"feature", "value":"Bad_Rate_Pct"})
    test2.Bad_Rate_Pct = test2.Bad_Rate_Pct.round(4)
    test2.IntervalCuts = test2.IntervalCuts.astype(str)
    test2.IntervalCuts = test2.IntervalCuts.apply(lambda x: "("+str(round(float(x.split(",")[0].strip("(")),4)) +', ' + str(round(float(x.split(",")[-1].strip("]")),4)) +"]" if (x.split(",")[0].strip("(").strip("-")[0]).isdigit() else x)


    test["index"] = test["feature"] + "_" + test["IntervalCuts"]
    test = test.set_index("index").sort_index()
    test2["index"] = test2["feature"] + "_" + test2["IntervalCuts"]
    test2 = test2.set_index("index").sort_index()
    final_df = pd.merge(test2, test[test.columns.difference(test2.columns)], on = "index")
   

## QCUT ##
def qcuts_(target):
    global test_q, test2_q, final_df_q
    df2 = New_Refit_routing.copy()
    neglect = [target] + [cols for cols in df2.columns if df2[cols].nunique() <= 2] #remove binary and target variable
    cols = df2.columns.difference(neglect)  # Getting all columns except the ones in []

    #DEBUGGING CODE#####################################################################################
    # for i in df2[cols].columns:
    #     print(i)
    #     print(df2[i][df2[i] < df2[i].quantile(.99)].dropna().values)
    #     print(df2[i].quantile(.99))
    #     print(closest(df2[i][df2[i] < df2[i].quantile(.99)].dropna().values, df2[i].quantile(.99)))
        # df2.apply(lambda col: col.clip(lower = col.quantile(.01), 
        #                                 upper = closest(col[col < col.quantile(.99)].dropna().values, 
        #                                 col.quantile(.99))),axis = 0)
    
    ####################################################################################################
    #REMOVE OUTIERS#

    df2[cols] = df2[cols].apply(lambda col: col.clip(lower = col.quantile(.01), 
                                        upper = closest(col[col < col.quantile(.99)].dropna().values, 
                                        col.quantile(.99))),axis = 0)

    temp = df2.describe().T
    remove_feature = list(temp[(temp["mean"]==temp["max"]) & 
                        (temp["mean"]==temp["min"])].index)

    neglect+= remove_feature
    cols = df2.columns.difference(neglect)  # Getting all columns except the ones in []
    # rank(method='first') is a must in qcut 
    # df2[cols] = df2[cols].apply(lambda col: pd.qcut(col.fillna(np.nan).rank(method='first'),
    #                                                 q = 10, duplicates = "drop").cat.add_categories(pd.Categorical(f"Qcut_Missing_{col.name}")).fillna(f"Qcut_Missing_{col.name}"), axis=0)
    df2[cols] = df2[cols].apply(lambda col: pd.qcut(col.fillna(np.nan).rank(method='first'),q = 10, labels=range(1,11)).cat.rename_categories({10:"Last"}).astype(str).replace(dict(dict(pd.concat([col,
           pd.qcut(col.fillna(np.nan).rank(method='first'),q = 10, labels=range(1,11)).cat.rename_categories({10:"Last"})
           .apply(str)], axis = 1, keys= ["feature", "qcuts"]).groupby("qcuts").agg([min, max]).reset_index().astype(str).set_index("qcuts",drop = False)
     .apply(lambda x :x[0]+"_"+"("+str(round(float(x[1]),2))+","+str(round(float(x[2]),2))+"]",axis = 1)),**{"nan":f"Qcut_Missing_{col.name}"})), axis=0)

    test_q = pd.concat([df2[cols].value_counts(normalize = True) for cols in df2[cols]], axis = 1)
    cols = test_q.columns
    test_q = test_q.reset_index().melt(id_vars="index", 
                                var_name='column', 
                                value_name='value').dropna().reset_index(drop = True)


    test_q = test_q.rename(columns={"index":"IntervalCuts", "column":"feature", "value":"Count_Pct"})
    test_q.Count_Pct = test_q.Count_Pct.round(4)
    test_q.IntervalCuts = test_q.IntervalCuts.astype(str)
    # test_q.IntervalCuts = test_q.IntervalCuts.apply(lambda x: "("+str(round(float(x.split(",")[0].strip("(")),4)) +', ' + str(round(float(x.split(",")[-1].strip("]")),4)) +"]" if (x.split(",")[0].strip("(")[0]).isdigit() else x)


    test2_q = pd.concat([df2.groupby(col)[target].mean().fillna(0) for col in df2[cols]], axis = 1)
    test2_q.columns = cols
    test2_q = test2_q.reset_index().melt(id_vars="index", var_name='column', value_name='value').dropna().reset_index(drop = True)
    test2_q = test2_q.rename(columns={"index":"IntervalCuts", "column":"feature", "value":"Bad_Rate_Pct"})
    test2_q.Bad_Rate_Pct = test2_q.Bad_Rate_Pct.round(4)
    test2_q.IntervalCuts = test2_q.IntervalCuts.astype(str)
    # test2_q.IntervalCuts = test2_q.IntervalCuts.apply(lambda x: "("+str(round(float(x.split(",")[0].strip("(")),4)) +', ' + str(round(float(x.split(",")[-1].strip("]")),4)) +"]" if (x.split(",")[0].strip("(")[0]).isdigit() else x)

    test_q["index"] = test_q["feature"] + "_" + test_q["IntervalCuts"]
    test_q = test_q.set_index("index").sort_index()
    test2_q["index"] = test2_q["feature"] + "_" + test2_q["IntervalCuts"]
    test2_q = test2_q.set_index("index").sort_index()
    final_df_q = pd.merge(test2_q, test_q[test_q.columns.difference(test2_q.columns)], on = "index")
    



@pn.depends(button.param.clicks)
def run(_):
    target, New_Refit_routing = get_data()
    if target == None:
        target = str(selector.value[0])
    else:
        target = "default"
    print(str(selector.value[0]))
    print(target)
    # print(type(file_input.value))
    # print(type(New_Refit_routing))
    print(New_Refit_routing.head())

    stats_()
    cuts_(target)
    qcuts_(target)
    test2_plot = test2.set_index("IntervalCuts").hvplot.scatter(yaxis = "left", y = "Bad_Rate_Pct",
            groupby = "feature", xlabel = "Intervals(Bins)", ylabel = "%Count vs %BadRate",height = 500,
            width = 1000, title = "Features Segments Cuts by Count", legend = True,label = "Bad Rate(%)").opts(xrotation=45, yformatter = "%.04f",show_grid=True, 
                                                                                        framewise=True, color = "red", legend_position='top_right')
    test_plot = test.set_index("IntervalCuts").hvplot.bar(y = "Count_Pct",
                groupby = "feature", xlabel = "Intervals(Bins)", ylabel = "%Count vs %BadRate",height = 500,
                width = 1000, title = "Features Segments Cuts by Count", legend=True, alpha=0.3, label ="Equal Intervals Data Points(%)").opts(xrotation=45, yformatter = "%.04f",show_grid=True, framewise=True, yaxis='left')
    final_table = final_df.hvplot.table(groupby = "feature", width=400)

    test2_plot_q = test2_q.set_index("IntervalCuts").hvplot.scatter(yaxis = "left", y = "Bad_Rate_Pct",
                groupby = "feature", xlabel = "Intervals(Bins)", ylabel = "%Count vs %BadRate",height = 500,
                width = 1000, title = "Features Segments Q_Cuts by Count", legend = True).opts(xrotation=45, yformatter = "%.04f",show_grid=True, 
                                                                                                framewise=True, color = "red")
    test_plot_q = test_q.set_index("IntervalCuts").hvplot.bar(y = "Count_Pct",
                groupby = "feature", xlabel = "Intervals(Bins)", ylabel = "%Count vs %BadRate",height = 500,
                width = 1000, title = "Features Segments Q_Cuts by Count", legend=True, alpha=0.3, label ="Equal Population Data Points(%)").opts(xrotation=45, yformatter = "%.04f",show_grid=True, framewise=True, yaxis='left')
    final_table_q = final_df_q.hvplot.table(groupby = "feature", width=400)


    stats_table = stats.reset_index().hvplot.table(width = 1000,title="Summary Statistics of the Data", hover = True, responsive=True, 
                    shared_axes= False, fit_columns = True,
                    padding=True, height=500, index_position = 0, fontscale = 1.5)
    stats_table_no_outliers = outlier_removed_stats.reset_index().hvplot.table(width = 1000,title="Summary Statistics of the Capped Outliers Data", hover = True, responsive=True, 
                    shared_axes= False, fit_columns = True,
                    padding=True, height=500, index_position = 0, fontscale = 1.5)
    #PANEL
    pn.extension( template="fast")
    pn.state.template.param.update(
        # site_url="",
        site="CreditRisk",
        title="Feature Distribution & Statistics",
        # favicon="https://raw.githubusercontent.com/opploans/DS_modelling_tools/main/docs/Resources/favicon.ico?token=GHSAT0AAAAAABYR5F6VDZ2PU33UY6NN7NQEY3C2ASA"
        # favicon="",
    )
    
    title = pn.pane.Markdown(
    """
    ### Feature Distribution (Bin Count & Bad Rate)
    """,
    width=800,
    )

    return pn.Column(
                title,
                (test2_plot * test_plot * test2_plot_q * test_plot_q + (final_table + final_table_q)).cols(3),
                (stats_table + stats_table_no_outliers).cols(2),
            )



profiles = '''
### Other Web Apps:

* [Twitter Sentiment Analysis Flask App](https://firobeid.pythonanywhere.com/)

* [Personal Lectures @ UCBerkley Using Panel App](https://firobeid.github.io/compose-plots/script.html)
'''
pn.Row(pn.Column(widgets, profiles), pn.layout.Spacer(width=20), run).servable(target='main')