firobeid committed on
Commit
e24a78d
·
verified ·
1 Parent(s): 231d381

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +761 -124
app.py CHANGED
@@ -1,147 +1,784 @@
 
 
 
1
  import io
2
- import random
3
- from typing import List, Tuple
4
-
5
- import aiohttp
 
 
6
  import panel as pn
7
- from PIL import Image
8
- from transformers import CLIPModel, CLIPProcessor
 
 
 
 
 
 
9
 
10
- pn.extension(design="bootstrap", sizing_mode="stretch_width")
 
 
11
 
12
- ICON_URLS = {
13
- "brand-github": "https://github.com/holoviz/panel",
14
- "brand-twitter": "https://twitter.com/Panel_Org",
15
- "brand-linkedin": "https://www.linkedin.com/company/panel-org",
16
- "message-circle": "https://discourse.holoviz.org/",
17
- "brand-discord": "https://discord.gg/AXRHnJU6sP",
18
- }
 
 
 
 
 
 
 
19
 
 
 
 
 
 
 
 
 
20
 
21
- async def random_url(_):
22
- pet = random.choice(["cat", "dog"])
23
- api_url = f"https://api.the{pet}api.com/v1/images/search"
24
- async with aiohttp.ClientSession() as session:
25
- async with session.get(api_url) as resp:
26
- return (await resp.json())[0]["url"]
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- @pn.cache
30
- def load_processor_model(
31
- processor_name: str, model_name: str
32
- ) -> Tuple[CLIPProcessor, CLIPModel]:
33
- processor = CLIPProcessor.from_pretrained(processor_name)
34
- model = CLIPModel.from_pretrained(model_name)
35
- return processor, model
 
 
 
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- async def open_image_url(image_url: str) -> Image:
39
- async with aiohttp.ClientSession() as session:
40
- async with session.get(image_url) as resp:
41
- return Image.open(io.BytesIO(await resp.read()))
42
 
 
 
 
 
 
 
 
 
 
43
 
44
- def get_similarity_scores(class_items: List[str], image: Image) -> List[float]:
45
- processor, model = load_processor_model(
46
- "openai/clip-vit-base-patch32", "openai/clip-vit-base-patch32"
47
- )
48
- inputs = processor(
49
- text=class_items,
50
- images=[image],
51
- return_tensors="pt", # pytorch tensors
52
- )
53
- outputs = model(**inputs)
54
- logits_per_image = outputs.logits_per_image
55
- class_likelihoods = logits_per_image.softmax(dim=1).detach().numpy()
56
- return class_likelihoods[0]
 
 
 
 
 
 
 
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- async def process_inputs(class_names: List[str], image_url: str):
60
- """
61
- High level function that takes in the user inputs and returns the
62
- classification results as panel objects.
63
- """
64
- try:
65
- main.disabled = True
66
- if not image_url:
67
- yield "##### ⚠️ Provide an image URL"
68
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- yield "##### ⚙ Fetching image and running model..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  try:
72
- pil_img = await open_image_url(image_url)
73
- img = pn.pane.Image(pil_img, height=400, align="center")
74
- except Exception as e:
75
- yield f"##### 😔 Something went wrong, please try a different URL!"
76
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- class_items = class_names.split(",")
79
- class_likelihoods = get_similarity_scores(class_items, pil_img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- # build the results column
82
- results = pn.Column("##### 🎉 Here are the results!", img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- for class_item, class_likelihood in zip(class_items, class_likelihoods):
85
- row_label = pn.widgets.StaticText(
86
- name=class_item.strip(), value=f"{class_likelihood:.2%}", align="center"
87
- )
88
- row_bar = pn.indicators.Progress(
89
- value=int(class_likelihood * 100),
90
- sizing_mode="stretch_width",
91
- bar_color="secondary",
92
- margin=(0, 10),
93
- design=pn.theme.Material,
94
- )
95
- results.append(pn.Column(row_label, row_bar))
96
- yield results
97
- finally:
98
- main.disabled = False
99
-
100
-
101
- # create widgets
102
- randomize_url = pn.widgets.Button(name="Randomize URL", align="end")
103
-
104
- image_url = pn.widgets.TextInput(
105
- name="Image URL to classify",
106
- value=pn.bind(random_url, randomize_url),
107
- )
108
- class_names = pn.widgets.TextInput(
109
- name="Comma separated class names",
110
- placeholder="Enter possible class names, e.g. cat, dog",
111
- value="cat, dog, parrot",
112
- )
113
 
114
- input_widgets = pn.Column(
115
- "##### 😊 Click randomize or paste a URL to start classifying!",
116
- pn.Row(image_url, randomize_url),
117
- class_names,
118
- )
119
 
120
- # add interactivity
121
- interactive_result = pn.panel(
122
- pn.bind(process_inputs, image_url=image_url, class_names=class_names),
123
- height=600,
124
- )
 
 
 
 
 
125
 
126
- # add footer
127
- footer_row = pn.Row(pn.Spacer(), align="center")
128
- for icon, url in ICON_URLS.items():
129
- href_button = pn.widgets.Button(icon=icon, width=35, height=35)
130
- href_button.js_on_click(code=f"window.open('{url}')")
131
- footer_row.append(href_button)
132
- footer_row.append(pn.Spacer())
133
-
134
- # create dashboard
135
- main = pn.WidgetBox(
136
- input_widgets,
137
- interactive_result,
138
- footer_row,
139
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- title = "Panel Demo - Image Classification"
142
- pn.template.BootstrapTemplate(
143
- title=title,
144
- main=main,
145
- main_max_width="min(50%, 698px)",
146
- header_background="#F08080",
147
- ).servable(title=title)
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ import numpy as np
4
  import io
5
+ import sys
6
+ import os
7
+ import pandas as pd
8
+ import datetime
9
+ import gc #garabage collector
10
+ from io import BytesIO
11
  import panel as pn
12
+ import holoviews as hv
13
+ import hvplot.pandas
14
+ import xlsxwriter
15
+ from warnings import filterwarnings
16
+ '''
17
+ development env: panel serve script.py --autoreload
18
+ prod prep: panel convert script.py --to pyodide-worker --out pyodide
19
+ '''
20
 
21
+ filterwarnings("ignore")
22
+ # hv.extension('bokeh')
23
+ pn.extension( "plotly", template="fast")
24
 
25
+ pn.state.template.param.update(
26
+ # site_url="",
27
+ site="ModelMonitor",
28
+ title="Classification Model Metrics",
29
+ # favicon="https://raw.githubusercontent.com/firobeid/firobeid.github.io/main/docs/compose-plots/Resources/favicon.ico",
30
+ )
31
+ #######################
32
+ ###UTILITY FUNCTIONS###
33
+ #######################
34
+ def percentage(df):
35
+ def segment(df):
36
+ return round(df["Count"]/df["Count"].sum(),4)
37
+ df["percent"] = segment(df)
38
+ return df
39
 
40
+ def AUC(group):
41
+ from sklearn.metrics import roc_auc_score
42
+ auc = roc_auc_score(group['TARGET'],group['SCORE'])
43
+ # N = sum(group["N"])
44
+ N = round(len(group.loc[group["TARGET"].notna()]),0)
45
+ cols = ["AUC","Count"]
46
+ # return trapezoidal_rule(FPR.to_numpy(),TPR.to_numpy())
47
+ return pd.Series([auc, N], index = cols)
48
 
49
+ def ROC(group):
50
+ from sklearn.metrics import roc_curve
51
+ FPR,TPR,T = roc_curve(group['TARGET'],group['SCORE'])
52
+ cols = ['TPR', 'FPR']
53
+ return pd.concat([pd.Series(TPR),pd.Series(FPR)], keys = cols, axis = 1)
 
54
 
55
+ def ks(group):
56
+ from scipy.stats import ks_2samp
57
+ y_real = group['TARGET']
58
+ y_proba = group['SCORE']
59
+
60
+ df = pd.DataFrame()
61
+ df['real'] = y_real
62
+ df['proba'] = y_proba
63
+
64
+ # Recover each class
65
+ class0 = df[df['real'] == 0]
66
+ class1 = df[df['real'] == 1]
67
+
68
+ ks_ = ks_2samp(class0['proba'], class1['proba'])
69
+
70
+ N = round(len(group.loc[group["TARGET"].notna()]),0)
71
+ cols = ["KS","Count"]
72
+
73
+ return pd.Series([ks_[0], N], index = cols)
74
 
75
+ def psi(df):
76
+ '''
77
+ https://mwburke.github.io/data%20science/2018/04/29/population-stability-index.html#:~:text=To%20calculate%20the%20PSI%20we,the%20percents%20in%20each%20bucket.
78
+ '''
79
+ df[df == 0] = 0.001
80
+ sub = df.copy()
81
+ sub = sub.iloc[:,:-1].sub(df.validation,axis = 0)
82
+ div = df.copy()
83
+ div= div.iloc[:,:-1].div(df.validation, axis=0)
84
+ div = np.log(div)
85
+ return (sub*div).sum(axis = 0)
86
 
87
+ def add_extremes_OOT(df, name:str, score:str):
88
+ '''
89
+ Mitigate bias in OOT/Serving/baseline set that might not have high confidence scores or low confidence scores
90
+ :param: name: str, name of the appid column
91
+ :param: score: str, name of the score column
92
+ '''
93
+ # df.loc[len(df.index)] = [np.nan, "Extreme_Case_Max", np.nan, np.nan, np.nan,994.0,0.0009,np.nan,np.nan,np.nan,np.nan]
94
+ # df.loc[len(df.index)] = [np.nan, "Extreme_Case_Min", np.nan, np.nan, np.nan,158.0,0.9999,np.nan,np.nan,np.nan,np.nan]
95
+ df.loc[len(df.index)] = [np.nan for i in range(0,df.shape[1])]
96
+ df.loc[(len(df.index)-1), [name, score]] = ["Extreme_Case_Max", 0.0009]
97
+ df.loc[len(df.index)] = [np.nan for i in range(0,df.shape[1])]
98
+ df.loc[(len(df.index)-1), [name, score]] = ["Extreme_Case_Min", 0.9999]
99
+ return df
100
 
101
+ # def last_3months(df):
102
+ # from datetime import datetime
103
+ # from dateutil.relativedelta import relativedelta
104
+ # from pandas.tseries.offsets import MonthEnd
105
 
106
+ # end_of_month = ((pd.Timestamp(datetime.now().strftime('%Y-%m-%d')) - pd.Timedelta(70, unit='D')) + relativedelta(months=-1)) + MonthEnd(0)
107
+ # start_of_month = end_of_month + MonthEnd(-3) + relativedelta(days=1)
108
+ # end_of_month = end_of_month +relativedelta(hours=23, minutes=59, seconds=59)
109
+ # print('Start Month %r --- End Month %r' % (start_of_month, end_of_month))
110
+ # try:
111
+ # date_column = list(filter(lambda x:x.endswith("DATE"),gains_df.columns))[0]
112
+ # except:
113
+ # date_column = 'CREATED_DATE'
114
+ # return df[df[date_column].between(start_of_month, end_of_month)]
115
 
116
+ def gains_table_proba(data=None,target=None, prob=None):
117
+ data = data.copy()
118
+ data['target0'] = 1 - data[target]
119
+ data['bucket'] = pd.qcut(data[prob], 10)
120
+ grouped = data.groupby('bucket', as_index = False)
121
+ kstable = pd.DataFrame()
122
+ kstable['min_prob'] = grouped.min()[prob]
123
+ kstable['max_prob'] = grouped.max()[prob]
124
+ kstable['count'] = grouped.count()['target0']
125
+ kstable['cum_total']=(kstable['count'] / kstable['count'].sum()).cumsum()
126
+ kstable['events'] = grouped.sum()[target]
127
+ kstable['nonevents'] = grouped.sum()['target0']
128
+ kstable['interval_rate'] = kstable['events'] / kstable['count']
129
+ kstable = kstable.sort_values(by="min_prob", ascending=0).reset_index(drop = True)
130
+ kstable['event_rate'] = (kstable.events / data[target].sum()).apply('{0:.2%}'.format)
131
+ kstable['nonevent_rate'] = (kstable.nonevents / data['target0'].sum()).apply('{0:.2%}'.format)
132
+ kstable['cum_eventrate']=(kstable.events / data[target].sum()).cumsum()
133
+ kstable['cum_noneventrate']=(kstable.nonevents / data['target0'].sum()).cumsum()
134
+ kstable['mid_point'] = np.nan
135
+ kstable['KS'] = np.round(kstable['cum_eventrate']-kstable['cum_noneventrate'], 4) * 100
136
 
137
+ #Formating
138
+ kstable["cum_total"] = kstable["cum_total"].sort_values().values
139
+ kstable = kstable.rename(columns={"min_prob":"low", "max_prob":"high"})
140
+ kstable['mid_point'] = round((kstable['high'] + kstable['low']) / 2, 4)
141
+ kstable['cum_eventrate']= kstable['cum_eventrate'].apply('{0:.2%}'.format)
142
+ kstable['cum_noneventrate']= kstable['cum_noneventrate'].apply('{0:.2%}'.format)
143
+ kstable.index = range(1,11)
144
+ kstable.index.rename('Decile', inplace=True)
145
+ pd.set_option('display.max_columns', 15)
146
+ # print(kstable)
147
+ #Display KS
148
+ from colorama import Fore
149
+ ks_3mnths = "KS is " + str(max(kstable['KS']))+"%"+ " at decile " + str((kstable.index[kstable['KS']==max(kstable['KS'])][0]))
150
+ print("KS is " + str(max(kstable['KS']))+"%"+ " at decile " + str((kstable.index[kstable['KS']==max(kstable['KS'])][0])))
151
+ kstable['cum_eventrate']= kstable['cum_eventrate'].str.replace("%","").astype(float)
152
+ kstable['cum_noneventrate']= kstable['cum_noneventrate'].str.replace("%","").astype(float)
153
+ kstable.index = list(range(10,0,-1))
154
+ kstable = kstable.iloc[::-1]
155
+ return(kstable, ks_3mnths)
156
 
157
+ def calculate_psi(expected, actual, buckettype='bins', buckets=10, axis=0):
158
+ # https://www.kaggle.com/code/podsyp/population-stability-index
159
+ '''Calculate the PSI (population stability index) across all variables
160
+ Args:
161
+ expected: numpy matrix of original values
162
+ actual: numpy matrix of new values, same size as expected
163
+ buckettype: type of strategy for creating buckets, bins splits into even splits, quantiles splits into quantile buckets
164
+ buckets: number of quantiles to use in bucketing variables
165
+ axis: axis by which variables are defined, 0 for vertical, 1 for horizontal
166
+ Returns:
167
+ psi_values: ndarray of psi values for each variable
168
+ Author:
169
+ Matthew Burke
170
+ github.com/mwburke
171
+ worksofchart.com
172
+ '''
173
+
174
+ def psi(expected_array, actual_array, buckets):
175
+ '''Calculate the PSI for a single variable
176
+ Args:
177
+ expected_array: numpy array of original values
178
+ actual_array: numpy array of new values, same size as expected
179
+ buckets: number of percentile ranges to bucket the values into
180
+ Returns:
181
+ psi_value: calculated PSI value
182
+ '''
183
+
184
+ def scale_range (input, min, max):
185
+ input += -(np.min(input))
186
+ input /= np.max(input) / (max - min)
187
+ input += min
188
+ return input
189
+
190
+
191
+ breakpoints = np.arange(0, buckets + 1) / (buckets) * 100
192
+
193
+ if buckettype == 'bins':
194
+ breakpoints = scale_range(breakpoints, np.min(expected_array), np.max(expected_array))
195
+ elif buckettype == 'quantiles':
196
+ breakpoints = np.stack([np.percentile(expected_array, b) for b in breakpoints])
197
+
198
+
199
+
200
+ expected_percents = np.histogram(expected_array, breakpoints)[0] / len(expected_array)
201
+ actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)
202
+
203
+ def sub_psi(e_perc, a_perc):
204
+ '''Calculate the actual PSI value from comparing the values.
205
+ Update the actual value to a very small number if equal to zero
206
+ '''
207
+ if a_perc == 0:
208
+ a_perc = 0.0001
209
+ if e_perc == 0:
210
+ e_perc = 0.0001
211
+
212
+ value = (e_perc - a_perc) * np.log(e_perc / a_perc)
213
+ return(value)
214
+
215
+ psi_value = np.sum(sub_psi(expected_percents[i], actual_percents[i]) for i in range(0, len(expected_percents)))
216
+
217
+ return(psi_value)
218
+
219
+ if len(expected.shape) == 1:
220
+ psi_values = np.empty(len(expected.shape))
221
+ else:
222
+ psi_values = np.empty(expected.shape[axis])
223
+
224
+ for i in range(0, len(psi_values)):
225
+ if len(psi_values) == 1:
226
+ psi_values = psi(expected, actual, buckets)
227
+ elif axis == 0:
228
+ psi_values[i] = psi(expected[:,i], actual[:,i], buckets)
229
+ elif axis == 1:
230
+ psi_values[i] = psi(expected[i,:], actual[i,:], buckets)
231
+
232
+ return(psi_values)
233
+
234
+ return round(10 **((158.313177 - UW5_Score) /274.360149), 18)
235
+
236
+ def lift_init(df:pd.DataFrame, baseline = None, is_baseline = True):
237
+ from tqdm import tqdm
238
+ # global standalone_scores_OOT
239
+ cols = ['SCORE']
240
 
241
+ lift_chart_data_OOT = pd.DataFrame()
242
+ for q in tqdm([10,20,50,100]):
243
+ # df_new["QUARTER"] = pd.PeriodIndex(df_new.CREATE_DATE, freq='Q')
244
+ # fd = baseline.dropna(subset = period_metrics.value)[cols].apply(lambda col: pd.qcut(col.rank(method='first'),q = q, ), axis = 0).copy()
245
+ # pd.cut(prod['SCORE'], bins = pd.qcut(baseline['SCORE'],10, retbins = True)[1])
246
+ if is_baseline == True:
247
+ # print(df)
248
+ # print(df.dropna(subset = ['MONTHLY']))
249
+ fd = df.dropna(subset = [period_metrics.value])[cols].apply(lambda col: pd.cut(col, bins = pd.qcut(col,q=q, retbins = True)[1]) , axis = 0).copy()
250
+ fd = pd.concat([df.dropna(subset = [period_metrics.value])[period_metrics.value], df.dropna(subset = [period_metrics.value])['TARGET'], fd], axis = 1)
251
+ fd = pd.concat([fd.groupby(x)['TARGET'].mean().fillna(0) for x in fd[cols]], axis = 1, keys = cols)
252
+ fd.index.name = 'SCORE_BAND'
253
+
254
+ else:
255
+ # print(baseline.dropna(subset = [period_metrics.value])[cols].values.ravel().shape)
256
+ # print(pd.qcut(baseline.dropna(subset = [period_metrics.value])[cols].values.ravel(),q=q, retbins = True))
257
+ bins_ = pd.qcut(baseline.dropna(subset = [period_metrics.value])[cols].values.ravel(),q=q, retbins = True)[1]
258
+ fd = df.groupby([period_metrics.value]).apply(lambda col: col[cols].apply(lambda col: pd.cut(col, bins = bins_), axis = 0)).copy()
259
+ # fd = df.groupby(period_metrics.value).apply(lambda col: col[cols].apply(lambda col: pd.cut(col, bins = pd.qcut(col,q=q, retbins = True)[1]), axis = 0)).copy()
260
+ fd = pd.concat([df[period_metrics.value], df['TARGET'], fd], axis = 1)
261
+ fd = fd.groupby(period_metrics.value).apply(lambda col: pd.concat([col.groupby(x)['TARGET'].mean().fillna(0) for x in col[cols]], axis = 1, keys = cols))
262
+ fd.index.names = [period_metrics.value, 'SCORE_BAND']
263
+ # fd['APPLICATION_MONTH'] = fd['APPLICATION_MONTH'].astype(str)
264
+ fd = fd.reset_index()
265
+ fd['BINS'] = q
266
+ lift_chart_data_OOT = lift_chart_data_OOT.append(fd)
267
+ if is_baseline == True:
268
+ lift_chart_data_OOT[period_metrics.value] = 'Baseline'
269
+
270
+ standalone_scores_OOT = lift_chart_data_OOT.melt(id_vars=[period_metrics.value,'BINS','SCORE_BAND'],value_vars=cols,
271
+ var_name='SCORE',
272
+ value_name='BAD_RATE').dropna().reset_index(drop = True).copy()
273
+ standalone_scores_OOT[['BINS', 'SCORE_BAND']] = standalone_scores_OOT[['BINS', 'SCORE_BAND']].astype(str)
274
+ standalone_scores_OOT = pd.concat([standalone_scores_OOT["BINS"] + "-" + standalone_scores_OOT["SCORE_BAND"] + "-" + standalone_scores_OOT["SCORE"],
275
+ standalone_scores_OOT[[period_metrics.value,'BAD_RATE']]], axis = 1).rename(columns = {0:'BINS_SCOREBAND_SCORE'})
276
+ standalone_scores_OOT = standalone_scores_OOT.pivot(index = 'BINS_SCOREBAND_SCORE', columns=period_metrics.value)['BAD_RATE'].reset_index()
277
+ standalone_scores_OOT.index.name = ""
278
+ standalone_scores_OOT.columns.name = ""
279
+ standalone_scores_OOT = pd.concat([standalone_scores_OOT['BINS_SCOREBAND_SCORE'].str.split('-', expand=True),
280
+ standalone_scores_OOT],axis = 1).rename(columns ={0:'BINS', 1: 'SCORE_BAND', 2: 'SCORE'}).drop(columns = 'BINS_SCOREBAND_SCORE')
281
+ # standalone_scores_OOT[['BINS', 'SCORE_BAND']] = standalone_scores_OOT[['BINS', 'SCORE_BAND']]#.astype(int)
282
+ standalone_scores_OOT['BINS'] = standalone_scores_OOT['BINS']
283
+ standalone_scores_OOT.sort_values(['SCORE', 'SCORE_BAND'], inplace = True)
284
+ return standalone_scores_OOT, lift_chart_data_OOT
285
+
286
+ def lift_init_plots(df:pd.DataFrame, is_baseline = True):
287
+ from tqdm import tqdm
288
+ # global standalone_scores_OOT
289
+ cols = ['SCORE']
290
+
291
+ lift_chart_data_OOT = pd.DataFrame()
292
+ for q in tqdm([10,20,50,100]):
293
+ # df_new["QUARTER"] = pd.PeriodIndex(df_new.CREATE_DATE, freq='Q')
294
+ # fd = baseline.dropna(subset = period_metrics.value)[cols].apply(lambda col: pd.qcut(col.rank(method='first'),q = q, ), axis = 0).copy()
295
+ # pd.cut(prod['SCORE'], bins = pd.qcut(baseline['SCORE'],10, retbins = True)[1])
296
+ # fd = df.dropna(subset = period_metrics.value)[cols].apply(lambda col: pd.cut(col, bins = pd.qcut(col,q=q, retbins = True)[1]) , axis = 0).copy()
297
+ if is_baseline == True:
298
+ fd = df.dropna(subset = period_metrics.value)[cols].apply(lambda col: pd.qcut(col.rank(method='first'),q = q, labels=range(1, q + 1)), axis = 0).copy()
299
+ fd = pd.concat([df.dropna(subset = period_metrics.value)[period_metrics.value], df.dropna(subset = period_metrics.value)['TARGET'], fd], axis = 1)
300
+ fd = pd.concat([fd.groupby(x)['TARGET'].mean().fillna(0) for x in fd[cols]], axis = 1, keys = cols)
301
+ fd.index.name = 'SCORE_BAND'
302
+
303
+ else:
304
+ fd = df.groupby(period_metrics.value).apply(lambda col: col[cols].apply(lambda col: pd.qcut(col.rank(method='first'),q = q, labels=range(1,q + 1)), axis = 0)).copy()
305
+ fd = pd.concat([df[period_metrics.value], df['TARGET'], fd], axis = 1)
306
+ fd = fd.groupby(period_metrics.value).apply(lambda col: pd.concat([col.groupby(x)['TARGET'].mean().fillna(0) for x in col[cols]], axis = 1, keys = cols))
307
+ # print(fd.index)
308
+ fd.index.names = [period_metrics.value, 'SCORE_BAND']
309
+ # fd = fd.reset_index(names = ['APPLICATION_MONTH', 'SCORE_BAND'])
310
+ fd = fd.reset_index()
311
+ # fd['APPLICATION_MONTH'] = fd['APPLICATION_MONTH'].astype(str)
312
+ fd['BINS'] = q
313
+ lift_chart_data_OOT = lift_chart_data_OOT.append(fd)
314
+ if is_baseline == True:
315
+ lift_chart_data_OOT[period_metrics.value] = 'Baseline'
316
+ lift_chart_data_OOT.sort_values(['SCORE', 'SCORE_BAND'], inplace = True)
317
+ standalone_scores_OOT = lift_chart_data_OOT.melt(id_vars=[period_metrics.value,'BINS','SCORE_BAND'],value_vars=cols,
318
+ var_name='SCORE',
319
+ value_name='BAD_RATE').dropna().reset_index(drop = True).copy()
320
+ standalone_scores_OOT[['BINS', 'SCORE_BAND']] = standalone_scores_OOT[['BINS', 'SCORE_BAND']].astype(str)
321
+ standalone_scores_OOT = pd.concat([standalone_scores_OOT["BINS"] + "-" + standalone_scores_OOT["SCORE_BAND"] + "-" + standalone_scores_OOT["SCORE"],
322
+ standalone_scores_OOT[[period_metrics.value,'BAD_RATE']]], axis = 1).rename(columns = {0:'BINS_SCOREBAND_SCORE'})
323
+ standalone_scores_OOT = standalone_scores_OOT.pivot(index = 'BINS_SCOREBAND_SCORE', columns=period_metrics.value)['BAD_RATE'].reset_index()
324
+ standalone_scores_OOT.index.name = ""
325
+ standalone_scores_OOT.columns.name = ""
326
+ standalone_scores_OOT = pd.concat([standalone_scores_OOT['BINS_SCOREBAND_SCORE'].str.split('-', expand=True),
327
+ standalone_scores_OOT],axis = 1).rename(columns ={0:'BINS', 1: 'SCORE_BAND', 2: 'SCORE'}).drop(columns = 'BINS_SCOREBAND_SCORE')
328
+ standalone_scores_OOT[['BINS', 'SCORE_BAND']] = standalone_scores_OOT[['BINS', 'SCORE_BAND']].astype(int)
329
+ standalone_scores_OOT['BINS'] = standalone_scores_OOT['BINS']
330
+ standalone_scores_OOT.sort_values(['SCORE', 'SCORE_BAND'], inplace = True)
331
+ return standalone_scores_OOT
332
+
333
+ def save_csv(df, metric):
334
+ from io import StringIO
335
+ sio = StringIO()
336
+ df.to_csv(sio)
337
+ sio.seek(0)
338
+ return pn.widgets.FileDownload(sio, embed=True, filename='%s.csv'%metric)
339
+
340
+ def get_xlsx(df1,df2,df3,df4,df5,df6):
341
+ from io import BytesIO
342
+ output = BytesIO()
343
+ writer = pd.ExcelWriter(output, engine='xlsxwriter')
344
+ df1.to_excel(writer, sheet_name="PSI")
345
+ df2.to_excel(writer, sheet_name="AUC")
346
+ df3.to_excel(writer, sheet_name="KS")
347
+ df4.to_excel(writer, sheet_name="LABEL_DRIFT")
348
+ df5.to_excel(writer, sheet_name="LABEL_Tables")
349
+ df6.to_excel(writer, sheet_name="GAINS_Tables")
350
+ writer.save() # Important!
351
+ output.seek(0) # Important!
352
+ return pn.widgets.FileDownload(output,embed=True, filename='results.csv', button_type="primary")
353
+
354
+
355
+ def expected_calibration_error(y, proba, bins = 'fd'):
356
+ import numpy as np
357
+ bin_count, bin_edges = np.histogram(proba, bins = bins)
358
+ n_bins = len(bin_count)
359
+ bin_edges[0] -= 1e-8 # because left edge is not included
360
+ bin_id = np.digitize(proba, bin_edges, right = True) - 1
361
+ bin_ysum = np.bincount(bin_id, weights = y, minlength = n_bins)
362
+ bin_probasum = np.bincount(bin_id, weights = proba, minlength = n_bins)
363
+ bin_ymean = np.divide(bin_ysum, bin_count, out = np.zeros(n_bins), where = bin_count > 0)
364
+ bin_probamean = np.divide(bin_probasum, bin_count, out = np.zeros(n_bins), where = bin_count > 0)
365
+ ece = np.abs((bin_probamean - bin_ymean) * bin_count).sum() / len(proba)
366
+ return ece, bin_probamean, bin_ymean, bin_id, bin_count, bin_edges
367
+ ###############################
368
+ ###END OFF UTILITY FUNCTIONS###
369
+ ###############################
370
+
371
+ text = """
372
+ #Classification Model Metrics
373
+ ## AUTHOR: [`FIRAS ALI OBEID`](https://www.linkedin.com/in/feras-obeid/)
374
+ ### GNU General Public License v3.0 (GPL-3.0)
375
+ #### Developed while working at [OppFi Inc.](https://www.oppfi.com/)
376
+
377
+ This tool performs ML model ,in production, monitoring across time,
378
+ where production weeks/months/quarters are compared too a selective baseline.
379
+
380
+ 1. Upload a CSV containing:
381
+
382
+ **(Date)** Highly Recommended but **optional**
383
+ **(Score)** Probability Predictions
384
+ **(Target)** Binary Target/True Label
385
+
386
+ 2. Check the box if you CSV has a DATE column, otherwise dates are generated based on current timestamp and spanning back by
387
+ timedelta of csv length in hourly frequency.
388
+
389
+ 3. Choose & press the right columns in the `Select Boxes` below when you upload a csv
390
+
391
+ 4. Select a baseline date slice **mandatory**. If your baseline is from a different time then the production time,
392
+ make sure to append it to the csv before uploading.
393
+
394
+ 5. Press Get Metrics
395
+
396
+ 6. Wait few seconds and analyze the updated charts
397
+ """
398
+
399
+
400
+
401
+ # date = str(input('What is the name off the date column: ').upper())
402
+ # id_ = str(input('What is the name off the APP name/ID column: ').upper())
403
+ # score = str(input('What is the name off the score column (i.e UW5,DM_QL...): ').upper())
404
+ # target = str(input('What is the name off the Target column (i.e Real target values such as PD70_RATIO...: ').upper())
405
+
406
+ file_input = pn.widgets.FileInput(align='center')
407
+ date_selector = pn.widgets.Select(name='Select Date Column',)
408
+ check_date = pn.widgets.Checkbox(name = '<--',value = False) # T/F
409
+ target_selector = pn.widgets.Select(name='Select Target Variable(True Label)')
410
+ score_selector = pn.widgets.Select(name='Select Predictions Column(Raw Probaility)')
411
+ period_metrics = pn.widgets.Select(name='Select Period', options = ['MONTHLY','WEEKLY', 'QUARTERLY'])
412
+
413
+ date_range_ = pn.widgets.DateRangeSlider(name='Baseline Period',) #value=(start, end), start=start, end=end
414
+
415
+ random_seed = pn.widgets.IntSlider(name='Random Seed for Random Generated Data (OnSet)', value=42, start=0, end=1000, step=1)
416
+
417
+ button = pn.widgets.Button(name='Get Metrics')
418
+ widgets = pn.WidgetBox(
419
+ pn.panel(text, margin=(0, 20)),
420
+ pn.panel('**Check box if your data has a date column *before uploading the file* \n (otherwise keep it empty)**'),
421
+ check_date,
422
+ file_input,
423
+ random_seed,
424
+ pn.panel('\n'),
425
+ date_selector,
426
+ target_selector,
427
+ score_selector,
428
+ period_metrics,
429
+ date_range_,
430
+ button
431
+ )
432
+
433
+ # start, end = stocks.index.min(), stocks.index.max()
434
+ # year = pn.widgets.DateRangeSlider(name='Year', value=(start, end), start=start, end=end)
435
+ # ,id_:'ID',
436
+
437
+
438
+ def get_data():
439
+ global df
440
+ if file_input.value is None:
441
+ np.random.seed(random_seed.value)
442
  try:
443
+ df = pd.DataFrame({'DATE': pd.date_range(start = (datetime.datetime.today() - pd.DateOffset(hours = 9999)), end = datetime.datetime.today(), tz = "US/Eastern", freq = "H"),
444
+ 'ID': [i for i in range(10000)],
445
+ 'SCORE':np.random.uniform(size = 10000),
446
+ 'TARGET': np.random.choice([0,1],10000, p=[0.9,0.1])})
447
+ except:
448
+ try:
449
+ df = pd.DataFrame({'DATE': pd.date_range(start = (datetime.datetime.today() - pd.DateOffset(hours = 9999 + 1)), end = datetime.datetime.today(), tz = "US/Eastern", freq = "H"),
450
+ 'ID': [i for i in range(10000)],
451
+ 'SCORE':np.random.uniform(size = 10000),
452
+ 'TARGET': np.random.choice([0,1],10000, p=[0.9,0.1])})
453
+ except:
454
+ df = pd.DataFrame({'DATE': pd.date_range(start = (datetime.datetime.today() - pd.DateOffset(hours = 9999 - 1)), end = datetime.datetime.today(), tz = "US/Eastern", freq = "H"),
455
+ 'ID': [i for i in range(10000)],
456
+ 'SCORE':np.random.uniform(size = 10000),
457
+ 'TARGET': np.random.choice([0,1],10000, p=[0.9,0.1])})
458
+ # df.to_csv("test_upload.csv")
459
+ else:
460
+ df = BytesIO()
461
+ df.write(file_input.value)
462
+ df.seek(0)
463
+ try:
464
+ df = pd.read_csv(df, error_bad_lines=False).apply(pd.to_numeric, errors='ignore')
465
+ except:
466
+ df = pd.read_csv(df, error_bad_lines=False)
467
+
468
+ df = df.select_dtypes(exclude=["category"])
469
+ df = df.replace([np.inf, -np.inf], np.nan)
470
+ df.columns = [i.upper() for i in df.columns]
471
+ return df
472
+
473
+ def update_target(event):
474
+ df = get_data()
475
+ cols = list(df.columns)
476
+ date_selector.set_param(options=cols)
477
+ target_selector.set_param(options=cols)
478
+ score_selector.set_param(options=cols)
479
+ # print(check_date.value)
480
+ # print(type(df.DATE.min()))
481
+ if check_date.value == True:
482
+ date_column = [i.find("DATE") for i in df.columns]
483
+ date_column = [date_column.index(i) for i in [i for i in date_column if i !=-1]]
484
+ if len(date_column) > 0:
485
+ df = df.iloc[:,date_column].iloc[:,[0]]
486
+ df.columns = ['DATE']
487
+ print(type(df.DATE.min()))
488
+ start, end = pd.Timestamp(df.DATE.min()), pd.Timestamp(df.DATE.max())
489
+ try:
490
+ date_range_.set_param(value=(start, end), start=start, end=end)
491
+ except:
492
+ date_range_.set_param(value=(end, start), start=end, end=start)
493
+ else:
494
+ print('Creating synthetic dates')
495
+ synthetic_date = pd.date_range(start = (datetime.datetime.today() - pd.DateOffset(hours = len(df))), end = datetime.datetime.today(), tz = "US/Eastern", freq = "H") #remove len(df) - 1
496
+ df['DATE'] = synthetic_date[:len(df)]
497
+ start, end = df.DATE.min(), df.DATE.max()
498
+ date_range_.set_param(value=(start, end), start=start, end=end)
499
+ else:
500
+ print('Creating synthetic dates')
501
+ synthetic_date = pd.date_range(start = (datetime.datetime.today() - pd.DateOffset(hours = len(df))), end = datetime.datetime.today(), tz = "US/Eastern", freq = "H")
502
+ df['DATE'] = synthetic_date[:len(df)]
503
+ start, end = df.DATE.min(), df.DATE.max()
504
+ date_range_.set_param(value=(start, end), start=start, end=end)
505
+ file_input.param.watch(update_target, 'value')
506
+ update_target(None)
507
+
508
@pn.depends(button.param.clicks)
def run(_):
    """Build the full model-monitoring report (Metrics + Charts tabs) on each button click.

    Reads the uploaded data and the widget selections (score/target/date columns,
    baseline date range, reporting period), splits the rows into a baseline slice
    and a production slice, then computes PSI, AUC, KS, lift/gains tables, label
    drift and score calibration. Returns a ``pn.Tabs`` layout, or a
    ``pn.pane.Markdown`` showing the error message when preprocessing fails.

    NOTE(review): this function relies on module-level widgets and helpers defined
    elsewhere in this file (get_data, date_range_, period_metrics, add_extremes_OOT,
    psi, percentage, AUC, ks, ROC, lift_init, gains_table_proba,
    expected_calibration_error, save_csv, get_xlsx); their behavior is inferred
    from the call sites only — confirm against their definitions.
    """
    print(random_seed.value)
    print(score_selector.value)
    df = get_data()
    # --- Column normalization: map the user-selected columns onto the fixed
    # DATE / SCORE / TARGET names the rest of this function expects.
    try:
        if file_input.value is None:
            pass  # no upload yet — presumably get_data() returned a default frame; TODO confirm
        elif check_date.value == True:
            df = df.rename(columns={date_selector.value:'DATE',score_selector.value:'SCORE',target_selector.value:'TARGET'})
        else:
            # No date column selected: synthesize an hourly US/Eastern DATE index ending now.
            synthetic_date = pd.date_range(start = (datetime.datetime.today() - pd.DateOffset(hours = len(df) - 1)), end = datetime.datetime.today(), tz = "US/Eastern", freq = "H")
            df['DATE'] = synthetic_date[:len(df)]
            df = df.rename(columns={score_selector.value:'SCORE',target_selector.value:'TARGET'})
    except Exception as e:
        # Surface the preprocessing error in the UI instead of crashing the app.
        return pn.pane.Markdown(f"""{e}""")
    # --- Derive the period keys (MONTHLY / QUARTERLY / WEEKLY) used for grouping.
    try:
        df.DATE = pd.to_datetime(df.DATE, format="%Y-%m-%d %H:%M:%S", utc = True)
        # print(pd.to_datetime(df.DATE,utc = True))
        df["MONTHLY"] = df["DATE"].dt.strftime('%Y-%m')
        print(f"J - DAYS COUNT: {datetime.datetime.now() - pd.Timestamp('2023-03-06 03:27')}" )
        df['QUARTERLY'] = pd.PeriodIndex(df.DATE, freq='Q').astype(str)
        df['WEEKLY'] = pd.PeriodIndex(df.DATE, freq='W').astype(str)
    except Exception as e:
        return pn.pane.Markdown(f"""{e}""")
    df = df.reset_index().rename(columns={df.index.name:'ID'}) #create synthetic prediction ID for my code to run
    # df = df.dropna(subset = 'TARGET', axis = 1)
    # NOTE(review): operator precedence looks wrong on the next line — as written it
    # keeps rows where TARGET is present OR SCORE is missing. Dropping rows with a
    # missing TARGET or SCORE would be ~(df.TARGET.isna() | df.SCORE.isna()) — confirm intent.
    df = df[~(df.TARGET.isna()) | (df.SCORE.isna())]
    if df.TARGET.nunique() > 2:
        # Binarize a non-binary target: positive values -> 1, everything else -> 0.
        df.TARGET = np.where(df.TARGET > 0 , 1 , 0)
    df.SCORE = df.SCORE.astype(np.float64)


    # baselines
    # try:
    #     baseline = df.set_index('MONTHLY').loc[date_range_.value[0]: date_range_.value[1]].reset_index().copy()
    # except:
    #     baseline = df.copy()
    #     baseline = baseline.set_index('MONTHLY')
    #     baseline.index = pd.to_datetime(baseline.index)
    #     baseline = baseline.loc[date_range_.value[0]: date_range_.value[1]].reset_index()
    #     baseline["MONTHLY"] = baseline["MONTHLY"] .dt.strftime('%Y-%m')
    print(date_range_.value[0])
    print(date_range_.value[1])

    # --- Split: baseline = rows inside the selected date range; production = the rest.
    baseline = df.set_index('DATE').sort_index().loc[date_range_.value[0]: date_range_.value[1]].reset_index()
    print(baseline.DATE.min())
    print(baseline.DATE.max())
    print(df.DATE.max())

    # print(df.set_index('DATE').loc[date_range_.value[0]: date_range_.value[1]].index.max())
    #prods
    # prod = df.loc[~df.MONTHLY.isin(list(baseline.MONTHLY.unique()))].copy()
    prod_dates = df.set_index('DATE').sort_index().index.difference(baseline.set_index('DATE').index)
    # print(prod_dates)
    prod = df.set_index('DATE').loc[prod_dates].reset_index()
    if len(baseline) > len(prod):
        # Baseline covers most of the data: fall back to comparing the baseline with itself.
        prod = baseline
    ##START##
    intiate = pn.pane.Alert('''### Baseline Period: \n%s to %s
    '''%(baseline.DATE.min(),baseline.DATE.max()), alert_type="info")
    intiate2 = pn.pane.Alert('''### Production Period: \n%s to %s
    '''%(prod.DATE.min(),prod.DATE.max()), alert_type="info")
    if prod.equals(baseline):
        intiate3 = pn.pane.Alert('''### Baseline Set is identical to Production Set \n Please choose a slice to be a baseline set''', alert_type="danger")
    else:
        intiate3 = None
    ##PSI## — population stability index of SCORE deciles per reporting period.
    baseline_psi = baseline.copy()
    prod_psi = prod.copy()

    baseline_psi = add_extremes_OOT(baseline_psi, name = 'ID', score = 'SCORE')
    # Bucket production scores into the baseline's decile edges (qcut retbins).
    prod_psi["DEC_BANDS"] = pd.cut(prod_psi['SCORE'], bins = pd.qcut(baseline_psi['SCORE'],10, retbins = True)[1])
    prod_psi = prod_psi.groupby([period_metrics.value,
                                 "DEC_BANDS"]).agg(Count = ("DEC_BANDS",
                                                            "count")).sort_index(level = 0).reset_index()
    prod_psi = prod_psi.groupby(period_metrics.value).apply(percentage).drop("Count",axis = 1)

    # Baseline decile distribution, normalized to percentages, tagged as "validation".
    baseline_psi["DECILE"] = pd.cut(baseline_psi['SCORE'], bins = pd.qcut(baseline_psi['SCORE'],10, retbins = True)[1])
    baseline_psi = baseline_psi["DECILE"].value_counts()
    baseline_psi = baseline_psi / sum(baseline_psi)
    baseline_psi = baseline_psi.reset_index().rename(columns={"index":"DEC_BANDS", "DECILE": "percent"})
    baseline_psi[period_metrics.value] = "validation"
    baseline_psi = baseline_psi[[period_metrics.value, "DEC_BANDS", "percent"]]

    prod_psi = pd.concat([prod_psi,baseline_psi])

    prod_psi = prod_psi.pivot(index = "DEC_BANDS", columns=period_metrics.value)["percent"]
    if len(baseline) < len(prod):
        psi_ = psi(prod_psi).to_frame("%s_PSI"%period_metrics.value)
        psi_results = pn.widgets.DataFrame(psi_)
    else:
        # No meaningful baseline/production split selected — skip PSI.
        psi_ = pd.DataFrame()
        psi_results = pn.pane.Alert("### Choose a Baseline in the left banner to get PSI results", alert_type="warning")
    #CONFIGS — collapse the baseline's period labels into a single "Baseline: first_last" tag.
    baseline['QUARTERLY'] = 'Baseline: '+ baseline['QUARTERLY'].unique()[0] + '_' + baseline['QUARTERLY'].unique()[-1]
    baseline['MONTHLY'] = 'Baseline: '+ baseline['MONTHLY'].unique()[0] + '_' + baseline['MONTHLY'].unique()[-1]
    baseline['WEEKLY'] = 'Baseline: '+ baseline['WEEKLY'].unique()[0] + '_' + baseline['WEEKLY'].unique()[-1]
    #AUC
    auc_b = baseline.groupby([period_metrics.value]).apply(AUC)
    auc_p = prod.groupby([period_metrics.value]).apply(AUC)
    baseline_auc = pn.widgets.DataFrame(auc_b)
    prod_auc = pn.widgets.DataFrame(auc_p,name = 'AUC') #autosize_mode='fit_columns'

    # --- ROC: one curve per production period, overlaid on the baseline curve and the diagonal.
    from sklearn.metrics import roc_curve
    from holoviews import Slope
    b_label = baseline.MONTHLY.min()
    FPR,TPR,T = roc_curve(baseline['TARGET'],baseline['SCORE'])
    roc_baseline = pd.concat([pd.Series(TPR), pd.Series(FPR)], keys = ['TPR', 'FPR'], axis = 1)
    roc_baseline_p = roc_baseline.hvplot.line(x ='FPR', y = 'TPR', label = "Baseline", color = 'red')

    roc_plot = prod.groupby([period_metrics.value]).apply(ROC).reset_index(level = 0).hvplot.line(x ='FPR', y = 'TPR', title = "%s ROC (Production VS %s)"%(period_metrics.value, b_label),
                                                                                                 groupby = period_metrics.value, width = 600, height = 500, label = "Prod",
                                                                                                 xlim = (0,1), ylim = (0,1), grid = True) * Slope(slope=1, y_intercept=0).opts(color='black', line_dash='dashed') * roc_baseline_p
    #KS
    ks_b = baseline.groupby([period_metrics.value]).apply(ks)
    ks_p = prod.groupby([period_metrics.value]).apply(ks)
    baseline_ks = pn.widgets.DataFrame(ks_b)
    prod_ks = pn.widgets.DataFrame(ks_p,name = 'AUC') #autosize_mode='fit_columns'

    #LIFT
    baseline_lift_raw, baseline_lift_raw_bins = lift_init(df = baseline)
    baseline_lift_raw = baseline_lift_raw.rename(columns = {'Baseline': b_label})
    prod_lift_raw, prod_lift_raw_bins = lift_init(df = prod, baseline = baseline, is_baseline = False)
    cols_b = baseline_lift_raw.columns.drop(['BINS', 'SCORE'])
    cols = prod_lift_raw.columns.drop(['BINS', 'SCORE'])

    # Keep only the top decile row (BINS == '10') for the summary lift tables.
    baseline_lift = baseline_lift_raw.loc[baseline_lift_raw.BINS =='10',cols_b]
    prod_lift = prod_lift_raw.loc[prod_lift_raw.BINS =='10',cols]
    # prod_lift = pd.concat([prod_lift.dropna(subset = [col]).dropna(axis = 1).reset_index(drop = 1) for col in prod_lift][1:], axis = 1)
    lift_table = prod_lift_raw.loc[prod_lift_raw.BINS =='10',cols].melt(id_vars="SCORE_BAND",
                                                                        var_name='column',
                                                                        value_name='value').dropna().reset_index(drop = True).rename(columns = {'column':period_metrics.value , 'value': 'Target_PCT'})
    # print(prod_lift_raw_bins.loc[prod_lift_raw_bins.BINS ==10])
    lift_table = lift_table.hvplot.table(groupby = period_metrics.value, title="%s Lift Table"%period_metrics.value, hover = True, responsive=True,
                                         shared_axes= False, fit_columns = True,
                                         padding=True , index_position = 0, fontscale = 1.5)
    # print(prod_lift_raw.loc[prod_lift_raw.BINS =='10',cols])
    # print(baseline_lift_raw.loc[baseline_lift_raw.BINS == '10',cols_b])
    prod_lift_raw['BINS'] = prod_lift_raw['BINS'].astype(int)
    baseline_lift_raw['BINS'] = baseline_lift_raw['BINS'].astype(int)

    # hvplot groups on string axes more predictably than on Interval categories.
    prod_lift_raw_bins['SCORE_BAND'] = prod_lift_raw_bins['SCORE_BAND'].astype(str)
    # prod_lift_raw_bins['BINS'] = prod_lift_raw_bins['BINS'].astype(str)

    baseline_lift_raw_bins['SCORE_BAND'] = baseline_lift_raw_bins['SCORE_BAND'].astype(str)
    # baseline_lift_raw_bins['BINS'] = baseline_lift_raw_bins['BINS'].astype(str)

    # print(prod_lift_raw.loc[:,list(cols)+['BINS']])
    # --- Lift charts: production line+scatter overlaid on the dashed baseline line+scatter.
    p1 = prod_lift_raw_bins.set_index('SCORE_BAND'
                                      ).reset_index().hvplot.line(x = 'SCORE_BAND', groupby = ['BINS', period_metrics.value],
                                                                  grid = True, width = 1200, height = 500,
                                                                  label = 'Production', rot = 45)

    # print(baseline_lift_raw_bins)
    # print(prod_lift_raw_bins)
    p2 = prod_lift_raw_bins.set_index('SCORE_BAND'
                                      ).reset_index().hvplot.scatter(x = 'SCORE_BAND', groupby = ['BINS', period_metrics.value], grid = True, color='DarkBlue', label='Production', rot = 45)

    b_label = baseline.MONTHLY.min()  # NOTE(review): redundant — identical assignment a few lines above.
    # print(baseline_lift_raw.loc[baseline_lift_raw.BINS == '10',cols_b][b_label])
    b1 = baseline_lift_raw_bins.hvplot.line(x = 'SCORE_BAND', groupby = ['BINS'],
                                            grid = True, width = 1200, height = 500,
                                            line_dash='dashed', color = 'black', label = b_label, rot = 45)

    b2 = baseline_lift_raw_bins.hvplot.scatter(x = 'SCORE_BAND', groupby = ['BINS'], grid = True, color='DarkGreen', label = b_label, rot = 45)

    final_lift_plots = (p1*p2*b1*b2).opts(ylabel = '%target_rate_mean', title = "%s Lift Chart " % (period_metrics.value.title()))

    #LABEL_DRIFT — mean score vs mean target (and row count) per period.
    mean_score_prod = prod.groupby(period_metrics.value).agg(MEAN_SCORE=("SCORE","mean"), MEAN_TARGET=("TARGET","mean"),Count = ("TARGET","count"))
    mean_score_base = baseline.groupby(period_metrics.value).agg(MEAN_SCORE=("SCORE","mean"), MEAN_TARGET=("TARGET","mean"),Count = ("TARGET","count"))
    baseline_label_drift = pn.widgets.DataFrame(mean_score_base)
    prod_label_drift = pn.widgets.DataFrame(mean_score_prod,name = 'DRIFT')

    #Lift Tables
    # gains_final_all,_ = gains_table_proba(prod,'TARGET', 'SCORE')
    lift_data = pd.concat([baseline_lift, prod_lift], axis = 0)
    # Align the per-period columns side by side, dropping all-NaN columns.
    lift_data = pd.concat([lift_data.dropna(subset = [col]).dropna(axis = 1).reset_index(drop = 1) for col in lift_data][1:], axis = 1).dropna(axis = 1, how = 'any')
    lift_data = lift_data.loc[:,~lift_data.columns.duplicated()].set_index('SCORE_BAND')
    if (lift_data.shape[1] > 4) | (lift_data.shape[0] > 10):
        # Too wide/tall to render comfortably; the CSV download still has everything.
        prod_lift = pn.pane.Markdown('### Please download the csv as the lift table will congest the screen')
    else:
        prod_lift = pn.widgets.DataFrame(lift_data,name = 'LIFT')
    #GAINS_TABLE
    gains_final_prod,_ = gains_table_proba(prod,'TARGET', 'SCORE')
    gains_final_base,_ = gains_table_proba(baseline,'TARGET', 'SCORE')
    gains_final_base.index.names = [b_label]
    gains_final_p = pn.widgets.DataFrame(gains_final_prod.set_index(['low','high']),name = 'GAINS',)
    gains_final_b = pn.widgets.DataFrame(gains_final_base.set_index(['low','high']),name = 'GAINS',)

    # --- Calibration: mean predicted score vs observed target rate per score bin,
    # with the diagonal marking perfect calibration.
    ece, bin_probamean, bin_ymean, bin_id, bin_count, bin_edges = expected_calibration_error(prod.TARGET.values, prod.SCORE.values)
    error = pd.DataFrame(np.array([bin_probamean, bin_ymean]).T,columns= ["SCORE_MEAN", "TARGET_MEAN"])
    error_plot = error.hvplot.scatter(x ='SCORE_MEAN', y = 'TARGET_MEAN', width = 800, height = 500, label = "Bin (Score vs Target Mean)", title = 'Model Scores Calibration (--- Perfect Calibration)',
                                      xlim = (0,1), ylim = (0,1), grid = True, xlabel = 'Bins Mean of Scores', ylabel = 'Bins Mean of Target') * Slope(slope=1, y_intercept=0,legend = 'Perfect Calibration').opts(color='black', line_dash='dashed')
    variable_ = pn.pane.Alert('''### FJ Day Count: \n%s
    '''%(datetime.datetime.now() - pd.Timestamp('2023-03-06 03:27')), alert_type="success")
    # --- Layout: tables + CSV/XLSX downloads in 'Metrics', all plots in 'Charts'.
    return pn.Tabs(
        ('Metrics', pn.Column(
            pn.Row(intiate, intiate2, intiate3, width = 1200),
            '# PSI',
            pn.Row(psi_results, save_csv(psi_, 'PSI')),
            '# AUC',
            pn.Row(prod_auc, baseline_auc, save_csv(pd.concat([auc_b, auc_p], axis = 0), 'AUC')),
            '# KS',
            pn.Row(prod_ks, baseline_ks, save_csv(pd.concat([ks_b, ks_p], axis = 0), 'KS')),
            '# LABEL DRIFT',
            pn.Row(prod_label_drift, baseline_label_drift, save_csv(pd.concat([mean_score_base, mean_score_prod], axis = 0), 'LABEL_DRIFT')),
            '# LIFT TABLES',
            pn.Row(prod_lift, save_csv(lift_data, 'LIFT_TABLES')),
            '# GAINS TABLE',
            pn.Row(gains_final_b, gains_final_p, save_csv(pd.concat([gains_final_base, gains_final_prod], axis = 1), 'GAINS_TABLES')),
            get_xlsx(psi_, pd.concat([auc_b, auc_p], axis = 0), pd.concat([ks_b, ks_p], axis = 0), pd.concat([mean_score_base, mean_score_prod], axis = 0), lift_data, pd.concat([gains_final_base, gains_final_prod], axis = 1)),
            pn.Row(variable_, width = 200),
            )
        ), #sizing_mode='stretch_width'
        ('Charts', pn.Column(pn.Row(roc_plot.opts(legend_position = 'bottom_right'), error_plot.opts(legend_position = 'top_left')) ,
                             lift_table,
                             final_lift_plots.opts(legend_position = 'bottom_right')
                             )
        )

    )
732
+
733
+
734
+ # return pn.Tabs(
735
+ # ('Analysis', pn.Column(
736
+ # pn.Row(vol_ret, pn.layout.Spacer(width=20), pn.Column(div, table), sizing_mode='stretch_width'),
737
+ # pn.Column(pn.Row(year, investment), return_curve, sizing_mode='stretch_width'),
738
+ # sizing_mode='stretch_width')),
739
+ # ('Timeseries', timeseries),
740
+ # ('Log Return', pn.Column(
741
+ # '## Daily normalized log returns',
742
+ # 'Width of distribution indicates volatility and center of distribution the mean daily return.',
743
+ # log_ret_hists,
744
+ # sizing_mode='stretch_width'
745
+ # ))
746
+ # )
747
+
748
+ pn.Row(pn.Column(widgets), pn.layout.Spacer(width=30), run).servable()
749
+
750
+
751
+
752
+
753
+
754
+
755
+
756
+
757
+
758
+
759
+
760
+
761
+
762
+ # Caveats
763
+ # The maximum sizes set in either Bokeh or Tornado refer to the maximum size of the message that
764
+ # is transferred through the web socket connection, which is going to be larger than the actual
765
+ # size of the uploaded file since the file content is encoded in a base64 string. So if you set a
766
+ # maximum size of 100 MB for your application, you should indicate to your users that the upload
767
+ # limit is a value that is less than 100 MB.
768
+
769
+ # When a file whose size is larger than the limits is selected by a user, their browser/tab may
770
+ # just crash. Alternatively the web socket connection can close (sometimes with an error message
771
+ # printed in the browser console such as [bokeh] Lost websocket 0 connection, 1009 (message too
772
+ # big)) which means the application will become unresponsive and needs to be refreshed.
773
+
774
+ # app = ...
775
+
776
+ # MAX_SIZE_MB = 150
777
 
778
+ # pn.serve(
779
+ # app,
780
+ # # Increase the maximum websocket message size allowed by Bokeh
781
+ # websocket_max_message_size=MAX_SIZE_MB*1024*1024,
782
+ # # Increase the maximum buffer size allowed by Tornado
783
+ # http_server_kwargs={'max_buffer_size': MAX_SIZE_MB*1024*1024}
784
+ # )