Synav commited on
Commit
d1146a6
·
verified ·
1 Parent(s): c73f26c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +281 -0
app.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from datetime import datetime, timezone

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import streamlit as st
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
17
+
18
+
19
# ============================================================
# Fixed schema definition (PLACEHOLDER FRAMEWORK)
# ============================================================
# Column layout is fixed by contract: uploaded Excel sheets must carry
# single-letter feature columns A..Z plus the binary label column AA.
FEATURE_COLS = [chr(code) for code in range(ord("A"), ord("A") + 26)]  # A..Z
NUM_COLS = FEATURE_COLS[:13]   # A–M → numeric
CAT_COLS = FEATURE_COLS[13:]   # N–Z → categorical
LABEL_COL = "AA"
26
+
27
+
28
# ============================================================
# Model pipeline
# ============================================================
def build_pipeline():
    """Assemble the preprocessing + logistic-regression pipeline.

    Numeric columns (A–M) are median-imputed and standardized; categorical
    columns (N–Z) are mode-imputed and one-hot encoded, with unknown
    categories at inference time ignored rather than raising.
    """
    numeric_steps = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_steps = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ])

    column_prep = ColumnTransformer(
        transformers=[
            ("num", numeric_steps, NUM_COLS),
            ("cat", categorical_steps, CAT_COLS),
        ],
        remainder="drop",                 # silently drop any extra columns
        verbose_feature_names_out=False,  # keep un-prefixed feature names
    )

    model = LogisticRegression(max_iter=2000, solver="lbfgs")

    return Pipeline(steps=[
        ("preprocess", column_prep),
        ("clf", model),
    ])
57
+
58
+
59
# ============================================================
# Validation utilities
# ============================================================
def validate_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Verify the uploaded frame carries the fixed A..Z + AA schema.

    Returns a copy restricted to exactly the expected columns, in schema
    order. Raises ValueError naming every required column that is absent.
    """
    required = FEATURE_COLS + [LABEL_COL]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(
            f"Missing required columns: {missing}. "
            f"Excel must contain columns A..Z and AA exactly."
        )
    return df[required].copy()
70
+
71
+
72
def coerce_binary_label(y: pd.Series):
    """Map a two-valued label series onto {0, 1}.

    The "larger" of the two observed values — numerically for numeric
    labels, lexicographically for strings — becomes the positive class.

    Returns:
        (y01, pos): a 0/1 int numpy array aligned with ``y``, and the raw
        value that was mapped to 1.

    Raises:
        ValueError: if the series does not contain exactly 2 distinct
        non-null values.

    Note: NaN rows compare unequal to the positive class and therefore
    land in class 0 silently rather than raising.
    """
    y_clean = y.dropna()
    uniq = list(pd.unique(y_clean))
    if len(uniq) != 2:
        raise ValueError(f"AA must be binary (2 unique values). Found: {uniq}")

    # Check bool BEFORE the generic numeric test: pandas classifies bool as
    # a numeric dtype, which previously made this branch unreachable (the
    # numeric path happened to produce the same result, but the intent is
    # clearer — and now tested — this way).
    if y_clean.dtype == bool:
        return y.astype(int).to_numpy(), True

    if pd.api.types.is_numeric_dtype(y_clean):
        pos = sorted(uniq)[-1]  # larger value => positive class
        return (y == pos).astype(int).to_numpy(), pos

    # Fallback for object/string labels: compare as strings.
    uniq_str = sorted([str(u) for u in uniq])
    pos = uniq_str[-1]
    return y.astype(str).eq(pos).astype(int).to_numpy(), pos
88
+
89
+
90
# ============================================================
# Training + persistence
# ============================================================
def train_and_save(df: pd.DataFrame):
    """Train the logistic-regression pipeline and persist it to disk.

    Validates the A..Z/AA schema, coerces dtypes, performs a stratified
    80/20 split, fits the pipeline, then writes ``model.joblib`` plus a
    ``meta.json`` describing the schema, positive class, and test metrics.

    Returns:
        (pipe, meta, X): the fitted pipeline, the metadata dict, and the
        full coerced feature frame (used as the SHAP background set).

    Raises:
        ValueError: from schema validation or label coercion.
    """
    df = validate_schema(df)

    X = df[FEATURE_COLS].copy()
    y_raw = df[LABEL_COL].copy()

    # Enforce the schema's dtype contract: A–M numeric (unparseable cells
    # become NaN and are handled by the imputer), N–Z string categoricals.
    for c in NUM_COLS:
        X[c] = pd.to_numeric(X[c], errors="coerce")
    for c in CAT_COLS:
        X[c] = X[c].astype("string")

    y01, pos_class = coerce_binary_label(y_raw)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y01, test_size=0.2, random_state=42, stratify=y01
    )

    pipe = build_pipeline()
    pipe.fit(X_train, y_train)

    # Hold-out evaluation at the conventional 0.5 threshold.
    proba = pipe.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)

    metrics = {
        "roc_auc": float(roc_auc_score(y_test, proba)),
        "accuracy@0.5": float(accuracy_score(y_test, pred)),
        "n_train": int(len(X_train)),
        "n_test": int(len(X_test)),
    }

    joblib.dump(pipe, "model.joblib")

    meta = {
        "framework": "LogiSHAP Studio",
        "model": "Logistic Regression",
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
        # (Python 3.12+) and produced a naive datetime.
        "created_at_utc": datetime.now(timezone.utc).isoformat(),
        "schema": {
            "features": FEATURE_COLS,
            "numeric": NUM_COLS,
            "categorical": CAT_COLS,
            "label": LABEL_COL,
        },
        "positive_class": str(pos_class),
        "metrics": metrics,
    }

    # Explicit encoding so the artifact is UTF-8 regardless of locale.
    with open("meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    return pipe, meta, X
143
+
144
+
145
# ============================================================
# SHAP
# ============================================================
def build_shap_explainer(pipe, X_bg, max_bg=200):
    """Build a SHAP LinearExplainer over the fitted pipeline's classifier.

    The background frame is deterministically subsampled to at most
    ``max_bg`` rows and pushed through the pipeline's preprocessor, so the
    explainer works in the same encoded feature space the model was fit on.
    """
    if len(X_bg) > max_bg:
        X_bg = X_bg.sample(max_bg, random_state=42)

    preprocessor = pipe.named_steps["preprocess"]
    model = pipe.named_steps["clf"]

    background = preprocessor.transform(X_bg)
    return shap.LinearExplainer(
        model, background, feature_perturbation="interventional"
    )
160
+
161
+
162
# ============================================================
# Streamlit UI
# ============================================================
# Top-level Streamlit script body: re-executes on every app rerun.
st.set_page_config(page_title="LogiSHAP Studio", layout="wide")

st.title("LogiSHAP Studio")
st.caption("Logistic Regression framework with SHAP explainability (A–Z features, AA label)")

# Surface the required upload format up front (expanded by default).
with st.expander("Required Excel format", expanded=True):
    st.markdown("""
    - **A–M** → Numeric variables
    - **N–Z** → Categorical variables
    - **AA** → Binary label (0/1, Yes/No, True/False)
    Column names **must be exactly A..Z and AA**
    """)

tab_train, tab_predict = st.tabs(["1️⃣ Train", "2️⃣ Predict + SHAP"])

# Persist the fitted pipeline and SHAP explainer across Streamlit reruns,
# so the Predict tab can use artifacts trained in the Train tab.
if "pipe" not in st.session_state:
    st.session_state.pipe = None
if "explainer" not in st.session_state:
    st.session_state.explainer = None
184
+
185
+
186
# ---------------- TRAIN ----------------
with tab_train:
    train_file = st.file_uploader("Upload training Excel (.xlsx)", type=["xlsx"])
    if train_file:
        # Preview the uploaded sheet before committing to a training run.
        df = pd.read_excel(train_file, engine="openpyxl")
        st.dataframe(df.head())

        if st.button("Train model"):
            with st.spinner("Training model..."):
                # Fit + persist the model, then build the SHAP explainer
                # from the returned background feature frame.
                pipe, meta, X_bg = train_and_save(df)
                explainer = build_shap_explainer(pipe, X_bg)

            # Cache both in session state for the Predict tab.
            st.session_state.pipe = pipe
            st.session_state.explainer = explainer

            st.success("Training complete. model.joblib and meta.json created.")
            # Show hold-out metrics computed inside train_and_save.
            m = meta["metrics"]
            c1, c2, c3, c4 = st.columns(4)
            c1.metric("ROC AUC", f"{m['roc_auc']:.3f}")
            c2.metric("Accuracy", f"{m['accuracy@0.5']:.3f}")
            c3.metric("Train N", m["n_train"])
            c4.metric("Test N", m["n_test"])
208
+
209
+
210
# ---------------- PREDICT ----------------
with tab_predict:
    if st.session_state.pipe is None:
        st.warning("Train a model first.")
    else:
        infer_file = st.file_uploader("Upload inference Excel (.xlsx)", type=["xlsx"])
        if infer_file:
            df_inf = pd.read_excel(infer_file, engine="openpyxl")
            # NOTE(review): no validate_schema() here — a sheet missing any
            # A..Z column raises a raw KeyError; consider validating first.
            X_inf = df_inf[FEATURE_COLS].copy()

            # Same dtype coercion as training: A–M numeric, N–Z string.
            for c in NUM_COLS:
                X_inf[c] = pd.to_numeric(X_inf[c], errors="coerce")
            for c in CAT_COLS:
                X_inf[c] = X_inf[c].astype("string")

            pipe = st.session_state.pipe
            proba = pipe.predict_proba(X_inf)[:, 1]  # P(positive class)

            # Attach predictions to the original (uncoerced) upload.
            df_out = df_inf.copy()
            df_out["predicted_probability"] = proba
            st.dataframe(df_out.head())

            st.download_button(
                "Download predictions",
                df_out.to_csv(index=False).encode(),
                "predictions.csv",
                "text/csv"
            )

            st.divider()
            st.subheader("SHAP explanation")

            # Pick a single row to explain (0-based index into the upload).
            # NOTE(review): an empty upload makes the max bound -1 — confirm
            # Streamlit's number_input behavior or guard len(X_inf) == 0.
            row = st.number_input("Row index", 0, len(X_inf)-1, 0)
            X_one = X_inf.iloc[[row]]

            # Explain in the encoded feature space the model was fit on.
            pre = pipe.named_steps["preprocess"]
            X_one_t = pre.transform(X_one)

            explainer = st.session_state.explainer
            shap_vals = explainer.shap_values(X_one_t)
            base = explainer.expected_value

            # Some SHAP versions return a per-class list; take class 1.
            if isinstance(shap_vals, list):
                shap_vals = shap_vals[1]

            # Human-readable names for the encoded columns, with a generic
            # f0..fN fallback if the transformer cannot provide them.
            try:
                names = list(pre.get_feature_names_out())
            except Exception:
                names = [f"f{i}" for i in range(len(shap_vals[0]))]

            # Densify the (possibly sparse) encoded row for plotting.
            try:
                x_dense = X_one_t.toarray()[0]
            except Exception:
                x_dense = np.array(X_one_t)[0]

            # NOTE(review): float(base) assumes expected_value is scalar —
            # some explainers return an array; confirm for this model.
            exp = shap.Explanation(
                values=shap_vals[0],
                base_values=float(base),
                data=x_dense,
                feature_names=names
            )

            # Side-by-side: waterfall (per-feature push from the base value)
            # and bar (absolute contribution magnitudes).
            c1, c2 = st.columns(2)
            with c1:
                fig = plt.figure()
                shap.plots.waterfall(exp, show=False)
                st.pyplot(fig)

            with c2:
                fig2 = plt.figure()
                shap.plots.bar(exp, show=False)
                st.pyplot(fig2)