irhamni committed on
Commit
3fa9e82
·
verified ·
1 Parent(s): 0eff424

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +394 -0
app.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, re, math, io
2
+ import numpy as np
3
+ import pandas as pd
4
+ import gradio as gr
5
+ from PIL import Image
6
+
7
+ from scipy.stats import chisquare
8
+ from sklearn.preprocessing import StandardScaler
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ import matplotlib.pyplot as plt
11
+
12
+
13
+ # ============================================================
14
+ # CONFIG
15
+ # ============================================================
16
# Path of the Excel workbook to audit; the IPLM_DATA_PATH environment
# variable overrides the default location.
DATA_PATH = os.getenv("IPLM_DATA_PATH", "data/IPLM_clean_manual_131225.xlsx")

# Column names that are excluded from every analysis.
EXCLUDE_COLS_EXACT = {
    "kontak_wa",
    "npp",
    "tanggal_kirim",
    "updated_at",
    "created_at",
}

# Benford's law: expected share of leading digit d is log10(1 + 1/d), d = 1..9.
BENFORD_P = np.array([math.log10(1 + 1 / d) for d in range(1, 10)])

# Regular expressions tested against lower-cased column names; a match means
# the column is not a candidate for the Benford check.
BENFORD_EXCLUDE_PATTERNS = [
    # identifiers
    r"\bid\b",
    r"\bid_",
    r"_id\b",
    # codes
    r"\bkode\b",
    r"\bcode\b",
    # registration numbers
    r"\bnpsn\b",
    r"\bnik\b",
    r"\bnpwp\b",
    # contact details
    r"\bkontak\b",
    r"\bwa\b",
    r"\bwhatsapp\b",
    r"\btelepon\b",
    r"\bphone\b",
    r"\bnohp\b",
    # dates
    r"\btanggal\b",
    r"\bdate\b",
    # times
    r"\bwaktu\b",
    r"\btime\b",
    r"\bjam\b",
    # record bookkeeping
    r"\bcreated\b",
    r"\bupdated\b",
    r"\bmodified\b",
    r"\bsubmit\b",
    r"\bkirim\b",
    r"\bmulai\b",
    r"\bselesai\b",
    r"\blastpage\b",
    r"\bpage\b",
    r"\bstatus\b",
    r"\bnpp\b",
]
35
+
36
+
37
+ # ============================================================
38
+ # HELPERS
39
+ # ============================================================
40
def canon(s: str) -> str:
    """Return ``str(s)`` lower-cased, keeping only the characters ``a-z`` and ``0-9``."""
    lowered = str(s).lower()
    kept = re.findall(r"[a-z0-9]", lowered)
    return "".join(kept)
42
+
43
def pick_col(df, candidates):
    """Find the column of *df* that best matches one of *candidates*.

    Names are compared after ``canon`` normalisation. An exact match is
    preferred, trying the candidates in the order given; failing that, the
    first column whose normalised name contains any normalised candidate is
    returned. Returns ``None`` when nothing matches.
    """
    columns = list(df.columns)
    wanted = [canon(cand) for cand in candidates]

    # Pass 1: exact match (if two columns normalise alike, the later one wins).
    by_key = {canon(col): col for col in columns}
    for key in wanted:
        if key in by_key:
            return by_key[key]

    # Pass 2: substring match, scanning columns in file order.
    for col in columns:
        col_key = canon(col)
        if any(key in col_key for key in wanted):
            return col
    return None
56
+
57
def detect_geo_cols(df):
    """Return ``(province_column, regency_column)`` as found by ``pick_col``.

    Either element is ``None`` when no matching column exists.
    """
    province_aliases = ["provinsi", "propinsi", "province"]
    regency_aliases = [
        "kab_kota", "kabkota", "kabupatenkota", "kabupaten/kota",
        "kabupaten", "kota", "regency", "city",
    ]
    province_column = pick_col(df, province_aliases)
    regency_column = pick_col(df, regency_aliases)
    return province_column, regency_column
61
+
62
def detect_kewenangan_col(df):
    """Return the authority-level ("kewenangan") column found by ``pick_col``, or ``None``."""
    aliases = [
        "kewenangan",
        "pu_level",
        "level_kewenangan",
        "kewenangan_pengelola",
        "kewenangan_perpustakaan",
        "level",
    ]
    return pick_col(df, aliases)
64
+
65
def load_excel(path):
    """Read the workbook at *path* and tidy its object-dtype columns.

    Each object column is cast to ``str``, non-breaking spaces become plain
    spaces, whitespace runs collapse to one space and the ends are stripped.
    Cells that then read "nan", "none", "null" (any case) or are empty are
    replaced by NaN.
    """
    frame = pd.read_excel(path, engine="openpyxl")
    placeholders = ["nan", "none", "null", ""]
    text_columns = [name for name in frame.columns if frame[name].dtype == object]

    for name in text_columns:
        text = frame[name].astype(str)
        text = text.str.replace("\u00a0", " ", regex=False)
        text = text.str.replace(r"\s+", " ", regex=True)
        frame[name] = text.str.strip()
        is_placeholder = frame[name].str.lower().isin(placeholders)
        frame.loc[is_placeholder, name] = np.nan
    return frame
75
+
76
def clean_str_list(values):
    """Return the distinct, stripped string forms of *values* in first-seen order.

    ``None`` entries, empty strings and the placeholders "nan", "none" and
    "null" (case-insensitive) are dropped.
    """
    placeholders = {"nan", "none", "null"}
    stripped = (str(item).strip() for item in values if item is not None)
    kept = [text for text in stripped if text and text.lower() not in placeholders]
    # dict keys preserve insertion order, which de-duplicates without reordering.
    return list(dict.fromkeys(kept))
92
+
93
def safe_numeric_cols(df, exclude=None, min_non_na=0.25):
    """Return the columns of *df* usable for completeness/zero/similarity checks.

    A column qualifies when, after ``pd.to_numeric(..., errors="coerce")``,
    the share of non-missing values is at least *min_non_na* and it has at
    least three distinct values.

    Args:
        df: DataFrame to inspect.
        exclude: Container of column names to skip; ``None`` means none.
        min_non_na: Minimum fraction of values that must be numeric.

    Columns whose ``canon`` name matches an entry of ``EXCLUDE_COLS_EXACT``
    are always skipped.
    """
    # A None sentinel replaces the former mutable default argument.
    skip = () if exclude is None else exclude
    hard = {canon(x) for x in EXCLUDE_COLS_EXACT}

    cols = []
    for c in df.columns:
        if c in skip or canon(c) in hard:
            continue
        s = pd.to_numeric(df[c], errors="coerce")
        if s.notna().mean() >= min_non_na and s.nunique(dropna=True) >= 3:
            cols.append(c)
    return cols
106
+
107
def is_benford_applicable(colname: str) -> bool:
    """Return whether the column *colname* is a candidate for the Benford check.

    A column is rejected when its ``canon`` name matches an entry of
    ``EXCLUDE_COLS_EXACT`` or when any ``BENFORD_EXCLUDE_PATTERNS`` regex
    matches its lower-cased name.
    """
    if canon(colname) in {canon(x) for x in EXCLUDE_COLS_EXACT}:
        return False
    name = str(colname).lower()
    # In a regex, "_" is a word character, so r"\btanggal\b" does not match
    # "tanggal_lahir". Also test a copy with every separator turned into a
    # space so snake_case names are treated like space-separated ones.
    spaced = re.sub(r"[^a-z0-9]+", " ", name)
    return not any(
        re.search(p, name) or re.search(p, spaced)
        for p in BENFORD_EXCLUDE_PATTERNS
    )
112
+
113
def leading_digit_series(x: pd.Series):
    """Return the first significant digit of every finite, non-zero value in *x*.

    Values are coerced with ``pd.to_numeric``; non-numeric, missing, infinite
    and zero entries are discarded. The result is an integer Series that keeps
    the index labels of the surviving entries, or ``None`` when nothing
    survives.
    """
    def _lead(value):
        magnitude = abs(float(value))
        if magnitude == 0:
            return np.nan
        # Scale values below 1 up until the first significant digit sits
        # in front of the decimal point.
        while magnitude < 1:
            magnitude *= 10
        text = str(magnitude).replace(".", "")
        return int(text[0])

    numeric = pd.to_numeric(x, errors="coerce")
    finite = numeric.replace([np.inf, -np.inf], np.nan).dropna()
    nonzero = finite[np.abs(finite) > 0]
    if nonzero.empty:
        return None

    digits = nonzero.apply(_lead).dropna().astype(int)
    return digits[digits.between(1, 9)]
130
+
131
def benford_stats(x: pd.Series, min_n=50):
    """Compare the leading digits of *x* with Benford's distribution.

    Returns ``None`` when fewer than *min_n* leading digits are available.
    Otherwise returns a dict with ``n`` (number of digits), ``p_value``
    (chi-square test against ``BENFORD_P``), ``mad`` (mean absolute deviation
    between observed and expected proportions) and ``obs`` (observed
    proportions for digits 1..9).
    """
    digits = leading_digit_series(x)
    if digits is None:
        return None
    n_digits = len(digits)
    if n_digits < min_n:
        return None

    # Digits are integers in 1..9, so slots 1..9 of the bincount are the tallies.
    counts = np.bincount(digits.to_numpy(), minlength=10)[1:10].astype(float)
    total = counts.sum()
    _, p_val = chisquare(f_obs=counts, f_exp=BENFORD_P * total)
    shares = counts / total
    deviation = np.mean(np.abs(shares - BENFORD_P))
    return {
        "n": int(n_digits),
        "p_value": float(p_val),
        "mad": float(deviation),
        "obs": shares,
    }
141
+
142
def benford_flag(mad):
    """Map a Benford mean-absolute-deviation to "OK", "WASPADA" or "RED FLAG"."""
    # (exclusive upper bound, label), checked from the strictest band up.
    bands = ((0.012, "OK"), (0.015, "WASPADA"))
    for upper, label in bands:
        if mad < upper:
            return label
    return "RED FLAG"
148
+
149
def fig_to_pil(fig):
    """Render the Matplotlib figure *fig* to an RGBA PIL image and close it.

    The figure is saved as a PNG (160 dpi, tight bounding box) into an
    in-memory buffer that is then opened with PIL.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
    finally:
        # Close the figure even when rendering fails; otherwise pyplot keeps
        # a reference and figures accumulate across requests.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf).convert("RGBA")
155
+
156
def benford_plot(obs_p):
    """Draw expected (Benford) and observed digit proportions as paired bars.

    *obs_p* holds the observed proportions for digits 1..9. Returns the chart
    as a PIL image via ``fig_to_pil``.
    """
    digits = np.arange(1, 10)
    bar_width = 0.4
    half_gap = 0.2

    figure, axes = plt.subplots(figsize=(7, 3))
    axes.bar(digits - half_gap, BENFORD_P, width=bar_width, label="Benford")
    axes.bar(digits + half_gap, obs_p, width=bar_width, label="Aktual")
    axes.set_xticks(digits)
    axes.set_xlabel("Digit pertama")
    axes.set_ylabel("Proporsi")
    axes.legend()
    return fig_to_pil(figure)
166
+
167
def scatter_plot(peer_agg, x_col, y_col):
    """Scatter column *y_col* against *x_col* of *peer_agg* and return a PIL image."""
    xs = peer_agg[x_col]
    ys = peer_agg[y_col]

    figure, axes = plt.subplots(figsize=(7, 3.5))
    axes.scatter(xs, ys, s=18)
    axes.set_xlabel(x_col)
    axes.set_ylabel(y_col)
    axes.set_title("Peer Scatter (2 kolom paling variatif)")
    return fig_to_pil(figure)
174
+
175
+
176
+ # ============================================================
177
+ # LOAD ONCE (GLOBAL)
178
+ # ============================================================
179
def _clean_label(series):
    """Return *series* as stripped strings with placeholder values set to NaN."""
    text = series.astype(str).str.strip()
    return text.mask(text.str.lower().isin(["nan", "none", "null", ""]))


# Stop at import time when the workbook is missing.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}. Taruh excel di repo: data/..., atau set env IPLM_DATA_PATH.")

df_raw = load_excel(DATA_PATH)
prov_col, kab_col = detect_geo_cols(df_raw)
kew_col = detect_kewenangan_col(df_raw)

if prov_col is None or kab_col is None:
    raise ValueError("Kolom provinsi/kab_kota tidak terdeteksi. Pastikan ada kolom provinsi dan kab_kota.")

# Working copy with normalised province / regency labels; rows lacking either
# label are dropped so that regions are never mixed.
df = df_raw.copy()
df["_prov_str"] = _clean_label(df[prov_col])
df["_kab_str"] = _clean_label(df[kab_col])
df = df.dropna(subset=["_prov_str", "_kab_str"]).copy()

# Columns that never take part in the numeric analysis.
_hard_keys = {canon(x) for x in EXCLUDE_COLS_EXACT}
hard_exclude_cols_in_file = {c for c in df.columns if canon(c) in _hard_keys}
exclude_base = {prov_col, kab_col, "_prov_str", "_kab_str"} | hard_exclude_cols_in_file

num_cols_all = safe_numeric_cols(df, exclude=exclude_base)
benford_cols = [c for c in num_cols_all if is_benford_applicable(c)]

PROVS = clean_str_list(df["_prov_str"].unique().tolist())

# Per-province cache of the peer aggregate used by the similarity check.
prov_cache_peer = {}
206
+
207
+
208
def kabs_for_prov(pv):
    """Return the distinct regency labels recorded for province *pv*."""
    in_province = df["_prov_str"] == pv
    labels = df.loc[in_province, "_kab_str"].unique().tolist()
    return clean_str_list(labels)
210
+
211
def kew_for(pv, kv):
    """Return the distinct authority values for province *pv* and regency *kv*.

    A one-element placeholder list is returned when no authority column was
    detected, or when the selection holds no usable value.
    """
    if not (kew_col and kew_col in df.columns):
        return ["(kewenangan tidak tersedia)"]

    selected = (df["_prov_str"] == pv) & (df["_kab_str"] == kv)
    raw_values = df.loc[selected, kew_col].dropna().unique().tolist()
    vals = clean_str_list(raw_values)
    return vals or ["(kewenangan kosong)"]
216
+
217
def get_peer_agg_for_prov(pv):
    """Return per-regency means of the numeric columns for province *pv*.

    The result has one row per regency, labelled in column ``kab_kota``, and
    is memoised in ``prov_cache_peer`` so each province is aggregated once.
    """
    cached = prov_cache_peer.get(pv)
    if cached is not None:
        return cached

    def _numeric_means(group):
        # Coerce every column to numbers before averaging.
        return group.apply(pd.to_numeric, errors="coerce").mean()

    rows = df[df["_prov_str"] == pv]
    means = rows.groupby("_kab_str")[num_cols_all].apply(_numeric_means)
    table = means.reset_index().rename(columns={"_kab_str": "kab_kota"})
    prov_cache_peer[pv] = table
    return table
226
+
227
+
228
+ # ============================================================
229
+ # CORE AUDIT FUNCTION (STRICT FILTER)
230
+ # ============================================================
231
def audit(pv, kv, kw):
    """Run the data-quality audit for one regency.

    Rows are filtered strictly by province *pv* and regency *kv*, and also by
    authority *kw* when an authority column exists and *kw* is a real value
    (placeholders start with "(").

    Returns:
        A 6-tuple ``(narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl)``:
        markdown summary, scorecard DataFrame, Benford table (at most 15 rows),
        Benford plot image or ``None``, peer scatter image or ``None``, and the
        peer-similarity table. Every return path yields six values because the
        caller unpacks six and the UI binds six outputs.
    """
    def _empty_result(message):
        # Same arity as the normal return so callers never fail to unpack.
        return (message, pd.DataFrame(), pd.DataFrame(), None, None, pd.DataFrame())

    # strict filter: prov + kab (+ kewenangan if available & chosen)
    dfx = df[(df["_prov_str"] == pv) & (df["_kab_str"] == kv)].copy()

    if kew_col and kew_col in dfx.columns and kw and not str(kw).startswith("("):
        dfx = dfx[dfx[kew_col].astype(str).str.strip() == kw].copy()

    if dfx.empty:
        return _empty_result("❌ Data kosong setelah filter (cek kewenangan / validitas label).")

    if not num_cols_all:
        return _empty_result("❌ Tidak ada kolom numerik yang cukup.")

    num_all = dfx[num_cols_all].apply(pd.to_numeric, errors="coerce")

    completeness = float(num_all.notna().mean().mean())
    zero_rate = float((num_all.fillna(0) == 0).mean().mean())

    # Benford: only applicable columns; remember the one with the largest MAD.
    best = None
    rows = []
    for c in benford_cols:
        st = benford_stats(num_all[c])
        if st:
            rows.append({"kolom": c, "n": st["n"], "MAD": st["mad"], "flag": benford_flag(st["mad"]), "p_value": st["p_value"]})
            if best is None or st["mad"] > best["mad"]:
                best = {"kolom": c, **st}
    ben_tbl = pd.DataFrame(rows).sort_values("MAD", ascending=False).head(15) if rows else pd.DataFrame()

    if best is None:
        ben_note = "Benford (applicable only): tidak ada kolom memenuhi syarat (butuh ≥50 non-zero)."
        ben_img = None
    else:
        ben_note = f"Benford strongest: {best['kolom']} | n={best['n']} | MAD={best['mad']:.4f} ({benford_flag(best['mad'])}) | p={best['p_value']:.3g}"
        ben_img = benford_plot(best["obs"])

    # Peers are the regencies of the same province only (no mixing).
    peer_agg = get_peer_agg_for_prov(pv)
    sim_tbl = pd.DataFrame()
    top_sim = None
    scat_img = None

    if peer_agg.shape[0] >= 3:
        peer_num = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0)

        # Similarity: cosine similarity between standardised peer profiles.
        Xs = StandardScaler().fit_transform(peer_num.to_numpy(float))
        sim = cosine_similarity(Xs)

        names = peer_agg["kab_kota"].astype(str).tolist()
        idx = names.index(kv) if kv in names else None

        if idx is not None:
            # Drop the target row explicitly: with ties (identical peers) or an
            # all-zero scaled row, the target is not guaranteed to sort first.
            ranked = np.argsort(-sim[idx], kind="stable")
            order = [j for j in ranked if j != idx][:10]
            sim_tbl = pd.DataFrame([
                {"kab_kota_pembanding": names[j], "cosine_similarity": float(sim[idx][j])}
                for j in order
            ])
            if not sim_tbl.empty:
                top_sim = float(sim_tbl["cosine_similarity"].max())

        # Scatter: the two peer columns with the largest variance.
        vars_ = peer_num.var(axis=0).sort_values(ascending=False)
        if len(vars_) >= 2 and vars_.iloc[0] > 0 and vars_.iloc[1] > 0:
            x_col, y_col = vars_.index[0], vars_.index[1]
            scat_img = scatter_plot(peer_agg, x_col, y_col)

    too_perfect = (completeness > 0.98) and (zero_rate < 0.02)

    scorecard = pd.DataFrame([
        ["Provinsi", pv, ""],
        ["Kab/Kota", kv, ""],
        ["Kewenangan", kw if kw else "NA", f"Sumber: {kew_col}" if (kew_col and not str(kw).startswith("(")) else "Kewenangan tidak tersedia/kosong."],
        ["Completeness (numeric)", f"{completeness:.2%}",
         "Kelengkapan tinggi; pastikan berasal dari validasi input (wajib isi) atau data administratif lengkap. Jika ada imputasi, dokumentasikan prosedurnya."],
        ["Zero-rate (numeric)", f"{zero_rate:.2%}",
         "Proporsi nol dipengaruhi jenis indikator. Nol wajar pada layanan/kegiatan; waspadai nol pada indikator kapasitas inti (koleksi/SDM/anggaran) tanpa bukti dukung."],
        ["Benford (applicable only)", "ADA" if best else "TIDAK", ben_note],
        ["Top similarity (peer)", f"{top_sim:.3f}" if top_sim is not None else "NA",
         "≥0.95 indikasi template/duplikasi. Nilai rendah biasanya lebih wajar (heterogen)."],
        ["Catatan pola", "WASPADA" if too_perfect else "Normal",
         "Jika WASPADA: cek bukti dukung, log input, dan konsistensi antar indikator sebelum agregasi indeks."]
    ], columns=["Komponen", "Nilai", "Catatan"])

    narasi = (
        f"**Filter aktif:** Provinsi = `{pv}` · Kab/Kota = `{kv}` · Kewenangan = `{kw}`\n\n"
        f"**EXCLUDE (no analysis):** `{', '.join(sorted(EXCLUDE_COLS_EXACT))}`\n\n"
        f"{ben_note}"
    )

    return narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl
330
+
331
+
332
+ # ============================================================
333
+ # GRADIO UI (DEPLOY READY)
334
+ # ============================================================
335
def ui_init():
    """Compute the initial dropdown state.

    Returns ``(province, regency, authority, regency_choices,
    authority_choices)``, selecting the first available entry at each level.
    """
    first_prov = PROVS[0] if PROVS else None
    kab_choices = kabs_for_prov(first_prov) if first_prov else []
    first_kab = kab_choices[0] if kab_choices else None

    if first_prov and first_kab:
        kew_choices = kew_for(first_prov, first_kab)
    else:
        kew_choices = ["(kewenangan tidak tersedia)"]
    first_kew = kew_choices[0] if kew_choices else None

    return first_prov, first_kab, first_kew, kab_choices, kew_choices
342
+
343
def on_prov_change(pv):
    """Rebuild the regency and authority dropdowns for province *pv*.

    Returns two ``gr.update`` objects (regency, authority), each selecting the
    first available choice.
    """
    kab_choices = kabs_for_prov(pv) if pv else []
    first_kab = kab_choices[0] if kab_choices else None

    if pv and first_kab:
        kew_choices = kew_for(pv, first_kab)
    else:
        kew_choices = ["(kewenangan tidak tersedia)"]
    first_kew = kew_choices[0] if kew_choices else None

    kab_update = gr.update(choices=kab_choices, value=first_kab)
    kew_update = gr.update(choices=kew_choices, value=first_kew)
    return kab_update, kew_update
349
+
350
def on_kab_change(pv, kv):
    """Rebuild the authority dropdown for province *pv* and regency *kv*."""
    if pv and kv:
        kew_choices = kew_for(pv, kv)
    else:
        kew_choices = ["(kewenangan tidak tersedia)"]
    first_kew = kew_choices[0] if kew_choices else None
    return gr.update(choices=kew_choices, value=first_kew)
354
+
355
def run_audit(pv, kv, kw):
    """Button callback: run ``audit`` and hand its six results to the UI.

    Output order: markdown, scorecard table, Benford table, Benford image,
    scatter image, similarity table.
    """
    summary_md, score_df, benford_df, benford_pic, scatter_pic, similarity_df = audit(pv, kv, kw)
    return (summary_md, score_df, benford_df, benford_pic, scatter_pic, similarity_df)
359
+
360
+
361
# Initial dropdown selections and choice lists.
pv0, kv0, kw0, kabs0, kews0 = ui_init()

# Page header: data source, excluded columns and the detected key columns.
_header_md = (
    "# IPLM — Audit Kualitas Data & Indikasi Data Tidak Wajar (Satu Wilayah)\n"
    f"- Sumber data: `{DATA_PATH}`\n"
    f"- EXCLUDE (no analysis): `{', '.join(sorted(EXCLUDE_COLS_EXACT))}`\n"
    f"- prov_col = `{prov_col}` · kab_col = `{kab_col}` · kewenangan_col = `{kew_col if kew_col else 'TIDAK ADA'}`\n"
    "---"
)

with gr.Blocks(title="IPLM Audit — Kualitas Data & Indikasi Tidak Wajar", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_header_md)

    # Cascading filters: province -> regency -> authority.
    with gr.Row():
        prov = gr.Dropdown(label="Provinsi", choices=PROVS, value=pv0)
        kab = gr.Dropdown(label="Kab/Kota", choices=kabs0, value=kv0)
        kew = gr.Dropdown(label="Kewenangan", choices=kews0, value=kw0)

    prov.change(on_prov_change, inputs=prov, outputs=[kab, kew], show_progress=False)
    kab.change(on_kab_change, inputs=[prov, kab], outputs=kew, show_progress=False)

    btn = gr.Button("Run Audit", variant="primary")

    # Result widgets, declared in display order.
    out_md = gr.Markdown()
    out_score = gr.Dataframe(label="Scorecard", interactive=False, wrap=True)
    out_ben_tbl = gr.Dataframe(label="Top Benford Signals (Applicable Only, max 15)", interactive=False, wrap=True)

    with gr.Row():
        out_ben_img = gr.Image(label="Benford Plot (Strongest Applicable Column)")
        out_scat_img = gr.Image(label="Peer Scatter (2 kolom paling variatif)")

    out_sim = gr.Dataframe(label="Top Similarity (se-Provinsi)", interactive=False, wrap=True)

    # The order here must match the tuple returned by run_audit.
    _audit_inputs = [prov, kab, kew]
    _audit_outputs = [out_md, out_score, out_ben_tbl, out_ben_img, out_scat_img, out_sim]
    btn.click(run_audit, inputs=_audit_inputs, outputs=_audit_outputs)

demo.queue().launch()