irhamni committed on
Commit
a53136f
·
verified ·
1 Parent(s): 1fd864a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +208 -51
app.py CHANGED
@@ -1,4 +1,21 @@
1
- import os, re, math, io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  import pandas as pd
4
  import gradio as gr
@@ -7,13 +24,25 @@ from PIL import Image
7
  from scipy.stats import chisquare
8
  from sklearn.preprocessing import StandardScaler
9
  from sklearn.metrics.pairwise import cosine_similarity
 
 
 
10
  import matplotlib.pyplot as plt
11
 
12
 
13
  # ============================================================
14
  # CONFIG
15
  # ============================================================
16
- DATA_PATH = os.getenv("IPLM_DATA_PATH", "IPLM_clean_manual_131225.xlsx")
 
 
 
 
 
 
 
 
 
17
 
18
  EXCLUDE_COLS_EXACT = {"kontak_wa", "npp", "tanggal_kirim", "updated_at", "created_at"}
19
 
@@ -40,6 +69,14 @@ BENFORD_EXCLUDE_PATTERNS = [
40
  def canon(s: str) -> str:
41
  return re.sub(r"[^a-z0-9]+", "", str(s).lower())
42
 
 
 
 
 
 
 
 
 
43
  def pick_col(df, candidates):
44
  cols = list(df.columns)
45
  cc = {canon(c): c for c in cols}
@@ -56,21 +93,19 @@ def pick_col(df, candidates):
56
 
57
  def detect_geo_cols(df):
58
  prov = pick_col(df, ["provinsi", "propinsi", "province"])
59
- kab = pick_col(df, ["kab_kota", "kabkota", "kabupatenkota", "kabupaten/kota", "kabupaten", "kota", "regency", "city"])
 
60
  return prov, kab
61
 
62
  def detect_kewenangan_col(df):
63
- return pick_col(df, ["kewenangan", "pu_level", "level_kewenangan", "kewenangan_pengelola", "kewenangan_perpustakaan", "level"])
 
64
 
65
  def load_excel(path):
66
  df = pd.read_excel(path, engine="openpyxl")
67
  for c in df.columns:
68
  if df[c].dtype == object:
69
- df[c] = (df[c].astype(str)
70
- .str.replace("\u00a0", " ", regex=False)
71
- .str.replace(r"\s+", " ", regex=True)
72
- .str.strip())
73
- df.loc[df[c].str.lower().isin(["nan", "none", "null", ""]), c] = np.nan
74
  return df
75
 
76
  def clean_str_list(values):
@@ -91,7 +126,6 @@ def clean_str_list(values):
91
  return uniq
92
 
93
  def safe_numeric_cols(df, exclude=set(), min_non_na=0.25):
94
- """Numeric cols used for completeness/zero/similarity. Hard-exclude columns by exact name."""
95
  hard = {canon(x) for x in EXCLUDE_COLS_EXACT}
96
  cols = []
97
  for c in df.columns:
@@ -174,10 +208,14 @@ def scatter_plot(peer_agg, x_col, y_col):
174
 
175
 
176
  # ============================================================
177
- # LOAD ONCE (GLOBAL)
178
  # ============================================================
179
  if not os.path.exists(DATA_PATH):
180
- raise FileNotFoundError(f"Data file not found: {DATA_PATH}. Taruh excel di repo: data/..., atau set env IPLM_DATA_PATH.")
 
 
 
 
181
 
182
  df_raw = load_excel(DATA_PATH)
183
  prov_col, kab_col = detect_geo_cols(df_raw)
@@ -187,11 +225,9 @@ if prov_col is None or kab_col is None:
187
  raise ValueError("Kolom provinsi/kab_kota tidak terdeteksi. Pastikan ada kolom provinsi dan kab_kota.")
188
 
189
  df = df_raw.copy()
190
- df["_prov_str"] = df[prov_col].astype(str).str.strip()
191
- df["_kab_str"] = df[kab_col].astype(str).str.strip()
192
- df.loc[df["_prov_str"].str.lower().isin(["nan", "none", "null", ""]), "_prov_str"] = np.nan
193
- df.loc[df["_kab_str"].str.lower().isin(["nan", "none", "null", ""]), "_kab_str"] = np.nan
194
- df = df[df["_prov_str"].notna() & df["_kab_str"].notna()].copy() # penting supaya tidak "campur"
195
 
196
  exclude_base = {prov_col, kab_col, "_prov_str", "_kab_str"}
197
  hard_exclude_cols_in_file = {c for c in df.columns if canon(c) in {canon(x) for x in EXCLUDE_COLS_EXACT}}
@@ -201,58 +237,69 @@ num_cols_all = safe_numeric_cols(df, exclude=exclude_base)
201
  benford_cols = [c for c in num_cols_all if is_benford_applicable(c)]
202
 
203
  PROVS = clean_str_list(df["_prov_str"].unique().tolist())
 
 
204
 
205
- prov_cache_peer = {} # cache per prov for similarity
206
 
207
 
208
  def kabs_for_prov(pv):
 
 
209
  return clean_str_list(df.loc[df["_prov_str"] == pv, "_kab_str"].unique().tolist())
210
 
211
  def kew_for(pv, kv):
212
  if not kew_col or kew_col not in df.columns:
213
  return ["(kewenangan tidak tersedia)"]
214
- vals = clean_str_list(df.loc[(df["_prov_str"] == pv) & (df["_kab_str"] == kv), kew_col].dropna().unique().tolist())
 
 
 
 
 
215
  return vals if vals else ["(kewenangan kosong)"]
216
 
217
  def get_peer_agg_for_prov(pv):
218
  if pv in prov_cache_peer:
219
  return prov_cache_peer[pv]
220
  peer = df[df["_prov_str"] == pv]
221
- peer_agg = peer.groupby("_kab_str")[num_cols_all].apply(
222
- lambda g: g.apply(pd.to_numeric, errors="coerce").mean()
223
- ).reset_index().rename(columns={"_kab_str": "kab_kota"})
 
 
 
 
 
 
 
 
224
  prov_cache_peer[pv] = peer_agg
225
  return peer_agg
226
 
227
 
228
  # ============================================================
229
- # CORE AUDIT FUNCTION (STRICT FILTER)
230
  # ============================================================
231
  def audit(pv, kv, kw):
232
- # strict filter: prov + kab (+ kewenangan if available & chosen)
233
- dfx = df[(df["_prov_str"] == pv) & (df["_kab_str"] == kv)].copy()
234
 
235
- if kew_col and kew_col in dfx.columns and kw and not kw.startswith("("):
236
- dfx = dfx[dfx[kew_col].astype(str).str.strip() == kw].copy()
 
237
 
238
  if dfx.empty:
239
- return (
240
- "❌ Data kosong setelah filter (cek kewenangan / validitas label).",
241
- pd.DataFrame(),
242
- pd.DataFrame(),
243
- None,
244
- None
245
- )
246
 
247
  if not num_cols_all:
248
- return ("❌ Tidak ada kolom numerik yang cukup.", pd.DataFrame(), pd.DataFrame(), None, None)
249
 
250
  num_all = dfx[num_cols_all].apply(pd.to_numeric, errors="coerce")
251
-
252
  completeness = float(num_all.notna().mean().mean())
253
  zero_rate = float((num_all.fillna(0) == 0).mean().mean())
254
 
255
- # Benford (applicable only, already excluded hard cols)
256
  best = None
257
  rows = []
258
  for c in benford_cols:
@@ -270,12 +317,12 @@ def audit(pv, kv, kw):
270
  ben_note = f"Benford strongest: {best['kolom']} | n={best['n']} | MAD={best['mad']:.4f} ({benford_flag(best['mad'])}) | p={best['p_value']:.3g}"
271
  ben_img = benford_plot(best["obs"])
272
 
273
- # Similarity (peer se-provinsi) => strict prov only (no mixing)
274
  peer_agg = get_peer_agg_for_prov(pv)
275
  sim_tbl = pd.DataFrame()
276
  top_sim = None
277
 
278
- if peer_agg.shape[0] >= 3:
279
  X = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0).to_numpy(float)
280
  Xs = StandardScaler().fit_transform(X)
281
  sim = cosine_similarity(Xs)
@@ -295,20 +342,19 @@ def audit(pv, kv, kw):
295
  if not sim_tbl.empty:
296
  top_sim = float(sim_tbl["cosine_similarity"].max())
297
 
298
- # scatter
299
  scat_img = None
300
- if peer_agg.shape[0] >= 3:
301
  vars_ = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0).var(axis=0).sort_values(ascending=False)
302
  if len(vars_) >= 2 and vars_.iloc[0] > 0 and vars_.iloc[1] > 0:
303
- x_col, y_col = vars_.index[0], vars_.index[1]
304
- scat_img = scatter_plot(peer_agg, x_col, y_col)
305
 
306
  too_perfect = (completeness > 0.98) and (zero_rate < 0.02)
307
 
308
  scorecard = pd.DataFrame([
309
  ["Provinsi", pv, ""],
310
  ["Kab/Kota", kv, ""],
311
- ["Kewenangan", kw if kw else "NA", f"Sumber: {kew_col}" if (kew_col and not str(kw).startswith("(")) else "Kewenangan tidak tersedia/kosong."],
 
312
  ["Completeness (numeric)", f"{completeness:.2%}",
313
  "Kelengkapan tinggi; pastikan berasal dari validasi input (wajib isi) atau data administratif lengkap. Jika ada imputasi, dokumentasikan prosedurnya."],
314
  ["Zero-rate (numeric)", f"{zero_rate:.2%}",
@@ -330,7 +376,92 @@ def audit(pv, kv, kw):
330
 
331
 
332
  # ============================================================
333
- # GRADIO UI (DEPLOY READY)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  # ============================================================
335
  def ui_init():
336
  pv = PROVS[0] if PROVS else None
@@ -352,10 +483,13 @@ def on_kab_change(pv, kv):
352
  kw = kews[0] if kews else None
353
  return gr.update(choices=kews, value=kw)
354
 
 
355
  def run_audit(pv, kv, kw):
356
  narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl = audit(pv, kv, kw)
357
- # Return order: markdown, scorecard df, benford df, benford img, scatter img, sim df
358
- return narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl
 
 
359
 
360
 
361
  pv0, kv0, kw0, kabs0, kews0 = ui_init()
@@ -366,7 +500,8 @@ with gr.Blocks(title="IPLM Audit — Kualitas Data & Indikasi Tidak Wajar", them
366
  f"- Sumber data: `{DATA_PATH}`\n"
367
  f"- EXCLUDE (no analysis): `{', '.join(sorted(EXCLUDE_COLS_EXACT))}`\n"
368
  f"- prov_col = `{prov_col}` · kab_col = `{kab_col}` · kewenangan_col = `{kew_col if kew_col else 'TIDAK ADA'}`\n"
369
- "---"
 
370
  )
371
 
372
  with gr.Row():
@@ -377,7 +512,9 @@ with gr.Blocks(title="IPLM Audit — Kualitas Data & Indikasi Tidak Wajar", them
377
  prov.change(on_prov_change, inputs=prov, outputs=[kab, kew], show_progress=False)
378
  kab.change(on_kab_change, inputs=[prov, kab], outputs=kew, show_progress=False)
379
 
380
- btn = gr.Button("Run Audit", variant="primary")
 
 
381
 
382
  out_md = gr.Markdown()
383
  out_score = gr.Dataframe(label="Scorecard", interactive=False, wrap=True)
@@ -389,6 +526,26 @@ with gr.Blocks(title="IPLM Audit — Kualitas Data & Indikasi Tidak Wajar", them
389
 
390
  out_sim = gr.Dataframe(label="Top Similarity (se-Provinsi)", interactive=False, wrap=True)
391
 
392
- btn.click(run_audit, inputs=[prov, kab, kew], outputs=[out_md, out_score, out_ben_tbl, out_ben_img, out_scat_img, out_sim])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
 
394
- demo.queue().launch()
 
1
+ # ============================================================
2
+ # IPLM Audit — HF Spaces (Gradio) — + LLM Analysis (Optional)
3
+ # - Scorecard + Benford + Similarity + Scatter
4
+ # - Tambahan: LLM narasi untuk Scorecard (teknokratis)
5
+ # ============================================================
6
+
7
+ import os
8
+
9
+ # ---- CRASH FIX (HF Spaces Exit 139 / SIGSEGV) ----
10
+ os.environ["OMP_NUM_THREADS"] = "1"
11
+ os.environ["OPENBLAS_NUM_THREADS"] = "1"
12
+ os.environ["MKL_NUM_THREADS"] = "1"
13
+ os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
14
+ os.environ["NUMEXPR_NUM_THREADS"] = "1"
15
+ os.environ["MPLBACKEND"] = "Agg"
16
+ os.environ["PYTHONUNBUFFERED"] = "1"
17
+
18
+ import re, math, io, json, textwrap
19
  import numpy as np
20
  import pandas as pd
21
  import gradio as gr
 
24
  from scipy.stats import chisquare
25
  from sklearn.preprocessing import StandardScaler
26
  from sklearn.metrics.pairwise import cosine_similarity
27
+
28
+ import matplotlib
29
+ matplotlib.use("Agg")
30
  import matplotlib.pyplot as plt
31
 
32
 
33
  # ============================================================
34
  # CONFIG
35
  # ============================================================
36
+ DATA_PATH = os.getenv("IPLM_DATA_PATH", "data/IPLM_clean_manual_131225.xlsx")
37
+
38
+ # LLM provider optional:
39
+ # - OpenAI: set OPENAI_API_KEY (+ optional OPENAI_MODEL)
40
+ # - Gemini: set GEMINI_API_KEY (+ optional GEMINI_MODEL)
41
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
42
+ OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini") # aman & murah (ubah bebas)
43
+
44
+ GEMINI_API_KEY = os.getenv("HF_TOKEN_DQ", "").strip()
45
+ GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-1.5-flash")
46
 
47
  EXCLUDE_COLS_EXACT = {"kontak_wa", "npp", "tanggal_kirim", "updated_at", "created_at"}
48
 
 
69
  def canon(s: str) -> str:
70
  return re.sub(r"[^a-z0-9]+", "", str(s).lower())
71
 
72
def clean_text_col(s: pd.Series) -> pd.Series:
    """Normalize a text column: collapse whitespace and map null-like tokens to NaN.

    Non-breaking spaces become ordinary spaces, runs of whitespace are
    collapsed to one space, values are stripped, and the tokens
    "nan"/"none"/"null"/"" (case-insensitive) are replaced with np.nan.
    """
    cleaned = (
        s.astype(str)
         .str.replace("\u00a0", " ", regex=False)
         .str.replace(r"\s+", " ", regex=True)
         .str.strip()
    )
    null_like = cleaned.str.lower().isin(["nan", "none", "null", ""])
    return cleaned.mask(null_like, np.nan)
79
+
80
  def pick_col(df, candidates):
81
  cols = list(df.columns)
82
  cc = {canon(c): c for c in cols}
 
93
 
94
def detect_geo_cols(df):
    """Locate the province and regency/city columns; returns (prov, kab), either may be None."""
    province_candidates = ["provinsi", "propinsi", "province"]
    regency_candidates = ["kab_kota", "kabkota", "kabupatenkota", "kabupaten/kota",
                          "kabupaten", "kota", "regency", "city"]
    return pick_col(df, province_candidates), pick_col(df, regency_candidates)
99
 
100
def detect_kewenangan_col(df):
    """Find the column holding the managing-authority ('kewenangan') label, or None."""
    candidates = ["kewenangan", "pu_level", "level_kewenangan",
                  "kewenangan_pengelola", "kewenangan_perpustakaan", "level"]
    return pick_col(df, candidates)
103
 
104
def load_excel(path):
    """Read an Excel workbook and normalize every object-dtype column via clean_text_col."""
    frame = pd.read_excel(path, engine="openpyxl")
    text_columns = [col for col in frame.columns if frame[col].dtype == object]
    for col in text_columns:
        frame[col] = clean_text_col(frame[col])
    return frame
110
 
111
  def clean_str_list(values):
 
126
  return uniq
127
 
128
  def safe_numeric_cols(df, exclude=set(), min_non_na=0.25):
 
129
  hard = {canon(x) for x in EXCLUDE_COLS_EXACT}
130
  cols = []
131
  for c in df.columns:
 
208
 
209
 
210
  # ============================================================
211
+ # LOAD DATA (GLOBAL)
212
  # ============================================================
213
  if not os.path.exists(DATA_PATH):
214
+ raise FileNotFoundError(
215
+ f"Data file not found: {DATA_PATH}\n"
216
+ "Taruh file excel di repo: data/IPLM_clean_manual_131225.xlsx\n"
217
+ "atau set env variable IPLM_DATA_PATH."
218
+ )
219
 
220
  df_raw = load_excel(DATA_PATH)
221
  prov_col, kab_col = detect_geo_cols(df_raw)
 
225
  raise ValueError("Kolom provinsi/kab_kota tidak terdeteksi. Pastikan ada kolom provinsi dan kab_kota.")
226
 
227
  df = df_raw.copy()
228
+ df["_prov_str"] = clean_text_col(df[prov_col])
229
+ df["_kab_str"] = clean_text_col(df[kab_col])
230
+ df = df[df["_prov_str"].notna() & df["_kab_str"].notna()].copy() # cegah mixing
 
 
231
 
232
  exclude_base = {prov_col, kab_col, "_prov_str", "_kab_str"}
233
  hard_exclude_cols_in_file = {c for c in df.columns if canon(c) in {canon(x) for x in EXCLUDE_COLS_EXACT}}
 
237
  benford_cols = [c for c in num_cols_all if is_benford_applicable(c)]
238
 
239
  PROVS = clean_str_list(df["_prov_str"].unique().tolist())
240
+ if not PROVS:
241
+ raise ValueError("Tidak ada nilai provinsi yang valid setelah cleaning.")
242
 
243
+ prov_cache_peer = {} # cache peer per prov
244
 
245
 
246
def kabs_for_prov(pv):
    """Distinct, cleaned kab/kota names recorded for province `pv` ([] when pv is None)."""
    if pv is None:
        return []
    in_province = df["_prov_str"] == pv
    return clean_str_list(df.loc[in_province, "_kab_str"].unique().tolist())
250
 
251
def kew_for(pv, kv):
    """Kewenangan choices for a (province, kab/kota) pair, or a placeholder label."""
    if not kew_col or kew_col not in df.columns:
        return ["(kewenangan tidak tersedia)"]
    if pv is None or kv is None:
        return ["(pilih provinsi & kab/kota)"]
    selector = (df["_prov_str"] == pv) & (df["_kab_str"] == kv)
    raw_values = df.loc[selector, kew_col].dropna().unique().tolist()
    vals = clean_str_list(raw_values)
    return vals if vals else ["(kewenangan kosong)"]
261
 
262
def get_peer_agg_for_prov(pv):
    """Per-kab/kota means of the numeric columns within province `pv`, memoized in prov_cache_peer."""
    if pv in prov_cache_peer:
        return prov_cache_peer[pv]

    peer = df[df["_prov_str"] == pv]
    if peer.empty:
        # Empty frame with the expected key column so the downstream
        # `.shape[0] >= 3` guards short-circuit cleanly.
        empty_agg = pd.DataFrame({"kab_kota": []})
        prov_cache_peer[pv] = empty_agg
        return empty_agg

    grouped = peer.groupby("_kab_str")[num_cols_all]
    peer_agg = grouped.apply(lambda g: g.apply(pd.to_numeric, errors="coerce").mean())
    peer_agg = peer_agg.reset_index().rename(columns={"_kab_str": "kab_kota"})
    prov_cache_peer[pv] = peer_agg
    return peer_agg
279
 
280
 
281
  # ============================================================
282
+ # CORE AUDIT
283
  # ============================================================
284
  def audit(pv, kv, kw):
285
+ if pv is None or kv is None:
286
+ return " Pilih provinsi dan kab/kota.", pd.DataFrame(), pd.DataFrame(), None, None, pd.DataFrame()
287
 
288
+ dfx = df[(df["_prov_str"] == pv) & (df["_kab_str"] == kv)].copy()
289
+ if kew_col and kew_col in dfx.columns and kw and not str(kw).startswith("("):
290
+ dfx = dfx[dfx[kew_col].astype(str).str.strip() == str(kw).strip()].copy()
291
 
292
  if dfx.empty:
293
+ return "❌ Data kosong setelah filter (cek kewenangan/label).", pd.DataFrame(), pd.DataFrame(), None, None, pd.DataFrame()
 
 
 
 
 
 
294
 
295
  if not num_cols_all:
296
+ return "❌ Tidak ada kolom numerik yang cukup.", pd.DataFrame(), pd.DataFrame(), None, None, pd.DataFrame()
297
 
298
  num_all = dfx[num_cols_all].apply(pd.to_numeric, errors="coerce")
 
299
  completeness = float(num_all.notna().mean().mean())
300
  zero_rate = float((num_all.fillna(0) == 0).mean().mean())
301
 
302
+ # Benford
303
  best = None
304
  rows = []
305
  for c in benford_cols:
 
317
  ben_note = f"Benford strongest: {best['kolom']} | n={best['n']} | MAD={best['mad']:.4f} ({benford_flag(best['mad'])}) | p={best['p_value']:.3g}"
318
  ben_img = benford_plot(best["obs"])
319
 
320
+ # Similarity within prov
321
  peer_agg = get_peer_agg_for_prov(pv)
322
  sim_tbl = pd.DataFrame()
323
  top_sim = None
324
 
325
+ if not peer_agg.empty and peer_agg.shape[0] >= 3:
326
  X = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0).to_numpy(float)
327
  Xs = StandardScaler().fit_transform(X)
328
  sim = cosine_similarity(Xs)
 
342
  if not sim_tbl.empty:
343
  top_sim = float(sim_tbl["cosine_similarity"].max())
344
 
 
345
  scat_img = None
346
+ if not peer_agg.empty and peer_agg.shape[0] >= 3:
347
  vars_ = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0).var(axis=0).sort_values(ascending=False)
348
  if len(vars_) >= 2 and vars_.iloc[0] > 0 and vars_.iloc[1] > 0:
349
+ scat_img = scatter_plot(peer_agg, vars_.index[0], vars_.index[1])
 
350
 
351
  too_perfect = (completeness > 0.98) and (zero_rate < 0.02)
352
 
353
  scorecard = pd.DataFrame([
354
  ["Provinsi", pv, ""],
355
  ["Kab/Kota", kv, ""],
356
+ ["Kewenangan", kw if kw else "NA",
357
+ f"Sumber: {kew_col}" if (kew_col and not str(kw).startswith("(")) else "Kewenangan tidak tersedia/kosong."],
358
  ["Completeness (numeric)", f"{completeness:.2%}",
359
  "Kelengkapan tinggi; pastikan berasal dari validasi input (wajib isi) atau data administratif lengkap. Jika ada imputasi, dokumentasikan prosedurnya."],
360
  ["Zero-rate (numeric)", f"{zero_rate:.2%}",
 
376
 
377
 
378
  # ============================================================
379
+ # LLM (OPTIONAL)
380
+ # ============================================================
381
def _llm_available():
    """True when at least one LLM provider key (OpenAI or Gemini) is configured."""
    return any((OPENAI_API_KEY, GEMINI_API_KEY))
383
+
384
def llm_analyze_scorecard(pv, kv, kw, scorecard_df, ben_tbl, sim_tbl):
    """
    Return a policy-style narrative in Indonesian.
    Works if OPENAI_API_KEY or GEMINI_API_KEY is set.

    Tries OpenAI first when its key is configured, falling back to Gemini
    when OpenAI fails and a Gemini key is present. Returns a Markdown
    string in every case (success text or an error/notice message);
    never raises.
    """
    # No audit has been run yet — nothing to narrate.
    if scorecard_df is None or len(scorecard_df) == 0:
        return "❌ Scorecard kosong. Jalankan audit dulu."

    # No provider key configured: tell the operator how to enable the feature.
    if not _llm_available():
        return (
            "⚠️ **LLM belum aktif** karena API key belum diset.\n\n"
            "Set salah satu environment variable di HuggingFace Space:\n"
            "- `OPENAI_API_KEY` (opsional `OPENAI_MODEL`)\n"
            "- atau `GEMINI_API_KEY` (opsional `GEMINI_MODEL`)\n"
        )

    # Compact JSON-able snapshot of the audit results for the prompt.
    # ben_tbl / sim_tbl may be None or non-DataFrame, hence the isinstance guards.
    payload = {
        "provinsi": pv,
        "kab_kota": kv,
        "kewenangan": kw,
        "scorecard": scorecard_df.to_dict(orient="records"),
        "top_benford_signals": ben_tbl.head(10).to_dict(orient="records") if isinstance(ben_tbl, pd.DataFrame) else [],
        "top_similarity": sim_tbl.head(10).to_dict(orient="records") if isinstance(sim_tbl, pd.DataFrame) else []
    }

    # System prompt: fixes the persona (public-policy analyst / data auditor)
    # and the technocratic Indonesian writing style.
    system = (
        "Anda adalah analis kebijakan publik dan auditor kualitas data untuk indikator pembangunan literasi/perpustakaan. "
        "Tulis analisis ringkas namun tajam, berbahasa Indonesia teknokratis (gaya Perpusnas/pemerintah). "
        "Fokus pada interpretasi scorecard: kelengkapan, zero-rate, Benford, similarity, implikasi risiko, dan rekomendasi tindak lanjut "
        "(cek bukti dukung, metadata, log input, sampling/validasi). Hindari asumsi yang tidak ada di data."
    )

    # User prompt: mandated output structure plus the audit data as JSON.
    user = (
        "Buatkan ANALISIS NARATIF untuk hasil audit berikut. Struktur wajib:\n"
        "1) Ringkasan status (1 paragraf)\n"
        "2) Interpretasi tiap komponen (bullet)\n"
        "3) Rekomendasi tindak lanjut prioritas (maks 6 bullet)\n\n"
        f"DATA (JSON):\n{json.dumps(payload, ensure_ascii=False)}"
    )

    # ---- Try OpenAI first if key exists ----
    if OPENAI_API_KEY:
        try:
            # Imported lazily so the app runs without the openai package installed.
            from openai import OpenAI
            client = OpenAI(api_key=OPENAI_API_KEY)
            resp = client.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                temperature=0.2,
                max_tokens=700,
            )
            return resp.choices[0].message.content.strip()
        except Exception as e:
            err = str(e)
            # fallback to gemini if available
            if not GEMINI_API_KEY:
                return f"❌ LLM error (OpenAI): {err}"
            # else continue to Gemini

    # ---- Gemini fallback ----
    if GEMINI_API_KEY:
        try:
            # Imported lazily; google-genai is optional at runtime.
            from google import genai
            client = genai.Client(api_key=GEMINI_API_KEY)
            resp = client.models.generate_content(
                model=GEMINI_MODEL,
                contents=f"{system}\n\n{user}",
            )
            # resp.text may be absent/None on blocked or empty responses.
            text = getattr(resp, "text", None)
            return (text or "").strip() or "❌ Gemini tidak mengembalikan teks."
        except Exception as e:
            return f"❌ LLM error (Gemini): {e}"

    # Unreachable when _llm_available() was True, kept as a defensive default.
    return "❌ LLM tidak tersedia."
461
+
462
+
463
+ # ============================================================
464
+ # GRADIO UI
465
  # ============================================================
466
  def ui_init():
467
  pv = PROVS[0] if PROVS else None
 
483
  kw = kews[0] if kews else None
484
  return gr.update(choices=kews, value=kw)
485
 
486
# state holder for last outputs (to feed LLM button without recompute)
def run_audit(pv, kv, kw):
    """Run the audit and echo the three tables again at the end for the gr.State holders."""
    narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl = audit(pv, kv, kw)
    visible_outputs = (narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl)
    state_outputs = (scorecard, ben_tbl, sim_tbl)
    return visible_outputs + state_outputs
490
+
491
def run_llm(pv, kv, kw, scorecard_df, ben_tbl, sim_tbl):
    """Thin UI wrapper that forwards the cached audit tables to llm_analyze_scorecard."""
    return llm_analyze_scorecard(pv, kv, kw, scorecard_df, ben_tbl, sim_tbl)
493
 
494
 
495
  pv0, kv0, kw0, kabs0, kews0 = ui_init()
 
500
  f"- Sumber data: `{DATA_PATH}`\n"
501
  f"- EXCLUDE (no analysis): `{', '.join(sorted(EXCLUDE_COLS_EXACT))}`\n"
502
  f"- prov_col = `{prov_col}` · kab_col = `{kab_col}` · kewenangan_col = `{kew_col if kew_col else 'TIDAK ADA'}`\n"
503
+ "---\n"
504
+ "**LLM Analysis (opsional):** set `OPENAI_API_KEY` atau `GEMINI_API_KEY` di Space Variables."
505
  )
506
 
507
  with gr.Row():
 
512
  prov.change(on_prov_change, inputs=prov, outputs=[kab, kew], show_progress=False)
513
  kab.change(on_kab_change, inputs=[prov, kab], outputs=kew, show_progress=False)
514
 
515
+ with gr.Row():
516
+ btn = gr.Button("Run Audit", variant="primary")
517
+ btn_llm = gr.Button("Generate LLM Analysis", variant="secondary")
518
 
519
  out_md = gr.Markdown()
520
  out_score = gr.Dataframe(label="Scorecard", interactive=False, wrap=True)
 
526
 
527
  out_sim = gr.Dataframe(label="Top Similarity (se-Provinsi)", interactive=False, wrap=True)
528
 
529
+ gr.Markdown("## Analisis Naratif (LLM)")
530
+ out_llm = gr.Markdown()
531
+
532
+ # hidden states: store last tables for llm button
533
+ st_score = gr.State(pd.DataFrame())
534
+ st_ben = gr.State(pd.DataFrame())
535
+ st_sim = gr.State(pd.DataFrame())
536
+
537
+ btn.click(
538
+ run_audit,
539
+ inputs=[prov, kab, kew],
540
+ outputs=[out_md, out_score, out_ben_tbl, out_ben_img, out_scat_img, out_sim, st_score, st_ben, st_sim],
541
+ show_progress=False
542
+ )
543
+
544
+ btn_llm.click(
545
+ run_llm,
546
+ inputs=[prov, kab, kew, st_score, st_ben, st_sim],
547
+ outputs=[out_llm],
548
+ show_progress=True
549
+ )
550
 
551
+ demo.launch()