Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,21 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import pandas as pd
|
| 4 |
import gradio as gr
|
|
@@ -7,13 +24,25 @@ from PIL import Image
|
|
| 7 |
from scipy.stats import chisquare
|
| 8 |
from sklearn.preprocessing import StandardScaler
|
| 9 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
|
|
|
| 10 |
import matplotlib.pyplot as plt
|
| 11 |
|
| 12 |
|
| 13 |
# ============================================================
|
| 14 |
# CONFIG
|
| 15 |
# ============================================================
|
| 16 |
-
DATA_PATH = os.getenv("IPLM_DATA_PATH", "IPLM_clean_manual_131225.xlsx")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
EXCLUDE_COLS_EXACT = {"kontak_wa", "npp", "tanggal_kirim", "updated_at", "created_at"}
|
| 19 |
|
|
@@ -40,6 +69,14 @@ BENFORD_EXCLUDE_PATTERNS = [
|
|
| 40 |
def canon(s: str) -> str:
|
| 41 |
return re.sub(r"[^a-z0-9]+", "", str(s).lower())
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
def pick_col(df, candidates):
|
| 44 |
cols = list(df.columns)
|
| 45 |
cc = {canon(c): c for c in cols}
|
|
@@ -56,21 +93,19 @@ def pick_col(df, candidates):
|
|
| 56 |
|
| 57 |
def detect_geo_cols(df):
|
| 58 |
prov = pick_col(df, ["provinsi", "propinsi", "province"])
|
| 59 |
-
kab = pick_col(df, ["kab_kota", "kabkota", "kabupatenkota", "kabupaten/kota",
|
|
|
|
| 60 |
return prov, kab
|
| 61 |
|
| 62 |
def detect_kewenangan_col(df):
|
| 63 |
-
return pick_col(df, ["kewenangan", "pu_level", "level_kewenangan",
|
|
|
|
| 64 |
|
| 65 |
def load_excel(path):
|
| 66 |
df = pd.read_excel(path, engine="openpyxl")
|
| 67 |
for c in df.columns:
|
| 68 |
if df[c].dtype == object:
|
| 69 |
-
df[c] = (df[c]
|
| 70 |
-
.str.replace("\u00a0", " ", regex=False)
|
| 71 |
-
.str.replace(r"\s+", " ", regex=True)
|
| 72 |
-
.str.strip())
|
| 73 |
-
df.loc[df[c].str.lower().isin(["nan", "none", "null", ""]), c] = np.nan
|
| 74 |
return df
|
| 75 |
|
| 76 |
def clean_str_list(values):
|
|
@@ -91,7 +126,6 @@ def clean_str_list(values):
|
|
| 91 |
return uniq
|
| 92 |
|
| 93 |
def safe_numeric_cols(df, exclude=set(), min_non_na=0.25):
|
| 94 |
-
"""Numeric cols used for completeness/zero/similarity. Hard-exclude columns by exact name."""
|
| 95 |
hard = {canon(x) for x in EXCLUDE_COLS_EXACT}
|
| 96 |
cols = []
|
| 97 |
for c in df.columns:
|
|
@@ -174,10 +208,14 @@ def scatter_plot(peer_agg, x_col, y_col):
|
|
| 174 |
|
| 175 |
|
| 176 |
# ============================================================
|
| 177 |
-
# LOAD
|
| 178 |
# ============================================================
|
| 179 |
if not os.path.exists(DATA_PATH):
|
| 180 |
-
raise FileNotFoundError(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
df_raw = load_excel(DATA_PATH)
|
| 183 |
prov_col, kab_col = detect_geo_cols(df_raw)
|
|
@@ -187,11 +225,9 @@ if prov_col is None or kab_col is None:
|
|
| 187 |
raise ValueError("Kolom provinsi/kab_kota tidak terdeteksi. Pastikan ada kolom provinsi dan kab_kota.")
|
| 188 |
|
| 189 |
df = df_raw.copy()
|
| 190 |
-
df["_prov_str"] = df[prov_col]
|
| 191 |
-
df["_kab_str"]
|
| 192 |
-
df
|
| 193 |
-
df.loc[df["_kab_str"].str.lower().isin(["nan", "none", "null", ""]), "_kab_str"] = np.nan
|
| 194 |
-
df = df[df["_prov_str"].notna() & df["_kab_str"].notna()].copy() # penting supaya tidak "campur"
|
| 195 |
|
| 196 |
exclude_base = {prov_col, kab_col, "_prov_str", "_kab_str"}
|
| 197 |
hard_exclude_cols_in_file = {c for c in df.columns if canon(c) in {canon(x) for x in EXCLUDE_COLS_EXACT}}
|
|
@@ -201,58 +237,69 @@ num_cols_all = safe_numeric_cols(df, exclude=exclude_base)
|
|
| 201 |
benford_cols = [c for c in num_cols_all if is_benford_applicable(c)]
|
| 202 |
|
| 203 |
PROVS = clean_str_list(df["_prov_str"].unique().tolist())
|
|
|
|
|
|
|
| 204 |
|
| 205 |
-
prov_cache_peer = {} # cache per prov
|
| 206 |
|
| 207 |
|
| 208 |
def kabs_for_prov(pv):
|
|
|
|
|
|
|
| 209 |
return clean_str_list(df.loc[df["_prov_str"] == pv, "_kab_str"].unique().tolist())
|
| 210 |
|
| 211 |
def kew_for(pv, kv):
|
| 212 |
if not kew_col or kew_col not in df.columns:
|
| 213 |
return ["(kewenangan tidak tersedia)"]
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
return vals if vals else ["(kewenangan kosong)"]
|
| 216 |
|
| 217 |
def get_peer_agg_for_prov(pv):
|
| 218 |
if pv in prov_cache_peer:
|
| 219 |
return prov_cache_peer[pv]
|
| 220 |
peer = df[df["_prov_str"] == pv]
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
prov_cache_peer[pv] = peer_agg
|
| 225 |
return peer_agg
|
| 226 |
|
| 227 |
|
| 228 |
# ============================================================
|
| 229 |
-
# CORE AUDIT
|
| 230 |
# ============================================================
|
| 231 |
def audit(pv, kv, kw):
|
| 232 |
-
|
| 233 |
-
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
|
|
|
| 237 |
|
| 238 |
if dfx.empty:
|
| 239 |
-
return (
|
| 240 |
-
"❌ Data kosong setelah filter (cek kewenangan / validitas label).",
|
| 241 |
-
pd.DataFrame(),
|
| 242 |
-
pd.DataFrame(),
|
| 243 |
-
None,
|
| 244 |
-
None
|
| 245 |
-
)
|
| 246 |
|
| 247 |
if not num_cols_all:
|
| 248 |
-
return
|
| 249 |
|
| 250 |
num_all = dfx[num_cols_all].apply(pd.to_numeric, errors="coerce")
|
| 251 |
-
|
| 252 |
completeness = float(num_all.notna().mean().mean())
|
| 253 |
zero_rate = float((num_all.fillna(0) == 0).mean().mean())
|
| 254 |
|
| 255 |
-
# Benford
|
| 256 |
best = None
|
| 257 |
rows = []
|
| 258 |
for c in benford_cols:
|
|
@@ -270,12 +317,12 @@ def audit(pv, kv, kw):
|
|
| 270 |
ben_note = f"Benford strongest: {best['kolom']} | n={best['n']} | MAD={best['mad']:.4f} ({benford_flag(best['mad'])}) | p={best['p_value']:.3g}"
|
| 271 |
ben_img = benford_plot(best["obs"])
|
| 272 |
|
| 273 |
-
# Similarity
|
| 274 |
peer_agg = get_peer_agg_for_prov(pv)
|
| 275 |
sim_tbl = pd.DataFrame()
|
| 276 |
top_sim = None
|
| 277 |
|
| 278 |
-
if peer_agg.shape[0] >= 3:
|
| 279 |
X = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0).to_numpy(float)
|
| 280 |
Xs = StandardScaler().fit_transform(X)
|
| 281 |
sim = cosine_similarity(Xs)
|
|
@@ -295,20 +342,19 @@ def audit(pv, kv, kw):
|
|
| 295 |
if not sim_tbl.empty:
|
| 296 |
top_sim = float(sim_tbl["cosine_similarity"].max())
|
| 297 |
|
| 298 |
-
# scatter
|
| 299 |
scat_img = None
|
| 300 |
-
if peer_agg.shape[0] >= 3:
|
| 301 |
vars_ = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0).var(axis=0).sort_values(ascending=False)
|
| 302 |
if len(vars_) >= 2 and vars_.iloc[0] > 0 and vars_.iloc[1] > 0:
|
| 303 |
-
|
| 304 |
-
scat_img = scatter_plot(peer_agg, x_col, y_col)
|
| 305 |
|
| 306 |
too_perfect = (completeness > 0.98) and (zero_rate < 0.02)
|
| 307 |
|
| 308 |
scorecard = pd.DataFrame([
|
| 309 |
["Provinsi", pv, ""],
|
| 310 |
["Kab/Kota", kv, ""],
|
| 311 |
-
["Kewenangan", kw if kw else "NA",
|
|
|
|
| 312 |
["Completeness (numeric)", f"{completeness:.2%}",
|
| 313 |
"Kelengkapan tinggi; pastikan berasal dari validasi input (wajib isi) atau data administratif lengkap. Jika ada imputasi, dokumentasikan prosedurnya."],
|
| 314 |
["Zero-rate (numeric)", f"{zero_rate:.2%}",
|
|
@@ -330,7 +376,92 @@ def audit(pv, kv, kw):
|
|
| 330 |
|
| 331 |
|
| 332 |
# ============================================================
|
| 333 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
# ============================================================
|
| 335 |
def ui_init():
|
| 336 |
pv = PROVS[0] if PROVS else None
|
|
@@ -352,10 +483,13 @@ def on_kab_change(pv, kv):
|
|
| 352 |
kw = kews[0] if kews else None
|
| 353 |
return gr.update(choices=kews, value=kw)
|
| 354 |
|
|
|
|
| 355 |
def run_audit(pv, kv, kw):
|
| 356 |
narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl = audit(pv, kv, kw)
|
| 357 |
-
|
| 358 |
-
|
|
|
|
|
|
|
| 359 |
|
| 360 |
|
| 361 |
pv0, kv0, kw0, kabs0, kews0 = ui_init()
|
|
@@ -366,7 +500,8 @@ with gr.Blocks(title="IPLM Audit — Kualitas Data & Indikasi Tidak Wajar", them
|
|
| 366 |
f"- Sumber data: `{DATA_PATH}`\n"
|
| 367 |
f"- EXCLUDE (no analysis): `{', '.join(sorted(EXCLUDE_COLS_EXACT))}`\n"
|
| 368 |
f"- prov_col = `{prov_col}` · kab_col = `{kab_col}` · kewenangan_col = `{kew_col if kew_col else 'TIDAK ADA'}`\n"
|
| 369 |
-
"
|
|
|
|
| 370 |
)
|
| 371 |
|
| 372 |
with gr.Row():
|
|
@@ -377,7 +512,9 @@ with gr.Blocks(title="IPLM Audit — Kualitas Data & Indikasi Tidak Wajar", them
|
|
| 377 |
prov.change(on_prov_change, inputs=prov, outputs=[kab, kew], show_progress=False)
|
| 378 |
kab.change(on_kab_change, inputs=[prov, kab], outputs=kew, show_progress=False)
|
| 379 |
|
| 380 |
-
|
|
|
|
|
|
|
| 381 |
|
| 382 |
out_md = gr.Markdown()
|
| 383 |
out_score = gr.Dataframe(label="Scorecard", interactive=False, wrap=True)
|
|
@@ -389,6 +526,26 @@ with gr.Blocks(title="IPLM Audit — Kualitas Data & Indikasi Tidak Wajar", them
|
|
| 389 |
|
| 390 |
out_sim = gr.Dataframe(label="Top Similarity (se-Provinsi)", interactive=False, wrap=True)
|
| 391 |
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
-
demo.
|
|
|
|
| 1 |
+
# ============================================================
|
| 2 |
+
# IPLM Audit — HF Spaces (Gradio) — + LLM Analysis (Optional)
|
| 3 |
+
# - Scorecard + Benford + Similarity + Scatter
|
| 4 |
+
# - Tambahan: LLM narasi untuk Scorecard (teknokratis)
|
| 5 |
+
# ============================================================
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# ---- CRASH FIX (HF Spaces Exit 139 / SIGSEGV) ----
|
| 10 |
+
os.environ["OMP_NUM_THREADS"] = "1"
|
| 11 |
+
os.environ["OPENBLAS_NUM_THREADS"] = "1"
|
| 12 |
+
os.environ["MKL_NUM_THREADS"] = "1"
|
| 13 |
+
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
|
| 14 |
+
os.environ["NUMEXPR_NUM_THREADS"] = "1"
|
| 15 |
+
os.environ["MPLBACKEND"] = "Agg"
|
| 16 |
+
os.environ["PYTHONUNBUFFERED"] = "1"
|
| 17 |
+
|
| 18 |
+
import re, math, io, json, textwrap
|
| 19 |
import numpy as np
|
| 20 |
import pandas as pd
|
| 21 |
import gradio as gr
|
|
|
|
| 24 |
from scipy.stats import chisquare
|
| 25 |
from sklearn.preprocessing import StandardScaler
|
| 26 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 27 |
+
|
| 28 |
+
import matplotlib
|
| 29 |
+
matplotlib.use("Agg")
|
| 30 |
import matplotlib.pyplot as plt
|
| 31 |
|
| 32 |
|
| 33 |
# ============================================================
|
| 34 |
# CONFIG
|
| 35 |
# ============================================================
|
| 36 |
+
# Path to the Excel workbook; override with the IPLM_DATA_PATH env variable.
DATA_PATH = os.getenv("IPLM_DATA_PATH", "data/IPLM_clean_manual_131225.xlsx")

# LLM provider (optional):
# - OpenAI: set OPENAI_API_KEY (+ optional OPENAI_MODEL)
# - Gemini: set GEMINI_API_KEY (+ optional GEMINI_MODEL)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")  # safe and cheap default; change freely

# Read the documented GEMINI_API_KEY first. HF_TOKEN_DQ stays as a fallback
# because it was the only variable consulted before, so existing deployments
# that configured it keep working.
GEMINI_API_KEY = (
    os.getenv("GEMINI_API_KEY", "").strip()
    or os.getenv("HF_TOKEN_DQ", "").strip()
)
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-1.5-flash")
|
| 46 |
|
| 47 |
EXCLUDE_COLS_EXACT = {"kontak_wa", "npp", "tanggal_kirim", "updated_at", "created_at"}
|
| 48 |
|
|
|
|
| 69 |
def canon(s: str) -> str:
    """Return a canonical key for *s*: lower-cased, alphanumerics only.

    The value is passed through ``str()`` first, so non-string inputs are
    accepted. Every run of characters outside ``a-z0-9`` is removed, which
    makes ``"Kab/Kota"``, ``"kab_kota"`` and ``" KabKota "`` compare equal.
    """
    return re.sub(r"[^a-z0-9]+", "", str(s).lower())
|
| 71 |
|
| 72 |
+
def clean_text_col(s: pd.Series) -> pd.Series:
    """Normalise a text column and turn placeholder values into NaN.

    Steps: stringify every value, replace non-breaking spaces with plain
    spaces, collapse whitespace runs to a single space and strip the ends.
    Values that were null on input, or whose cleaned text is one of
    ``"nan"``, ``"none"``, ``"null"`` or the empty string (compared
    case-insensitively), become ``np.nan``.

    Returns a new Series with the same index as *s*.
    """
    # Record genuine nulls before stringifying: astype(str) renders pd.NA and
    # NaT as "<NA>" / "NaT", which the placeholder list below would not catch.
    was_null = s.isna()

    s = s.astype(str)
    s = s.str.replace("\u00a0", " ", regex=False)
    s = s.str.replace(r"\s+", " ", regex=True)
    s = s.str.strip()

    is_placeholder = s.str.lower().isin(["nan", "none", "null", ""])
    return s.mask(is_placeholder | was_null, np.nan)
|
| 79 |
+
|
| 80 |
def pick_col(df, candidates):
|
| 81 |
cols = list(df.columns)
|
| 82 |
cc = {canon(c): c for c in cols}
|
|
|
|
| 93 |
|
| 94 |
def detect_geo_cols(df):
    """Locate the province and regency/city columns of *df*.

    Each lookup is delegated to ``pick_col`` with a list of candidate names.

    Returns:
        A ``(prov, kab)`` tuple holding whatever ``pick_col`` returned for
        the province candidates and the regency/city candidates.
        NOTE(review): the caller tests these against ``None``, so ``pick_col``
        presumably returns ``None`` on a miss; confirm against its definition.
    """
    prov = pick_col(df, ["provinsi", "propinsi", "province"])
    kab = pick_col(df, ["kab_kota", "kabkota", "kabupatenkota", "kabupaten/kota",
                        "kabupaten", "kota", "regency", "city"])
    return prov, kab
|
| 99 |
|
| 100 |
def detect_kewenangan_col(df):
    """Locate the authority-level ("kewenangan") column of *df*.

    Returns whatever ``pick_col`` yields for the candidate names below.
    """
    return pick_col(df, ["kewenangan", "pu_level", "level_kewenangan",
                         "kewenangan_pengelola", "kewenangan_perpustakaan", "level"])
|
| 103 |
|
| 104 |
def load_excel(path):
    """Read the Excel workbook at *path* and clean every text column.

    The sheet is loaded with the ``openpyxl`` engine. Each column whose dtype
    is ``object`` or a pandas string dtype is passed through
    ``clean_text_col``; other columns are left untouched.

    Returns the loaded DataFrame with the text columns replaced.
    """
    df = pd.read_excel(path, engine="openpyxl")
    for c in df.columns:
        dtype = df[c].dtype
        # Checking only `dtype == object` would skip columns that pandas
        # stores with a dedicated string dtype, leaving them uncleaned.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            df[c] = clean_text_col(df[c])
    return df
|
| 110 |
|
| 111 |
def clean_str_list(values):
|
|
|
|
| 126 |
return uniq
|
| 127 |
|
| 128 |
def safe_numeric_cols(df, exclude=set(), min_non_na=0.25):
|
|
|
|
| 129 |
hard = {canon(x) for x in EXCLUDE_COLS_EXACT}
|
| 130 |
cols = []
|
| 131 |
for c in df.columns:
|
|
|
|
| 208 |
|
| 209 |
|
| 210 |
# ============================================================
|
| 211 |
+
# LOAD DATA (GLOBAL)
|
| 212 |
# ============================================================
|
| 213 |
if not os.path.exists(DATA_PATH):
|
| 214 |
+
raise FileNotFoundError(
|
| 215 |
+
f"Data file not found: {DATA_PATH}\n"
|
| 216 |
+
"Taruh file excel di repo: data/IPLM_clean_manual_131225.xlsx\n"
|
| 217 |
+
"atau set env variable IPLM_DATA_PATH."
|
| 218 |
+
)
|
| 219 |
|
| 220 |
df_raw = load_excel(DATA_PATH)
|
| 221 |
prov_col, kab_col = detect_geo_cols(df_raw)
|
|
|
|
| 225 |
raise ValueError("Kolom provinsi/kab_kota tidak terdeteksi. Pastikan ada kolom provinsi dan kab_kota.")
|
| 226 |
|
| 227 |
df = df_raw.copy()
|
| 228 |
+
df["_prov_str"] = clean_text_col(df[prov_col])
|
| 229 |
+
df["_kab_str"] = clean_text_col(df[kab_col])
|
| 230 |
+
df = df[df["_prov_str"].notna() & df["_kab_str"].notna()].copy() # cegah mixing
|
|
|
|
|
|
|
| 231 |
|
| 232 |
exclude_base = {prov_col, kab_col, "_prov_str", "_kab_str"}
|
| 233 |
hard_exclude_cols_in_file = {c for c in df.columns if canon(c) in {canon(x) for x in EXCLUDE_COLS_EXACT}}
|
|
|
|
| 237 |
benford_cols = [c for c in num_cols_all if is_benford_applicable(c)]
|
| 238 |
|
| 239 |
PROVS = clean_str_list(df["_prov_str"].unique().tolist())
|
| 240 |
+
if not PROVS:
|
| 241 |
+
raise ValueError("Tidak ada nilai provinsi yang valid setelah cleaning.")
|
| 242 |
|
| 243 |
+
prov_cache_peer = {} # cache peer per prov
|
| 244 |
|
| 245 |
|
| 246 |
def kabs_for_prov(pv):
    """Return the cleaned list of regency/city labels found in province *pv*.

    Returns an empty list when *pv* is ``None``. Otherwise the unique
    ``_kab_str`` values of the rows whose ``_prov_str`` equals *pv* are
    passed through ``clean_str_list``.
    """
    if pv is None:
        return []
    return clean_str_list(df.loc[df["_prov_str"] == pv, "_kab_str"].unique().tolist())
|
| 250 |
|
| 251 |
def kew_for(pv, kv):
    """Return the authority-level choices for province *pv* and regency *kv*.

    A single-item list with a parenthesised placeholder is returned when the
    authority column is missing, when either selection is ``None``, or when
    the matching rows hold no usable value. The placeholders start with "("
    so callers can tell them apart from real labels.
    """
    if not kew_col or kew_col not in df.columns:
        return ["(kewenangan tidak tersedia)"]
    if pv is None or kv is None:
        return ["(pilih provinsi & kab/kota)"]
    vals = clean_str_list(
        df.loc[(df["_prov_str"] == pv) & (df["_kab_str"] == kv), kew_col]
        .dropna().unique().tolist()
    )
    return vals if vals else ["(kewenangan kosong)"]
|
| 261 |
|
| 262 |
def get_peer_agg_for_prov(pv):
    """Return per-regency means of the numeric columns for province *pv*.

    Results are memoised in the module-level ``prov_cache_peer`` dict, keyed
    by *pv*. The returned frame has one row per ``_kab_str`` value (exposed
    as ``kab_kota``) and one column per entry of ``num_cols_all``. Values
    that cannot be parsed as numbers are coerced to NaN before averaging.
    """
    if pv in prov_cache_peer:
        return prov_cache_peer[pv]

    peer = df[df["_prov_str"] == pv]
    if peer.empty:
        # Keep the same columns as the non-empty result so callers can index
        # the numeric columns without special-casing an unknown province.
        peer_agg = pd.DataFrame(columns=["kab_kota", *num_cols_all])
        prov_cache_peer[pv] = peer_agg
        return peer_agg

    # Convert once for the whole province rather than once per group.
    numeric = peer[num_cols_all].apply(pd.to_numeric, errors="coerce")
    peer_agg = (
        numeric.groupby(peer["_kab_str"])
        .mean()
        .reset_index()
        .rename(columns={"_kab_str": "kab_kota"})
    )
    prov_cache_peer[pv] = peer_agg
    return peer_agg
|
| 279 |
|
| 280 |
|
| 281 |
# ============================================================
|
| 282 |
+
# CORE AUDIT
|
| 283 |
# ============================================================
|
| 284 |
def audit(pv, kv, kw):
|
| 285 |
+
if pv is None or kv is None:
|
| 286 |
+
return "❌ Pilih provinsi dan kab/kota.", pd.DataFrame(), pd.DataFrame(), None, None, pd.DataFrame()
|
| 287 |
|
| 288 |
+
dfx = df[(df["_prov_str"] == pv) & (df["_kab_str"] == kv)].copy()
|
| 289 |
+
if kew_col and kew_col in dfx.columns and kw and not str(kw).startswith("("):
|
| 290 |
+
dfx = dfx[dfx[kew_col].astype(str).str.strip() == str(kw).strip()].copy()
|
| 291 |
|
| 292 |
if dfx.empty:
|
| 293 |
+
return "❌ Data kosong setelah filter (cek kewenangan/label).", pd.DataFrame(), pd.DataFrame(), None, None, pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
if not num_cols_all:
|
| 296 |
+
return "❌ Tidak ada kolom numerik yang cukup.", pd.DataFrame(), pd.DataFrame(), None, None, pd.DataFrame()
|
| 297 |
|
| 298 |
num_all = dfx[num_cols_all].apply(pd.to_numeric, errors="coerce")
|
|
|
|
| 299 |
completeness = float(num_all.notna().mean().mean())
|
| 300 |
zero_rate = float((num_all.fillna(0) == 0).mean().mean())
|
| 301 |
|
| 302 |
+
# Benford
|
| 303 |
best = None
|
| 304 |
rows = []
|
| 305 |
for c in benford_cols:
|
|
|
|
| 317 |
ben_note = f"Benford strongest: {best['kolom']} | n={best['n']} | MAD={best['mad']:.4f} ({benford_flag(best['mad'])}) | p={best['p_value']:.3g}"
|
| 318 |
ben_img = benford_plot(best["obs"])
|
| 319 |
|
| 320 |
+
# Similarity within prov
|
| 321 |
peer_agg = get_peer_agg_for_prov(pv)
|
| 322 |
sim_tbl = pd.DataFrame()
|
| 323 |
top_sim = None
|
| 324 |
|
| 325 |
+
if not peer_agg.empty and peer_agg.shape[0] >= 3:
|
| 326 |
X = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0).to_numpy(float)
|
| 327 |
Xs = StandardScaler().fit_transform(X)
|
| 328 |
sim = cosine_similarity(Xs)
|
|
|
|
| 342 |
if not sim_tbl.empty:
|
| 343 |
top_sim = float(sim_tbl["cosine_similarity"].max())
|
| 344 |
|
|
|
|
| 345 |
scat_img = None
|
| 346 |
+
if not peer_agg.empty and peer_agg.shape[0] >= 3:
|
| 347 |
vars_ = peer_agg[num_cols_all].replace([np.inf, -np.inf], np.nan).fillna(0.0).var(axis=0).sort_values(ascending=False)
|
| 348 |
if len(vars_) >= 2 and vars_.iloc[0] > 0 and vars_.iloc[1] > 0:
|
| 349 |
+
scat_img = scatter_plot(peer_agg, vars_.index[0], vars_.index[1])
|
|
|
|
| 350 |
|
| 351 |
too_perfect = (completeness > 0.98) and (zero_rate < 0.02)
|
| 352 |
|
| 353 |
scorecard = pd.DataFrame([
|
| 354 |
["Provinsi", pv, ""],
|
| 355 |
["Kab/Kota", kv, ""],
|
| 356 |
+
["Kewenangan", kw if kw else "NA",
|
| 357 |
+
f"Sumber: {kew_col}" if (kew_col and not str(kw).startswith("(")) else "Kewenangan tidak tersedia/kosong."],
|
| 358 |
["Completeness (numeric)", f"{completeness:.2%}",
|
| 359 |
"Kelengkapan tinggi; pastikan berasal dari validasi input (wajib isi) atau data administratif lengkap. Jika ada imputasi, dokumentasikan prosedurnya."],
|
| 360 |
["Zero-rate (numeric)", f"{zero_rate:.2%}",
|
|
|
|
| 376 |
|
| 377 |
|
| 378 |
# ============================================================
|
| 379 |
+
# LLM (OPTIONAL)
|
| 380 |
+
# ============================================================
|
| 381 |
+
def _llm_available():
    """Return True when at least one LLM API key (OpenAI or Gemini) is set."""
    return bool(OPENAI_API_KEY) or bool(GEMINI_API_KEY)
|
| 383 |
+
|
| 384 |
+
def llm_analyze_scorecard(pv, kv, kw, scorecard_df, ben_tbl, sim_tbl):
    """Return a policy-style narrative (in Indonesian) for one audit result.

    Args:
        pv, kv, kw: selected province, regency/city and authority level; they
            are passed to the model as context only.
        scorecard_df: scorecard table produced by the audit.
        ben_tbl, sim_tbl: Benford and similarity tables. Only the first ten
            rows of each are sent; anything that is not a DataFrame is sent
            as an empty list.

    Returns:
        The generated text, or a message starting with "❌" / "⚠️" when the
        scorecard is empty, no API key is configured, or the provider call
        fails. OpenAI is tried first when its key is set; Gemini is used as
        the fallback (or as the only provider when just its key is set).
    """
    if scorecard_df is None or len(scorecard_df) == 0:
        return "❌ Scorecard kosong. Jalankan audit dulu."

    if not _llm_available():
        return (
            "⚠️ **LLM belum aktif** karena API key belum diset.\n\n"
            "Set salah satu environment variable di HuggingFace Space:\n"
            "- `OPENAI_API_KEY` (opsional `OPENAI_MODEL`)\n"
            "- atau `GEMINI_API_KEY` (opsional `GEMINI_MODEL`)\n"
        )

    payload = {
        "provinsi": pv,
        "kab_kota": kv,
        "kewenangan": kw,
        "scorecard": scorecard_df.to_dict(orient="records"),
        "top_benford_signals": ben_tbl.head(10).to_dict(orient="records") if isinstance(ben_tbl, pd.DataFrame) else [],
        "top_similarity": sim_tbl.head(10).to_dict(orient="records") if isinstance(sim_tbl, pd.DataFrame) else [],
    }

    system = (
        "Anda adalah analis kebijakan publik dan auditor kualitas data untuk indikator pembangunan literasi/perpustakaan. "
        "Tulis analisis ringkas namun tajam, berbahasa Indonesia teknokratis (gaya Perpusnas/pemerintah). "
        "Fokus pada interpretasi scorecard: kelengkapan, zero-rate, Benford, similarity, implikasi risiko, dan rekomendasi tindak lanjut "
        "(cek bukti dukung, metadata, log input, sampling/validasi). Hindari asumsi yang tidak ada di data."
    )

    # default=str is deliberate: the tables may hold values json cannot encode
    # natively (timestamps, arrays, numpy scalars). The payload is only read
    # by the model, so a string rendering is better than a TypeError.
    user = (
        "Buatkan ANALISIS NARATIF untuk hasil audit berikut. Struktur wajib:\n"
        "1) Ringkasan status (1 paragraf)\n"
        "2) Interpretasi tiap komponen (bullet)\n"
        "3) Rekomendasi tindak lanjut prioritas (maks 6 bullet)\n\n"
        f"DATA (JSON):\n{json.dumps(payload, ensure_ascii=False, default=str)}"
    )

    # ---- Try OpenAI first if a key exists ----
    openai_error = None
    if OPENAI_API_KEY:
        try:
            from openai import OpenAI
            client = OpenAI(api_key=OPENAI_API_KEY)
            resp = client.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                temperature=0.2,
                max_tokens=700,
            )
            content = resp.choices[0].message.content
            if content and content.strip():
                return content.strip()
            # The reply may carry no text; treat that as a failure, not a result.
            openai_error = "respons kosong"
        except Exception as e:  # UI boundary: any provider failure becomes a message
            openai_error = str(e)
        if not GEMINI_API_KEY:
            return f"❌ LLM error (OpenAI): {openai_error}"
        # otherwise fall through to Gemini

    # ---- Gemini (fallback or sole provider) ----
    if GEMINI_API_KEY:
        try:
            from google import genai
            client = genai.Client(api_key=GEMINI_API_KEY)
            resp = client.models.generate_content(
                model=GEMINI_MODEL,
                contents=f"{system}\n\n{user}",
            )
            text = getattr(resp, "text", None)
            return (text or "").strip() or "❌ Gemini tidak mengembalikan teks."
        except Exception as e:  # UI boundary: any provider failure becomes a message
            msg = f"❌ LLM error (Gemini): {e}"
            if openai_error is not None:
                # Keep the first failure visible rather than dropping it.
                msg += f"\n\n❌ LLM error (OpenAI): {openai_error}"
            return msg

    return "❌ LLM tidak tersedia."
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
# ============================================================
|
| 464 |
+
# GRADIO UI
|
| 465 |
# ============================================================
|
| 466 |
def ui_init():
|
| 467 |
pv = PROVS[0] if PROVS else None
|
|
|
|
| 483 |
kw = kews[0] if kews else None
|
| 484 |
return gr.update(choices=kews, value=kw)
|
| 485 |
|
| 486 |
+
# state holder for last outputs (to feed LLM button without recompute)
|
| 487 |
def run_audit(pv, kv, kw):
    """Run ``audit`` and return its six outputs plus three state copies.

    The last three items (scorecard, Benford table, similarity table) repeat
    values already in the tuple, so the UI can store them in state components
    and feed the LLM button without recomputing the audit.
    """
    narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl = audit(pv, kv, kw)
    return narasi, scorecard, ben_tbl, ben_img, scat_img, sim_tbl, scorecard, ben_tbl, sim_tbl
|
| 490 |
+
|
| 491 |
+
def run_llm(pv, kv, kw, scorecard_df, ben_tbl, sim_tbl):
    """UI callback: forward the stored audit tables to ``llm_analyze_scorecard``."""
    return llm_analyze_scorecard(pv, kv, kw, scorecard_df, ben_tbl, sim_tbl)
|
| 493 |
|
| 494 |
|
| 495 |
pv0, kv0, kw0, kabs0, kews0 = ui_init()
|
|
|
|
| 500 |
f"- Sumber data: `{DATA_PATH}`\n"
|
| 501 |
f"- EXCLUDE (no analysis): `{', '.join(sorted(EXCLUDE_COLS_EXACT))}`\n"
|
| 502 |
f"- prov_col = `{prov_col}` · kab_col = `{kab_col}` · kewenangan_col = `{kew_col if kew_col else 'TIDAK ADA'}`\n"
|
| 503 |
+
"---\n"
|
| 504 |
+
"**LLM Analysis (opsional):** set `OPENAI_API_KEY` atau `GEMINI_API_KEY` di Space Variables."
|
| 505 |
)
|
| 506 |
|
| 507 |
with gr.Row():
|
|
|
|
| 512 |
prov.change(on_prov_change, inputs=prov, outputs=[kab, kew], show_progress=False)
|
| 513 |
kab.change(on_kab_change, inputs=[prov, kab], outputs=kew, show_progress=False)
|
| 514 |
|
| 515 |
+
with gr.Row():
|
| 516 |
+
btn = gr.Button("Run Audit", variant="primary")
|
| 517 |
+
btn_llm = gr.Button("Generate LLM Analysis", variant="secondary")
|
| 518 |
|
| 519 |
out_md = gr.Markdown()
|
| 520 |
out_score = gr.Dataframe(label="Scorecard", interactive=False, wrap=True)
|
|
|
|
| 526 |
|
| 527 |
out_sim = gr.Dataframe(label="Top Similarity (se-Provinsi)", interactive=False, wrap=True)
|
| 528 |
|
| 529 |
+
gr.Markdown("## Analisis Naratif (LLM)")
|
| 530 |
+
out_llm = gr.Markdown()
|
| 531 |
+
|
| 532 |
+
# hidden states: store last tables for llm button
|
| 533 |
+
st_score = gr.State(pd.DataFrame())
|
| 534 |
+
st_ben = gr.State(pd.DataFrame())
|
| 535 |
+
st_sim = gr.State(pd.DataFrame())
|
| 536 |
+
|
| 537 |
+
btn.click(
|
| 538 |
+
run_audit,
|
| 539 |
+
inputs=[prov, kab, kew],
|
| 540 |
+
outputs=[out_md, out_score, out_ben_tbl, out_ben_img, out_scat_img, out_sim, st_score, st_ben, st_sim],
|
| 541 |
+
show_progress=False
|
| 542 |
+
)
|
| 543 |
+
|
| 544 |
+
btn_llm.click(
|
| 545 |
+
run_llm,
|
| 546 |
+
inputs=[prov, kab, kew, st_score, st_ben, st_sim],
|
| 547 |
+
outputs=[out_llm],
|
| 548 |
+
show_progress=True
|
| 549 |
+
)
|
| 550 |
|
| 551 |
+
demo.launch()
|