irhamni committed on
Commit
e63555f
·
verified ·
1 Parent(s): 447fdc0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +799 -0
app.py ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ app.py — Dashboard Kekurangan Sampel IPLM (TANPA HITUNG INDEKS)
4
+ - Fokus: melihat kekurangan jumlah sampel IPLM per wilayah
5
+ - Bandingkan "sampel masuk (DM)" vs "populasi target (meta)"
6
+ - Pertahankan LLM untuk membuat laporan naratif kekurangan sampel
7
+
8
+ Output:
9
+ - Tabel verifikasi (coverage & gap)
10
+ - Download Excel (rekap + detail subset)
11
+ - Word report (opsional pie chart kalau kaleido tersedia)
12
+ """
13
+
14
+ import os
15
+ import re
16
+ import math
17
+ import tempfile
18
+ from pathlib import Path
19
+
20
+ import gradio as gr
21
+ import numpy as np
22
+ import pandas as pd
23
+ from huggingface_hub import InferenceClient
24
+
25
+ # Word report
26
+ from docx import Document
27
+ from docx.shared import Inches
28
+
29
+ # Pie chart opsional (kalau kaleido ada)
30
+ import plotly.express as px
31
+ try:
32
+ import kaleido # noqa: F401
33
+ HAS_KALEIDO = True
34
+ except Exception:
35
+ HAS_KALEIDO = False
36
+
37
+
38
# ============================================================
# 1) FILE CONFIGURATION
# ============================================================
DATA_FILE = "DM_001.xlsx"  # incoming sample data (multi-sheet workbook)
META_KAB_FILE = "jumlahdesa_fixed (1).xlsx"  # kecamatan & desa/kelurahan counts per kab/kota
META_SDSMP_FILE = "SD-SMP-kab.xlsx"  # SD & SMP school counts per kab/kota
META_SMA_FILE = "SMA.xlsx"  # SMA school counts per province

# ============================================================
# 1b) LLM CONFIGURATION (Hugging Face Inference)
# ============================================================
USE_LLM = True  # master switch; when off the rule-based report is used
LLM_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

# First token found among the common env-var spellings wins;
# None (anonymous access) when no variable is set.
HF_TOKEN = (
    os.getenv("HF_TOKEN")
    or os.getenv("HUGGINGFACEHUB_API_TOKEN")
    or os.getenv("HF_API_TOKEN")
)
57
+
58
_HF_CLIENT = None
def get_llm_client():
    """Return a process-wide cached InferenceClient, or None on failure.

    The client is built lazily on first call. A failed construction leaves
    the cache empty, so subsequent calls retry instead of caching the error.
    """
    global _HF_CLIENT
    if _HF_CLIENT is not None:
        return _HF_CLIENT
    try:
        kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
        _HF_CLIENT = InferenceClient(model=LLM_MODEL_NAME, **kwargs)
    except Exception:
        _HF_CLIENT = None
    return _HF_CLIENT
72
+
73
+
74
+ # ============================================================
75
+ # 2) UTIL
76
+ # ============================================================
77
+ def _canon(s: str) -> str:
78
+ return re.sub(r"[^a-z0-9]+", "", str(s).lower())
79
+
80
def pick_col(df, candidates):
    """Find a DataFrame column by name, tolerating case/punctuation drift.

    Exact candidate names win; otherwise names are compared after stripping
    everything but lowercase alphanumerics. Returns None when no candidate
    matches any column.
    """
    def canon(s):
        return re.sub(r"[^a-z0-9]+", "", str(s).lower())

    columns = list(df.columns)
    for cand in candidates:
        if cand in columns:
            return cand
    lookup = {canon(col): col for col in columns}
    for cand in candidates:
        match = lookup.get(canon(cand))
        if match is not None:
            return match
    return None
90
+
91
def coerce_num(val):
    """Parse a messy spreadsheet cell into a float (NaN when impossible).

    Strips currency ("Rp"), percent signs, and non-breaking spaces, then
    disambiguates European ("1.234,56") vs US ("1,234.56") separators.
    """
    if pd.isna(val):
        return np.nan
    text = str(val).strip()
    if text == "" or text in {"-", "–", "—"}:
        return np.nan
    text = text.replace("\u00a0", " ").replace("Rp", "").replace("%", "")
    text = re.sub(r"[^0-9,.\-]", "", text)
    dots, commas = text.count("."), text.count(",")
    if dots > 1 and commas == 1:
        # European style: dots are thousands separators, comma is decimal
        text = text.replace(".", "").replace(",", ".")
    elif commas > 1 and dots == 1:
        # US style: commas are thousands separators
        text = text.replace(",", "")
    elif commas == 1 and dots == 0:
        # Lone comma acts as the decimal separator
        text = text.replace(",", ".")
    else:
        text = text.replace(",", "")
    try:
        return float(text)
    except Exception:
        return np.nan
111
+
112
def norm_kew(v):
    """Normalize a 'kewenangan' value to KAB/KOTA, PROVINSI, or PUSAT.

    Unrecognized labels are returned uppercased/stripped; NA yields None.
    """
    if pd.isna(v):
        return None
    label = str(v).strip().upper()
    for keywords, canonical in (
        (("KAB", "KOTA"), "KAB/KOTA"),
        (("PROV",), "PROVINSI"),
        (("PUSAT", "NASIONAL"), "PUSAT"),
    ):
        if any(k in label for k in keywords):
            return canonical
    return label
123
+
124
+ def _norm_text(x):
125
+ if pd.isna(x):
126
+ return None
127
+ t = str(x).strip().upper()
128
+ return " ".join(t.split())
129
+
130
def norm_prov_label(s):
    """Canonical province key: drop the PROVINSI/PROPINSI word, keep A-Z0-9."""
    if pd.isna(s):
        return None
    label = str(s).upper()
    for word in ("PROVINSI", "PROPINSI"):
        label = label.replace(word, "")
    label = " ".join(label.split())
    return re.sub(r"[^A-Z0-9]+", "", label)
138
+
139
def norm_kab_label(s):
    """Canonical kab/kota key with KAB/KOTA prefix variants unified.

    Substitutions run in order so longer variants are consumed first;
    the result keeps only A-Z0-9.
    """
    if pd.isna(s):
        return None
    label = str(s).upper()
    for old, new in (
        ("KABUPATEN", "KAB"),
        ("KAB.", "KAB"),
        ("KOTA ADMINISTRASI", "KOTA"),
        ("KOTA ADM.", "KOTA"),
        ("KOTA.", "KOTA"),
    ):
        label = label.replace(old, new)
    label = " ".join(label.split())
    return re.sub(r"[^A-Z0-9]+", "", label)
150
+
151
def make_pie_plotly(num, den, title):
    """Render a 'covered vs not covered' donut chart to a temp PNG.

    Returns the image path, or None when kaleido is unavailable or the
    export fails. A missing/non-positive denominator renders an all-empty
    chart rather than failing.
    """
    if not HAS_KALEIDO:
        return None
    if den is None or pd.isna(den) or den <= 0:
        values = [0, 1]
    else:
        num = 0 if pd.isna(num) else float(num)
        den = float(den)
        values = [max(num, 0), max(den - num, 0)]
    labels = ["Terjangkau", "Belum Terjangkau"]
    fig = px.pie(values=values, names=labels, title=title, hole=0.3)
    # tempfile.mkstemp instead of the deprecated, race-prone tempfile.mktemp
    fd, tmp = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        fig.write_image(tmp, scale=2)
        return tmp
    except Exception:
        # best-effort cleanup of the now-useless temp file
        try:
            os.remove(tmp)
        except OSError:
            pass
        return None
169
+
170
+
171
+ # ============================================================
172
+ # 3) LOAD DATA (DM + META)
173
+ # ============================================================
174
# Globals populated by the load steps below; None/"" until loading succeeds.
DATA_INFO = ""
df_all_raw = None
meta_kab_df = None  # kab_key -> kecamatan, desa/kel, SD, SMP (combined meta)
meta_sma_df = None  # prov_key -> Jml_SMA

# Resolved DM column names (None when the column cannot be found).
prov_col_glob = kab_col_glob = kew_col_glob = jenis_col_glob = subjenis_col_glob = nama_col_glob = None
180
+
181
try:
    fp = Path(DATA_FILE)
    if not fp.exists():
        raise FileNotFoundError(f"File tidak ditemukan: {DATA_FILE}")

    # Concatenate every sheet of the DM workbook into one frame.
    xls = pd.ExcelFile(fp)
    frames = [pd.read_excel(fp, sheet_name=s) for s in xls.sheet_names]
    df_all_raw = pd.concat(frames, ignore_index=True, sort=False)

    # Resolve column names defensively (exact, then canonicalized match).
    prov_col_glob = pick_col(df_all_raw, ["provinsi", "Provinsi", "PROVINSI"])
    kab_col_glob = pick_col(df_all_raw, ["kab_kota", "Kab_Kota", "Kab/Kota", "KAB/KOTA", "kabupaten_kota", "kota"])
    kew_col_glob = pick_col(df_all_raw, ["kewenangan", "jenis_kewenangan", "Kewenangan", "KEWENANGAN"])
    jenis_col_glob = pick_col(df_all_raw, ["jenis_perpustakaan", "JENIS_PERPUSTAKAAN", "Jenis Perpustakaan", "jenis perpustakaan"])
    subjenis_col_glob = pick_col(df_all_raw, ["sub_jenis_perpus", "Sub Jenis", "SubJenis", "subjenis", "jenjang"])
    nama_col_glob = pick_col(df_all_raw, ["nama_perpustakaan", "nm_perpustakaan", "nm_instansi_lembaga", "Nama Perpustakaan"])

    # Normalized authority-level column
    if kew_col_glob:
        df_all_raw["KEW_NORM"] = df_all_raw[kew_col_glob].apply(norm_kew)
    else:
        df_all_raw["KEW_NORM"] = None

    # Library type -> dataset bucket {sekolah/umum/khusus}; unmapped -> NaN
    val_map_jenis = {
        "PERPUSTAKAAN SEKOLAH": "sekolah",
        "SEKOLAH": "sekolah",
        "PERPUSTAKAAN UMUM": "umum",
        "UMUM": "umum",
        "PERPUSTAKAAN DAERAH": "umum",
        "PERPUSTAKAAN KHUSUS": "khusus",
        "KHUSUS": "khusus",
    }
    if jenis_col_glob:
        df_all_raw["_dataset"] = df_all_raw[jenis_col_glob].apply(_norm_text).map(val_map_jenis)
    else:
        df_all_raw["_dataset"] = None

    DATA_INFO = f"Data terbaca dari: **{DATA_FILE}** | Jumlah baris: **{len(df_all_raw)}**"
except Exception as e:
    # Any load failure leaves df_all_raw None; the UI shows the message.
    df_all_raw = None
    DATA_INFO = f"⚠️ Gagal memuat `{DATA_FILE}` | Error: `{e}`"
222
+
223
extra_info = []

# --- Kab/kota meta: kecamatan + desa/kelurahan counts ---
try:
    meta_kab_raw = pd.read_excel(META_KAB_FILE)
    col_kab = pick_col(meta_kab_raw, ["Kab/Kota", "Kab_Kota", "kab/kota", "kabupaten_kota"])
    col_kec = pick_col(meta_kab_raw, ["Kecamatan", "jml_kecamatan", "jumlah_kecamatan"])
    col_des = pick_col(meta_kab_raw, ["Desa/Kel", "Desa Kelurahan", "Desa", "Desa_kel"])

    if col_kab and col_kec and col_des:
        meta_kab_df = pd.DataFrame({
            "Kab_Kota_Label": meta_kab_raw[col_kab].astype(str).str.strip(),
            "Jml_Kecamatan": meta_kab_raw[col_kec].apply(coerce_num),
            "Jml_DesaKel": meta_kab_raw[col_des].apply(coerce_num),
        })
        # kab_key is the normalized join key shared with the DM data.
        meta_kab_df["kab_key"] = meta_kab_df["Kab_Kota_Label"].apply(norm_kab_label)
        extra_info.append(f"Meta Kab/Kota (Kec/Desa) terbaca: **{META_KAB_FILE}** (n={len(meta_kab_df)})")
    else:
        meta_kab_df = None
        extra_info.append(f"⚠️ Kolom kunci meta kab tidak lengkap di `{META_KAB_FILE}`")
except Exception as e:
    meta_kab_df = None
    extra_info.append(f"⚠️ Gagal memuat `{META_KAB_FILE}` ({e})")
246
+
247
# --- SD/SMP meta per kab/kota ---
try:
    sd_smp_raw = pd.read_excel(META_SDSMP_FILE)
    col_kab2 = pick_col(sd_smp_raw, [
        "Kabupaten/Kota_Kabupaten/Kota", "Kabupaten/Kota",
        "Kab/Kota", "Kab_Kota", "kab/kota", "kabupaten_kota"
    ])
    col_sd = pick_col(sd_smp_raw, ["SD", "Jumlah SD", "Total SD", "SD_Total", "jml_sd", "Jml_SD"])
    col_smp = pick_col(sd_smp_raw, ["SMP", "Jumlah SMP", "Total SMP", "SMP_Total", "jml_smp", "Jml_SMP"])

    # At least one of SD/SMP is enough; the missing one defaults to 0.
    if col_kab2 and (col_sd or col_smp):
        df_sd_smp = pd.DataFrame({
            "Kab_Kota_Label_SD": sd_smp_raw[col_kab2].astype(str).str.strip(),
        })
        df_sd_smp["Jml_SD"] = sd_smp_raw[col_sd].apply(coerce_num) if col_sd else 0.0
        df_sd_smp["Jml_SMP"] = sd_smp_raw[col_smp].apply(coerce_num) if col_smp else 0.0
        df_sd_smp["kab_key"] = df_sd_smp["Kab_Kota_Label_SD"].apply(norm_kab_label)

        # Collapse duplicate kab keys by summing counts.
        df_sd_smp_grp = df_sd_smp.groupby("kab_key", as_index=False).agg({
            "Jml_SD": "sum",
            "Jml_SMP": "sum",
        })

        if meta_kab_df is not None:
            meta_kab_df = meta_kab_df.merge(df_sd_smp_grp, on="kab_key", how="left")
        else:
            meta_kab_df = df_sd_smp_grp.copy()
            # NOTE(review): relies on groupby(...)["..."].first() emitting rows
            # in the same kab_key order as df_sd_smp_grp — confirm alignment.
            meta_kab_df["Kab_Kota_Label"] = df_sd_smp.groupby("kab_key")["Kab_Kota_Label_SD"].first().values

        extra_info.append(f"Meta SD/SMP terbaca: **{META_SDSMP_FILE}** (n={len(df_sd_smp_grp)})")
    else:
        extra_info.append(f"⚠️ Kolom kunci SD/SMP tidak lengkap di `{META_SDSMP_FILE}`")
except Exception as e:
    extra_info.append(f"⚠️ Gagal memuat `{META_SDSMP_FILE}` ({e})")
281
+
282
# --- SMA meta per province ---
try:
    meta_sma_raw = pd.read_excel(META_SMA_FILE)

    col_prov_sma = pick_col(meta_sma_raw, [
        "Provinsi", "provinsi", "PROVINSI", "NAMA_PROVINSI", "Nama Provinsi",
        "nm_prov", "nm_provinsi", "prov"
    ])
    col_sma = pick_col(meta_sma_raw, [
        "Total SMA", "TOTAL_SMA", "TOTAL", "total",
        "Jml_SMA", "Jumlah SMA", "SMA", "SMA_Total",
        "jumlah_sma", "total_sma", "jml_sma"
    ])
    if col_prov_sma is None:
        raise ValueError("Kolom provinsi tidak ditemukan di file SMA.")
    if col_sma is None:
        raise ValueError("Kolom jumlah SMA tidak ditemukan di file SMA.")

    meta_sma_df = pd.DataFrame({
        "Provinsi_Label": meta_sma_raw[col_prov_sma].astype(str).str.strip(),
        "Jml_SMA": meta_sma_raw[col_sma].apply(coerce_num),
    })
    meta_sma_df["prov_key"] = meta_sma_df["Provinsi_Label"].apply(norm_prov_label)
    # One row per province key; duplicate rows sum their counts.
    meta_sma_df = meta_sma_df.groupby(["prov_key"], as_index=False).agg({
        "Provinsi_Label": "first",
        "Jml_SMA": "sum"
    })

    extra_info.append(f"Meta SMA terbaca: **{META_SMA_FILE}** ({len(meta_sma_df)} provinsi)")
except Exception as e:
    meta_sma_df = None
    extra_info.append(f"⚠️ Gagal memuat file SMA: {e}")
314
+
315
# Append meta-load diagnostics to the banner rendered in the UI header.
if extra_info:
    DATA_INFO = DATA_INFO + "<br>" + "<br>".join(extra_info)
317
+
318
+
319
+ # ============================================================
320
+ # 4) PILIHAN DROPDOWN
321
+ # ============================================================
322
def all_prov_choices():
    """Province dropdown options: '(Semua)' plus distinct DM values, sorted."""
    if df_all_raw is None or prov_col_glob is None:
        return ["(Semua)"]
    cleaned = df_all_raw[prov_col_glob].dropna().astype(str).str.strip()
    distinct = sorted(v for v in cleaned.unique() if v != "")
    return ["(Semua)"] + distinct
328
+
329
def get_kab_choices_for_prov(prov_value):
    """Kab/Kota options, optionally restricted to the selected province."""
    if df_all_raw is None or kab_col_glob is None:
        return ["(Semua)"]
    unfiltered = prov_value is None or prov_value == "(Semua)" or prov_col_glob is None
    if unfiltered:
        series = df_all_raw[kab_col_glob]
    else:
        mask = df_all_raw[prov_col_glob].astype(str).str.strip() == prov_value
        series = df_all_raw.loc[mask, kab_col_glob]
    cleaned = series.dropna().astype(str).str.strip()
    return ["(Semua)"] + sorted(v for v in cleaned.unique() if v != "")
339
+
340
def all_kew_choices():
    """Distinct normalized authority values, prefixed with '(Semua)'."""
    if df_all_raw is None:
        return ["(Semua)"]
    series = df_all_raw.get("KEW_NORM", pd.Series(dtype=object))
    cleaned = series.dropna().astype(str).str.strip()
    distinct = sorted(v for v in cleaned.unique() if v != "")
    return ["(Semua)"] + distinct if distinct else ["(Semua)"]
346
+
347
# Initial dropdown state, computed once at import time.
prov_choices = all_prov_choices()
kab_choices = get_kab_choices_for_prov(prov_choices[0] if prov_choices else "(Semua)")
kew_choices = all_kew_choices()
# Prefer KAB/KOTA as the default authority when it exists in the data.
default_kew = "KAB/KOTA" if "KAB/KOTA" in kew_choices else (kew_choices[0] if kew_choices else "(Semua)")
351
+
352
+
353
+ # ============================================================
354
+ # 5) INTI: HITUNG COVERAGE & GAP
355
+ # ============================================================
356
+ def _infer_jenjang_sd_smp(x):
357
+ if pd.isna(x):
358
+ return "OTHER"
359
+ t = str(x).upper()
360
+ # heuristik sederhana
361
+ if " SD " in f" {t} " or " SD/" in t or " MI " in f" {t} ":
362
+ return "SD"
363
+ if " SMP " in f" {t} " or " SMP/" in t or " MTS " in f" {t} ":
364
+ return "SMP"
365
+ return "OTHER"
366
+
367
def safe_pct(num, den):
    """Percentage num/den*100; NaN for a missing or non-positive denominator.

    A missing numerator is treated as 0.
    """
    if den is None or pd.isna(den) or den <= 0:
        return np.nan
    numerator = 0 if (num is None or pd.isna(num)) else num
    return 100.0 * float(numerator) / float(den)
373
+
374
def compute_gap_verification(df_filtered: pd.DataFrame, kew_value: str) -> pd.DataFrame:
    """Build the coverage & gap (sample-shortfall) verification table.

    - KAB/KOTA authority: compares school samples against the SD+SMP
      population and public-library samples against kecamatan + desa/kel
      administrative counts.
    - PROVINSI authority: compares SMA samples against the SMA population.

    Returns an empty DataFrame when there is nothing to verify, or a
    single-column "Info" DataFrame explaining why verification is impossible.
    Relies on module globals: kab_col_glob, prov_col_glob, subjenis_col_glob,
    meta_kab_df, meta_sma_df.
    """
    if df_filtered is None or len(df_filtered) == 0:
        return pd.DataFrame()

    kew_norm = str(kew_value or "").upper()

    # ================= KAB/KOTA =================
    if ("KAB" in kew_norm or "KOTA" in kew_norm):
        if kab_col_glob is None or meta_kab_df is None:
            return pd.DataFrame({"Info": ["Kolom kab/kota atau meta kab tidak tersedia."]})

        tmp = df_filtered.copy()
        tmp = tmp[pd.notna(tmp[kab_col_glob])]
        if tmp.empty:
            return pd.DataFrame()

        # Normalized join key shared with the kab/kota meta table.
        tmp["kab_key"] = tmp[kab_col_glob].apply(norm_kab_label)

        # Total samples per kab/kota
        g_total = tmp.groupby("kab_key").size().rename("Sampel_Total").reset_index()

        # School rows and their inferred SD/SMP level
        if subjenis_col_glob and subjenis_col_glob in tmp.columns:
            tmp["jenjang"] = tmp[subjenis_col_glob].apply(_infer_jenjang_sd_smp)
        else:
            tmp["jenjang"] = "OTHER"

        # NOTE(review): when "_dataset" is missing, ALL rows are counted as
        # school (and again below as public) — confirm that fallback is intended.
        tmp_sek = tmp[tmp["_dataset"] == "sekolah"].copy() if "_dataset" in tmp.columns else tmp.copy()
        g_sek_total = tmp_sek.groupby("kab_key").size().rename("Sampel_Sekolah_Total").reset_index()
        g_sd = tmp_sek[tmp_sek["jenjang"] == "SD"].groupby("kab_key").size().rename("Sampel_SD").reset_index()
        g_smp = tmp_sek[tmp_sek["jenjang"] == "SMP"].groupby("kab_key").size().rename("Sampel_SMP").reset_index()

        # Public libraries
        tmp_umum = tmp[tmp["_dataset"] == "umum"].copy() if "_dataset" in tmp.columns else tmp.copy()
        g_umum = tmp_umum.groupby("kab_key").size().rename("Sampel_Umum").reset_index()

        use_cols = ["kab_key", "Kab_Kota_Label", "Jml_Kecamatan", "Jml_DesaKel", "Jml_SD", "Jml_SMP"]
        use_cols = [c for c in use_cols if c in meta_kab_df.columns]

        merged = (
            g_total
            .merge(g_sek_total, on="kab_key", how="left")
            .merge(g_sd, on="kab_key", how="left")
            .merge(g_smp, on="kab_key", how="left")
            .merge(g_umum, on="kab_key", how="left")
            .merge(meta_kab_df[use_cols], on="kab_key", how="left")
        )

        # Unmatched counts become 0 after the left joins.
        for c in ["Sampel_Total", "Sampel_Sekolah_Total", "Sampel_SD", "Sampel_SMP", "Sampel_Umum"]:
            if c in merged.columns:
                merged[c] = merged[c].fillna(0).astype(int)

        merged["Pop_SD_SMP"] = merged[["Jml_SD", "Jml_SMP"]].sum(axis=1, skipna=True)
        # NOTE(review): '+' propagates NaN here, unlike the skipna sum above —
        # a kab missing either count gets a NaN admin population. Confirm intended.
        merged["Pop_Kec_DesaKel"] = merged.get("Jml_Kecamatan", np.nan) + merged.get("Jml_DesaKel", np.nan)

        merged["Coverage_Sekolah_%"] = merged.apply(
            lambda r: safe_pct(r["Sampel_Sekolah_Total"], r.get("Pop_SD_SMP", np.nan)), axis=1
        )
        merged["Coverage_Umum_%"] = merged.apply(
            lambda r: safe_pct(r["Sampel_Umum"], r.get("Pop_Kec_DesaKel", np.nan)), axis=1
        )

        # GAP (sample shortfall), floored at zero
        merged["Gap_Sekolah"] = merged.apply(
            lambda r: max(int(math.ceil(r["Pop_SD_SMP"] - r["Sampel_Sekolah_Total"])) if pd.notna(r["Pop_SD_SMP"]) else 0, 0),
            axis=1
        )
        merged["Gap_Umum"] = merged.apply(
            lambda r: max(int(math.ceil(r["Pop_Kec_DesaKel"] - r["Sampel_Umum"])) if pd.notna(r["Pop_Kec_DesaKel"]) else 0, 0),
            axis=1
        )

        out = pd.DataFrame({
            "Kab/Kota": merged.get("Kab_Kota_Label", merged["kab_key"]),
            "Sampel Total": merged["Sampel_Total"],
            "Sampel Sekolah (Total)": merged["Sampel_Sekolah_Total"],
            "Populasi Sekolah (SD+SMP)": merged["Pop_SD_SMP"],
            "Coverage Sekolah (%)": merged["Coverage_Sekolah_%"],
            "Kekurangan Sampel Sekolah": merged["Gap_Sekolah"],
            "Sampel Umum": merged["Sampel_Umum"],
            "Populasi Admin (Kec+Desa/Kel)": merged["Pop_Kec_DesaKel"],
            "Coverage Umum (%)": merged["Coverage_Umum_%"],
            "Kekurangan Sampel Umum": merged["Gap_Umum"],
        })

        return out.sort_values("Kab/Kota").reset_index(drop=True).round(3)

    # ================= PROVINSI =================
    if ("PROV" in kew_norm):
        if meta_sma_df is None:
            return pd.DataFrame({"Info": ["Meta SMA tidak tersedia."]})

        if prov_col_glob is None:
            return pd.DataFrame({"Info": ["Kolom provinsi tidak ditemukan di DM."]})

        tmp = df_filtered.copy()
        tmp = tmp[pd.notna(tmp[prov_col_glob])]
        if tmp.empty:
            return pd.DataFrame({"Info": ["Tidak ada data sampel kewenangan provinsi."]})

        tmp["prov_key"] = tmp[prov_col_glob].apply(norm_prov_label)

        g_total = tmp.groupby("prov_key").size().rename("Sampel_Total").reset_index()
        tmp_sek = tmp[tmp["_dataset"] == "sekolah"].copy() if "_dataset" in tmp.columns else tmp.copy()
        g_sma = tmp_sek.groupby("prov_key").size().rename("Sampel_SMA").reset_index()

        # Start from the meta table so every province appears, sampled or not.
        merged = (
            meta_sma_df.merge(g_total, on="prov_key", how="left")
            .merge(g_sma, on="prov_key", how="left")
        )

        merged["Sampel_Total"] = merged["Sampel_Total"].fillna(0).astype(int)
        merged["Sampel_SMA"] = merged["Sampel_SMA"].fillna(0).astype(int)

        merged["Coverage_SMA_%"] = merged.apply(
            lambda r: safe_pct(r["Sampel_SMA"], r.get("Jml_SMA", np.nan)), axis=1
        )
        merged["Kekurangan Sampel SMA"] = merged.apply(
            lambda r: max(int(math.ceil(r["Jml_SMA"] - r["Sampel_SMA"])) if pd.notna(r["Jml_SMA"]) else 0, 0),
            axis=1
        )

        out = pd.DataFrame({
            "Provinsi": merged["Provinsi_Label"],
            "Sampel Total (Prov)": merged["Sampel_Total"],
            "Sampel SMA (di DM)": merged["Sampel_SMA"],
            "Populasi SMA (Meta)": merged["Jml_SMA"],
            "Coverage SMA (%)": merged["Coverage_SMA_%"],
            "Kekurangan Sampel SMA": merged["Kekurangan Sampel SMA"],
        })

        return out.sort_values("Provinsi").reset_index(drop=True).round(3)

    return pd.DataFrame({"Info": ["Kewenangan tidak dikenali / tidak didukung."]})
513
+
514
+
515
+ # ============================================================
516
+ # 6) BUILD CONTEXT UNTUK LLM + FALLBACK
517
+ # ============================================================
518
def build_context_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: str) -> str:
    """Condense the verification table into a plain-text LLM context.

    Includes the active region/authority, per-column gap totals, and the
    ten rows with the largest gap in the first shortfall column.
    """
    if kab and kab != "(Semua)":
        wilayah = kab
    elif prov and prov != "(Semua)":
        wilayah = prov
    else:
        wilayah = "NASIONAL"

    lines = [
        f"Wilayah filter: {wilayah}",
        f"Kewenangan: {kew}",
        f"Jumlah baris verifikasi: {len(verif_df)}",
    ]

    gap_cols = [c for c in verif_df.columns if "Kekurangan" in c]

    # Overall total per gap column (best-effort; skip uncoercible columns)
    for gc in gap_cols:
        try:
            total = pd.to_numeric(verif_df[gc], errors="coerce").fillna(0).sum()
            lines.append(f"Total {gc}: {int(float(total))}")
        except Exception:
            pass

    # Ten largest gaps in the first shortfall column
    if gap_cols:
        gc = gap_cols[0]
        try:
            ranked = verif_df.copy()
            ranked[gc] = pd.to_numeric(ranked[gc], errors="coerce").fillna(0)
            top = ranked.sort_values(gc, ascending=False).head(10)
            if "Kab/Kota" in top.columns:
                keycol = "Kab/Kota"
            elif "Provinsi" in top.columns:
                keycol = "Provinsi"
            else:
                keycol = top.columns[0]
            lines.append("\nTop prioritas (gap terbesar):")
            for _, row in top.iterrows():
                lines.append(f"- {row[keycol]}: {gc}={int(row[gc])}")
        except Exception:
            pass

    return "\n".join(lines)
549
+
550
def rule_based_gap_report(verif_df: pd.DataFrame, prov: str, kab: str, kew: str) -> str:
    """Deterministic fallback narrative used when the LLM is unavailable."""
    if verif_df is None or verif_df.empty:
        return "Tidak ada data verifikasi yang dapat dilaporkan."

    if kab and kab != "(Semua)":
        wilayah = kab
    elif prov and prov != "(Semua)":
        wilayah = prov
    else:
        wilayah = "NASIONAL"

    lines = [
        "## Ringkasan Kekurangan Sampel IPLM (Rule-based)\n",
        f"Wilayah: {wilayah}",
        f"Kewenangan: {kew}",
        f"Jumlah unit analisis: {len(verif_df)}\n",
    ]

    gap_cols = [c for c in verif_df.columns if "Kekurangan" in c]
    if not gap_cols:
        lines.append("Kolom kekurangan sampel tidak ditemukan pada tabel verifikasi.")
        return "\n".join(lines)

    for gc in gap_cols:
        total_gap = int(pd.to_numeric(verif_df[gc], errors="coerce").fillna(0).sum())
        lines.append(f"- Total {gc}: **{total_gap}** unit yang perlu dilengkapi.")

    lines.append(
        "\nRekomendasi operasional: fokuskan pengumpulan data pada unit/wilayah dengan gap terbesar, "
        "mulai dari area yang memiliki populasi target besar namun sampel masuk masih terbatas. "
        "Pastikan konsistensi penamaan provinsi/kab-kota agar matching dengan meta tidak gagal."
    )
    return "\n".join(lines)
576
+
577
def generate_llm_gap_report(verif_df: pd.DataFrame, prov: str, kab: str, kew: str) -> str:
    """Produce the narrative gap report via the HF chat-completion endpoint.

    Falls back to rule_based_gap_report when no client is available, when
    USE_LLM is off, or when the remote call fails/returns empty text; the
    fallback output is prefixed with a warning marker.
    """
    ctx = build_context_gap(verif_df, prov, kab, kew)

    client = get_llm_client()
    if client is None or not USE_LLM:
        return "⚠️ LLM tidak tersedia, memakai laporan rule-based.\n\n" + rule_based_gap_report(verif_df, prov, kab, kew)

    system_prompt = (
        "Anda adalah analis kebijakan dan manajer program IPLM. "
        "Tugas Anda menyusun narasi singkat dan tegas tentang kekurangan sampel data IPLM "
        "serta strategi pengumpulan data untuk menutup gap."
    )

    user_prompt = f"""
DATA RINGKAS GAP SAMPEL IPLM:

{ctx}

TULIS LAPORAN (BAHASA INDONESIA FORMAL) DENGAN STRUKTUR:
1) Ringkasan kondisi pengumpulan data (1 paragraf).
2) Angka total kekurangan sampel yang masih perlu dikumpulkan (1 paragraf).
3) Prioritas wilayah (top gap) dan alasan operasionalnya (1 paragraf).
4) Rencana aksi 30–60 hari (paragraf naratif, bukan bullet).

BATASAN:
- Jangan bahas indeks / skor IPLM sama sekali.
- Fokus murni pada coverage, kekurangan sampel, dan strategi pelengkapannya.
"""

    try:
        resp = client.chat_completion(
            model=LLM_MODEL_NAME,
            messages=[{"role": "system", "content": system_prompt},
                      {"role": "user", "content": user_prompt}],
            max_tokens=900,
            temperature=0.2,
            top_p=0.9,
        )
        text = resp.choices[0].message.content.strip()
        if not text:
            # An empty completion is treated like a failure (fallback below).
            raise ValueError("Respon LLM kosong.")
        return text
    except Exception as e:
        return (
            "⚠️ Error saat memanggil LLM, memakai laporan rule-based.\n\n"
            f"(Detail teknis: {repr(e)})\n\n"
            + rule_based_gap_report(verif_df, prov, kab, kew)
        )
625
+
626
+
627
+ # ============================================================
628
+ # 7) WORD REPORT
629
+ # ============================================================
630
def generate_word_report_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: str, analysis_text: str):
    """Assemble the .docx gap report and return its file path.

    The report contains the verification table (truncated to 200 rows to
    keep the document small), at most one coverage pie chart (only when
    kaleido is available), and the narrative analysis text.
    """
    wilayah = kab if kab and kab != "(Semua)" else (prov if prov and prov != "(Semua)" else "NASIONAL")
    doc = Document()
    doc.add_heading(f"Laporan Kekurangan Sampel IPLM – {wilayah}", level=1)

    doc.add_paragraph(f"Kewenangan: {kew}")
    doc.add_paragraph(f"Jumlah unit analisis: {len(verif_df)}")

    # Verification table (capped at 200 rows)
    doc.add_heading("Tabel Verifikasi Coverage & Kekurangan Sampel", level=2)
    view = verif_df.copy()
    if len(view) > 200:
        doc.add_paragraph("Catatan: tabel dipotong (200 baris pertama) untuk menjaga ukuran dokumen.")
        view = view.head(200)

    table = doc.add_table(rows=1, cols=len(view.columns))
    hdr = table.rows[0].cells
    for i, c in enumerate(view.columns):
        hdr[i].text = str(c)

    for _, row in view.iterrows():
        r = table.add_row().cells
        for i, c in enumerate(view.columns):
            r[i].text = str(row[c])

    # Optional pie chart: a single overall summary, not one per region
    doc.add_heading("Ringkasan Visual (Opsional)", level=2)
    if not HAS_KALEIDO:
        doc.add_paragraph("Grafik pie tidak dibuat karena 'kaleido' tidak tersedia di server.")
    else:
        # Use the first sample/population column pair that is present.
        pie_made = False
        if "Sampel Sekolah (Total)" in verif_df.columns and "Populasi Sekolah (SD+SMP)" in verif_df.columns:
            samp = pd.to_numeric(verif_df["Sampel Sekolah (Total)"], errors="coerce").fillna(0).sum()
            pop = pd.to_numeric(verif_df["Populasi Sekolah (SD+SMP)"], errors="coerce").fillna(0).sum()
            img = make_pie_plotly(samp, pop, "Coverage Perpustakaan Sekolah (Total)")
            if img:
                doc.add_picture(img, width=Inches(5))
                pie_made = True

        if (not pie_made) and ("Sampel SMA (di DM)" in verif_df.columns and "Populasi SMA (Meta)" in verif_df.columns):
            samp = pd.to_numeric(verif_df["Sampel SMA (di DM)"], errors="coerce").fillna(0).sum()
            pop = pd.to_numeric(verif_df["Populasi SMA (Meta)"], errors="coerce").fillna(0).sum()
            img = make_pie_plotly(samp, pop, "Coverage Perpustakaan SMA (Total)")
            if img:
                doc.add_picture(img, width=Inches(5))
                pie_made = True

        if not pie_made:
            doc.add_paragraph("Tidak ada pasangan kolom sampel-populasi yang valid untuk dibuat pie chart.")

    doc.add_heading("Analisis Naratif (LLM)", level=2)
    for p in analysis_text.split("\n"):
        if p.strip():
            doc.add_paragraph(p)

    # tempfile.mkstemp instead of the deprecated, race-prone tempfile.mktemp
    fd, outpath = tempfile.mkstemp(suffix=".docx")
    os.close(fd)
    doc.save(outpath)
    return outpath
689
+
690
+
691
+ # ============================================================
692
+ # 8) CORE RUN (FILTER + EXPORT)
693
+ # ============================================================
694
def run_core(prov_value, kab_value, kew_value):
    """Filter DM by province/kab/authority, compute the gap table, and
    produce the downloads (Excel + Word) plus the narrative analysis.

    Returns a 7-tuple matching the Gradio outputs wiring:
    (verif_df, detail_df, excel_path, word_path, None, status_msg,
    analysis_text).
    """
    if df_all_raw is None or df_all_raw.empty:
        empty = pd.DataFrame()
        return empty, empty, None, None, None, "Data DM tidak terbaca.", "Tidak ada analisis."

    df = df_all_raw.copy()

    # Province filter
    if prov_col_glob and prov_value and prov_value != "(Semua)":
        df = df[df[prov_col_glob].astype(str).str.strip() == prov_value]

    # Kab/Kota filter
    if kab_col_glob and kab_value and kab_value != "(Semua)":
        df = df[df[kab_col_glob].astype(str).str.strip() == kab_value]

    # Authority filter (on the normalized column)
    if kew_value and kew_value != "(Semua)":
        df = df[df["KEW_NORM"] == kew_value]

    if len(df) == 0:
        empty = pd.DataFrame()
        return empty, empty, None, None, None, "Tidak ada data untuk filter tersebut.", "Tidak ada analisis."

    # Coverage/gap verification
    verif_df = compute_gap_verification(df, kew_value)

    # Compact detail subset for the download (only the known columns)
    cols = []
    for c in [prov_col_glob, kab_col_glob, nama_col_glob, kew_col_glob, jenis_col_glob, subjenis_col_glob, "_dataset", "KEW_NORM"]:
        if c and c in df.columns and c not in cols:
            cols.append(c)
    detail_df = df[cols].copy() if cols else df.copy()

    # Write the Excel workbook into a fresh temp directory
    tmpdir = tempfile.mkdtemp()
    out_excel = os.path.join(tmpdir, "Kekurangan_Sampel_IPLM.xlsx")

    with pd.ExcelWriter(out_excel, engine="openpyxl") as w:
        verif_df.to_excel(w, sheet_name="Verifikasi_Gap", index=False)
        detail_df.to_excel(w, sheet_name="Detail_Subset_DM", index=False)

    # LLM analysis (falls back to rule-based internally)
    analysis_text = generate_llm_gap_report(verif_df, prov_value, kab_value, kew_value)

    # Word report
    out_word = generate_word_report_gap(verif_df, prov_value, kab_value, kew_value, analysis_text)

    msg = f"OK. Subset DM: {len(df)} baris | Verifikasi: {len(verif_df)} baris."
    return verif_df, detail_df, out_excel, out_word, None, msg, analysis_text
743
+
744
+
745
def on_prov_change(prov_value):
    """Refresh the Kab/Kota dropdown when the province selection changes."""
    choices = get_kab_choices_for_prov(prov_value)
    return gr.update(choices=choices, value="(Semua)")
747
+
748
+
749
+ # ============================================================
750
+ # 9) UI GRADIO
751
+ # ============================================================
752
with gr.Blocks() as demo:
    # Header: describes the data sources and shows load diagnostics.
    gr.Markdown(
        f"""
# Dashboard Kekurangan Sampel IPLM (Tanpa Hitung Indeks)

Aplikasi ini hanya mengecek **kekurangan sampel** berdasarkan:
- **DM (sampel masuk)** vs **Meta populasi (SD/SMP, SMA, Kec/DesaKel)**

**File:**
- `{DATA_FILE}` (DM)
- `{META_KAB_FILE}` (Kecamatan + Desa/Kel)
- `{META_SDSMP_FILE}` (SD + SMP)
- `{META_SMA_FILE}` (SMA)

{DATA_INFO}
"""
    )

    with gr.Row():
        dd_prov = gr.Dropdown(label="Provinsi", choices=prov_choices, value=prov_choices[0])
        dd_kab = gr.Dropdown(label="Kab/Kota", choices=kab_choices, value=kab_choices[0])
        dd_kew = gr.Dropdown(label="Kewenangan", choices=kew_choices, value=default_kew)

    # Selecting a province narrows the kab/kota choices.
    dd_prov.change(fn=on_prov_change, inputs=dd_prov, outputs=dd_kab)

    run_btn = gr.Button("Hitung Kekurangan Sampel")
    msg_out = gr.Markdown()

    gr.Markdown("### Verifikasi Coverage & Kekurangan Sampel")
    verif_out = gr.DataFrame(interactive=False)

    gr.Markdown("### Detail Subset DM (yang terfilter)")
    detail_out = gr.DataFrame(interactive=False)

    gr.Markdown("### Analisis Naratif (LLM)")
    analysis_out = gr.Markdown()

    with gr.Row():
        excel_out = gr.File(label="Download Rekap Excel (.xlsx)")
        word_out = gr.File(label="Download Laporan Word (.docx)")

    # NOTE(review): run_core returns 7 values; the 5th output slot is a
    # throwaway gr.State absorbing the always-None return — confirm intended.
    run_btn.click(
        fn=run_core,
        inputs=[dd_prov, dd_kab, dd_kew],
        outputs=[verif_out, detail_out, excel_out, word_out, gr.State(), msg_out, analysis_out],
    )

demo.launch()