irhamni committed on
Commit
d914200
·
verified ·
1 Parent(s): 3cf2586

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -91
app.py CHANGED
@@ -1,14 +1,24 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
  app.py — Dashboard Kekurangan Sampel IPLM (TANPA HITUNG INDEKS)
4
- - Fokus: melihat kekurangan jumlah sampel IPLM per wilayah
5
- - Bandingkan "sampel masuk (DM)" vs "populasi target (meta)"
6
- - Pertahankan LLM untuk membuat laporan naratif kekurangan sampel
7
-
8
- Output:
9
- - Tabel verifikasi (coverage & gap)
10
- - Download Excel (rekap + detail subset)
11
- - Word report (opsional pie chart kalau kaleido tersedia)
 
 
 
 
 
 
 
 
 
 
12
  """
13
 
14
  import os
@@ -26,7 +36,7 @@ from huggingface_hub import InferenceClient
26
  from docx import Document
27
  from docx.shared import Inches
28
 
29
- # Pie chart opsional (kalau kaleido ada)
30
  import plotly.express as px
31
  try:
32
  import kaleido # noqa: F401
@@ -43,6 +53,7 @@ META_KAB_FILE = "jumlahdesa_fixed (1).xlsx" # kecamatan & desa/kel per kab/k
43
  META_SDSMP_FILE = "SD-SMP-kab.xlsx" # jumlah SD & SMP per kab/kota
44
  META_SMA_FILE = "SMA.xlsx" # jumlah SMA per provinsi
45
 
 
46
  # ============================================================
47
  # 1b) KONFIGURASI LLM (Hugging Face Inference)
48
  # ============================================================
@@ -50,7 +61,7 @@ USE_LLM = True
50
  LLM_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
51
 
52
  HF_TOKEN = (
53
- os.getenv("HF_SECRET")
54
  or os.getenv("HUGGINGFACEHUB_API_TOKEN")
55
  or os.getenv("HF_API_TOKEN")
56
  )
@@ -148,9 +159,27 @@ def norm_kab_label(s):
148
  t = " ".join(t.split())
149
  return re.sub(r"[^A-Z0-9]+", "", t)
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  def make_pie_plotly(num, den, title):
152
  if not HAS_KALEIDO:
153
  return None
 
154
  if den is None or pd.isna(den) or den <= 0:
155
  values = [0, 1]
156
  labels = ["Terjangkau", "Belum Terjangkau"]
@@ -159,6 +188,7 @@ def make_pie_plotly(num, den, title):
159
  den = float(den)
160
  values = [max(num, 0), max(den - num, 0)]
161
  labels = ["Terjangkau", "Belum Terjangkau"]
 
162
  fig = px.pie(values=values, names=labels, title=title, hole=0.3)
163
  tmp = tempfile.mktemp(suffix=".png")
164
  try:
@@ -173,10 +203,16 @@ def make_pie_plotly(num, den, title):
173
  # ============================================================
174
  DATA_INFO = ""
175
  df_all_raw = None
176
- meta_kab_df = None # kab_key -> kec, desa/kel, SD, SMP (gabungan)
177
- meta_sma_df = None # prov_key -> Jml_SMA
178
 
179
- prov_col_glob = kab_col_glob = kew_col_glob = jenis_col_glob = subjenis_col_glob = nama_col_glob = None
 
 
 
 
 
 
 
 
180
 
181
  try:
182
  fp = Path(DATA_FILE)
@@ -282,7 +318,6 @@ except Exception as e:
282
  # --- META SMA per provinsi ---
283
  try:
284
  meta_sma_raw = pd.read_excel(META_SMA_FILE)
285
-
286
  col_prov_sma = pick_col(meta_sma_raw, [
287
  "Provinsi", "provinsi", "PROVINSI", "NAMA_PROVINSI", "Nama Provinsi",
288
  "nm_prov", "nm_provinsi", "prov"
@@ -353,30 +388,7 @@ default_kew = "KAB/KOTA" if "KAB/KOTA" in kew_choices else (kew_choices[0] if k
353
  # ============================================================
354
  # 5) INTI: HITUNG COVERAGE & GAP
355
  # ============================================================
356
- def _infer_jenjang_sd_smp(x):
357
- if pd.isna(x):
358
- return "OTHER"
359
- t = str(x).upper()
360
- # heuristik sederhana
361
- if " SD " in f" {t} " or " SD/" in t or " MI " in f" {t} ":
362
- return "SD"
363
- if " SMP " in f" {t} " or " SMP/" in t or " MTS " in f" {t} ":
364
- return "SMP"
365
- return "OTHER"
366
-
367
- def safe_pct(num, den):
368
- if den is None or pd.isna(den) or den <= 0:
369
- return np.nan
370
- if num is None or pd.isna(num):
371
- num = 0
372
- return 100.0 * float(num) / float(den)
373
-
374
  def compute_gap_verification(df_filtered: pd.DataFrame, kew_value: str) -> pd.DataFrame:
375
- """
376
- Keluaran: tabel coverage & GAP (kekurangan sampel) sesuai kewenangan.
377
- - KAB/KOTA: bandingkan sampel sekolah vs (SD+SMP), umum vs (kec+desa/kel)
378
- - PROVINSI: bandingkan sampel SMA vs (jumlah SMA)
379
- """
380
  if df_filtered is None or len(df_filtered) == 0:
381
  return pd.DataFrame()
382
 
@@ -438,13 +450,15 @@ def compute_gap_verification(df_filtered: pd.DataFrame, kew_value: str) -> pd.Da
438
  lambda r: safe_pct(r["Sampel_Umum"], r.get("Pop_Kec_DesaKel", np.nan)), axis=1
439
  )
440
 
441
- # GAP (kekurangan sampel)
442
  merged["Gap_Sekolah"] = merged.apply(
443
- lambda r: max(int(math.ceil(r["Pop_SD_SMP"] - r["Sampel_Sekolah_Total"])) if pd.notna(r["Pop_SD_SMP"]) else 0, 0),
 
444
  axis=1
445
  )
446
  merged["Gap_Umum"] = merged.apply(
447
- lambda r: max(int(math.ceil(r["Pop_Kec_DesaKel"] - r["Sampel_Umum"])) if pd.notna(r["Pop_Kec_DesaKel"]) else 0, 0),
 
448
  axis=1
449
  )
450
 
@@ -467,7 +481,6 @@ def compute_gap_verification(df_filtered: pd.DataFrame, kew_value: str) -> pd.Da
467
  if ("PROV" in kew_norm):
468
  if meta_sma_df is None:
469
  return pd.DataFrame({"Info": ["Meta SMA tidak tersedia."]})
470
-
471
  if prov_col_glob is None:
472
  return pd.DataFrame({"Info": ["Kolom provinsi tidak ditemukan di DM."]})
473
 
@@ -478,29 +491,32 @@ def compute_gap_verification(df_filtered: pd.DataFrame, kew_value: str) -> pd.Da
478
 
479
  tmp["prov_key"] = tmp[prov_col_glob].apply(norm_prov_label)
480
 
 
481
  g_total = tmp.groupby("prov_key").size().rename("Sampel_Total").reset_index()
 
482
  tmp_sek = tmp[tmp["_dataset"] == "sekolah"].copy() if "_dataset" in tmp.columns else tmp.copy()
483
  g_sma = tmp_sek.groupby("prov_key").size().rename("Sampel_SMA").reset_index()
484
 
485
  merged = (
486
- meta_sma_df.merge(g_total, on="prov_key", how="left")
487
- .merge(g_sma, on="prov_key", how="left")
 
488
  )
489
 
490
- merged["Sampel_Total"] = merged["Sampel_Total"].fillna(0).astype(int)
491
- merged["Sampel_SMA"] = merged["Sampel_SMA"].fillna(0).astype(int)
492
 
493
  merged["Coverage_SMA_%"] = merged.apply(
494
  lambda r: safe_pct(r["Sampel_SMA"], r.get("Jml_SMA", np.nan)), axis=1
495
  )
496
  merged["Kekurangan Sampel SMA"] = merged.apply(
497
- lambda r: max(int(math.ceil(r["Jml_SMA"] - r["Sampel_SMA"])) if pd.notna(r["Jml_SMA"]) else 0, 0),
 
498
  axis=1
499
  )
500
 
501
  out = pd.DataFrame({
502
- "Provinsi": merged["Provinsi_Label"],
503
- "Sampel Total (Prov)": merged["Sampel_Total"],
504
  "Sampel SMA (di DM)": merged["Sampel_SMA"],
505
  "Populasi SMA (Meta)": merged["Jml_SMA"],
506
  "Coverage SMA (%)": merged["Coverage_SMA_%"],
@@ -513,7 +529,7 @@ def compute_gap_verification(df_filtered: pd.DataFrame, kew_value: str) -> pd.Da
513
 
514
 
515
  # ============================================================
516
- # 6) BUILD CONTEXT UNTUK LLM + FALLBACK
517
  # ============================================================
518
  def build_context_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: str) -> str:
519
  wilayah = kab if kab and kab != "(Semua)" else (prov if prov and prov != "(Semua)" else "NASIONAL")
@@ -522,36 +538,30 @@ def build_context_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: str) ->
522
  lines.append(f"Kewenangan: {kew}")
523
  lines.append(f"Jumlah baris verifikasi: {len(verif_df)}")
524
 
525
- # ringkas total gap
526
  gap_cols = [c for c in verif_df.columns if "Kekurangan" in c]
527
  for gc in gap_cols:
528
- try:
529
- total_gap = float(pd.to_numeric(verif_df[gc], errors="coerce").fillna(0).sum())
530
- lines.append(f"Total {gc}: {int(total_gap)}")
531
- except Exception:
532
- pass
533
 
534
- # top 10 terbesar
535
  if gap_cols:
536
  gc = gap_cols[0]
537
- try:
538
- t = verif_df.copy()
539
- t[gc] = pd.to_numeric(t[gc], errors="coerce").fillna(0)
540
- top = t.sort_values(gc, ascending=False).head(10)
541
- keycol = "Kab/Kota" if "Kab/Kota" in top.columns else ("Provinsi" if "Provinsi" in top.columns else top.columns[0])
542
- lines.append("\nTop prioritas (gap terbesar):")
543
- for _, r in top.iterrows():
544
- lines.append(f"- {r[keycol]}: {gc}={int(r[gc])}")
545
- except Exception:
546
- pass
547
 
548
  return "\n".join(lines)
549
 
550
  def rule_based_gap_report(verif_df: pd.DataFrame, prov: str, kab: str, kew: str) -> str:
551
  if verif_df is None or verif_df.empty:
552
  return "Tidak ada data verifikasi yang dapat dilaporkan."
553
- wilayah = kab if kab and kab != "(Semua)" else (prov if prov and prov != "(Semua)" else "NASIONAL")
554
 
 
555
  lines = []
556
  lines.append("## Ringkasan Kekurangan Sampel IPLM (Rule-based)\n")
557
  lines.append(f"Wilayah: {wilayah}")
@@ -560,7 +570,7 @@ def rule_based_gap_report(verif_df: pd.DataFrame, prov: str, kab: str, kew: str)
560
 
561
  gap_cols = [c for c in verif_df.columns if "Kekurangan" in c]
562
  if not gap_cols:
563
- lines.append("Kolom kekurangan sampel tidak ditemukan pada tabel verifikasi.")
564
  return "\n".join(lines)
565
 
566
  for gc in gap_cols:
@@ -568,9 +578,8 @@ def rule_based_gap_report(verif_df: pd.DataFrame, prov: str, kab: str, kew: str)
568
  lines.append(f"- Total {gc}: **{total_gap}** unit yang perlu dilengkapi.")
569
 
570
  lines.append(
571
- "\nRekomendasi operasional: fokuskan pengumpulan data pada unit/wilayah dengan gap terbesar, "
572
- "mulai dari area yang memiliki populasi target besar namun sampel masuk masih terbatas. "
573
- "Pastikan konsistensi penamaan provinsi/kab-kota agar matching dengan meta tidak gagal."
574
  )
575
  return "\n".join(lines)
576
 
@@ -606,8 +615,10 @@ BATASAN:
606
  try:
607
  resp = client.chat_completion(
608
  model=LLM_MODEL_NAME,
609
- messages=[{"role": "system", "content": system_prompt},
610
- {"role": "user", "content": user_prompt}],
 
 
611
  max_tokens=900,
612
  temperature=0.2,
613
  top_p=0.9,
@@ -629,13 +640,12 @@ BATASAN:
629
  # ============================================================
630
  def generate_word_report_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: str, analysis_text: str):
631
  wilayah = kab if kab and kab != "(Semua)" else (prov if prov and prov != "(Semua)" else "NASIONAL")
 
632
  doc = Document()
633
  doc.add_heading(f"Laporan Kekurangan Sampel IPLM – {wilayah}", level=1)
634
-
635
  doc.add_paragraph(f"Kewenangan: {kew}")
636
  doc.add_paragraph(f"Jumlah unit analisis: {len(verif_df)}")
637
 
638
- # tabel verifikasi (batasi 200 baris biar gak jebol)
639
  doc.add_heading("Tabel Verifikasi Coverage & Kekurangan Sampel", level=2)
640
  view = verif_df.copy()
641
  if len(view) > 200:
@@ -652,13 +662,12 @@ def generate_word_report_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: s
652
  for i, c in enumerate(view.columns):
653
  r[i].text = str(row[c])
654
 
655
- # pie chart opsional: hanya 1 ringkasan total (bukan per kab/prov biar gak kebanyakan)
656
  doc.add_heading("Ringkasan Visual (Opsional)", level=2)
657
  if not HAS_KALEIDO:
658
  doc.add_paragraph("Grafik pie tidak dibuat karena 'kaleido' tidak tersedia di server.")
659
  else:
660
- # cari kolom pop & sampel yang paling relevan (ambil pertama yang cocok)
661
  pie_made = False
 
662
  if "Sampel Sekolah (Total)" in verif_df.columns and "Populasi Sekolah (SD+SMP)" in verif_df.columns:
663
  samp = pd.to_numeric(verif_df["Sampel Sekolah (Total)"], errors="coerce").fillna(0).sum()
664
  pop = pd.to_numeric(verif_df["Populasi Sekolah (SD+SMP)"], errors="coerce").fillna(0).sum()
@@ -667,6 +676,7 @@ def generate_word_report_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: s
667
  doc.add_picture(img, width=Inches(5))
668
  pie_made = True
669
 
 
670
  if (not pie_made) and ("Sampel SMA (di DM)" in verif_df.columns and "Populasi SMA (Meta)" in verif_df.columns):
671
  samp = pd.to_numeric(verif_df["Sampel SMA (di DM)"], errors="coerce").fillna(0).sum()
672
  pop = pd.to_numeric(verif_df["Populasi SMA (Meta)"], errors="coerce").fillna(0).sum()
@@ -694,7 +704,12 @@ def generate_word_report_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: s
694
  def run_core(prov_value, kab_value, kew_value):
695
  if df_all_raw is None or df_all_raw.empty:
696
  empty = pd.DataFrame()
697
- return empty, empty, None, None, None, "Data DM tidak terbaca.", "Tidak ada analisis."
 
 
 
 
 
698
 
699
  df = df_all_raw.copy()
700
 
@@ -712,35 +727,52 @@ def run_core(prov_value, kab_value, kew_value):
712
 
713
  if len(df) == 0:
714
  empty = pd.DataFrame()
715
- return empty, empty, None, None, None, "Tidak ada data untuk filter tersebut.", "Tidak ada analisis."
 
 
 
 
 
716
 
717
  # hitung verifikasi gap
718
  verif_df = compute_gap_verification(df, kew_value)
719
 
720
- # buat detail subset untuk download (ringkas)
721
  cols = []
722
  for c in [prov_col_glob, kab_col_glob, nama_col_glob, kew_col_glob, jenis_col_glob, subjenis_col_glob, "_dataset", "KEW_NORM"]:
723
  if c and c in df.columns and c not in cols:
724
  cols.append(c)
725
  detail_df = df[cols].copy() if cols else df.copy()
726
 
727
- # simpan excel
728
  tmpdir = tempfile.mkdtemp()
729
- out_excel = os.path.join(tmpdir, "Kekurangan_Sampel_IPLM.xlsx")
 
730
 
731
- with pd.ExcelWriter(out_excel, engine="openpyxl") as w:
 
732
  verif_df.to_excel(w, sheet_name="Verifikasi_Gap", index=False)
733
  detail_df.to_excel(w, sheet_name="Detail_Subset_DM", index=False)
734
 
735
- # analisis LLM
 
 
 
736
  analysis_text = generate_llm_gap_report(verif_df, prov_value, kab_value, kew_value)
737
 
738
- # word report
739
- out_word = generate_word_report_gap(verif_df, prov_value, kab_value, kew_value, analysis_text)
740
 
741
  msg = f"OK. Subset DM: {len(df)} baris | Verifikasi: {len(verif_df)} baris."
742
- return verif_df, detail_df, out_excel, out_word, None, msg, analysis_text
743
-
 
 
 
 
 
 
 
744
 
745
  def on_prov_change(prov_value):
746
  return gr.update(choices=get_kab_choices_for_prov(prov_value), value="(Semua)")
@@ -787,13 +819,14 @@ Aplikasi ini hanya mengecek **kekurangan sampel** berdasarkan:
787
  analysis_out = gr.Markdown()
788
 
789
  with gr.Row():
790
- excel_out = gr.File(label="Download Rekap Excel (.xlsx)")
791
- word_out = gr.File(label="Download Laporan Word (.docx)")
 
792
 
793
  run_btn.click(
794
  fn=run_core,
795
  inputs=[dd_prov, dd_kab, dd_kew],
796
- outputs=[verif_out, detail_out, excel_out, word_out, gr.State(), msg_out, analysis_out],
797
  )
798
 
799
  demo.launch()
 
1
  # -*- coding: utf-8 -*-
2
  """
3
  app.py — Dashboard Kekurangan Sampel IPLM (TANPA HITUNG INDEKS)
4
+
5
+ Fokus:
6
+ - Mengecek "kekurangan sampel" pengumpulan data IPLM per wilayah
7
+ - Bandingkan sampel yang sudah masuk (DM) vs populasi target (META):
8
+ - Kab/Kota: SD+SMP (meta SD/SMP) dan Kec+Desa/Kel (meta jumlah desa)
9
+ - Provinsi: SMA (meta SMA provinsi)
10
+
11
+ Fitur:
12
+ - Filter: Provinsi, Kab/Kota, Kewenangan
13
+ - Tabel Verifikasi Coverage & Kekurangan Sampel
14
+ - Tabel Detail Subset DM (ringkas)
15
+ - Download:
16
+ 1) Rekap Excel (verifikasi + detail ringkas)
17
+ 2) Data mentah subset DM (RAW) sesuai filter user
18
+ 3) Laporan Word (narasi LLM + tabel verifikasi + pie ringkasan opsional)
19
+
20
+ Catatan:
21
+ - Tidak ada perhitungan Indeks IPLM sama sekali.
22
  """
23
 
24
  import os
 
36
  from docx import Document
37
  from docx.shared import Inches
38
 
39
+ # Pie chart opsional (butuh kaleido)
40
  import plotly.express as px
41
  try:
42
  import kaleido # noqa: F401
 
53
  META_SDSMP_FILE = "SD-SMP-kab.xlsx" # jumlah SD & SMP per kab/kota
54
  META_SMA_FILE = "SMA.xlsx" # jumlah SMA per provinsi
55
 
56
+
57
  # ============================================================
58
  # 1b) KONFIGURASI LLM (Hugging Face Inference)
59
  # ============================================================
 
61
  LLM_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
62
 
63
  HF_TOKEN = (
64
+ os.getenv("HF_TOKEN")
65
  or os.getenv("HUGGINGFACEHUB_API_TOKEN")
66
  or os.getenv("HF_API_TOKEN")
67
  )
 
159
  t = " ".join(t.split())
160
  return re.sub(r"[^A-Z0-9]+", "", t)
161
 
162
def safe_pct(num, den):
    """Return ``num`` as a percentage of ``den``, or NaN when it is undefined.

    Args:
        num: Numerator. ``None`` / NaN / NA is treated as 0 (no samples yet).
        den: Denominator. ``None``, NaN / NA, zero, negative or non-numeric
            values make the percentage undefined.

    Returns:
        float: ``100 * num / den``, or ``np.nan`` when the denominator is
        unusable or the numerator cannot be interpreted as a number.
    """
    def _as_float(value, if_missing):
        # Missing markers map to the caller-chosen default; anything that
        # cannot be parsed as a number becomes NaN instead of raising, so a
        # single dirty cell does not abort a whole row-wise ``apply``.
        if value is None:
            return if_missing
        missing = pd.isna(value)
        if isinstance(missing, (bool, np.bool_)) and missing:
            return if_missing
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    denominator = _as_float(den, np.nan)
    if np.isnan(denominator) or denominator <= 0:
        return np.nan

    numerator = _as_float(num, 0.0)
    return 100.0 * numerator / denominator
168
+
169
def _infer_jenjang_sd_smp(x):
    """Classify a free-text school label as ``"SD"``, ``"SMP"`` or ``"OTHER"``.

    The label is upper-cased and padded with one space on each side so that
    every token test (``SD``, ``SD/``, ``MI``, ``SMP``, ``SMP/``, ``MTS``)
    also matches at the very start or end of the text. SD-level tokens are
    checked first, so a label containing both levels is reported as ``"SD"``.

    Args:
        x: Label value; missing values (``None`` / NaN) yield ``"OTHER"``.

    Returns:
        str: ``"SD"``, ``"SMP"`` or ``"OTHER"``.
    """
    if pd.isna(x):
        return "OTHER"

    # Pad once and test every pattern against the padded text; testing the
    # slash patterns against the unpadded text misses labels such as "SD/MI".
    padded = f" {str(x).upper()} "

    if " SD " in padded or " SD/" in padded or " MI " in padded:
        return "SD"
    if " SMP " in padded or " SMP/" in padded or " MTS " in padded:
        return "SMP"
    return "OTHER"
178
+
179
  def make_pie_plotly(num, den, title):
180
  if not HAS_KALEIDO:
181
  return None
182
+
183
  if den is None or pd.isna(den) or den <= 0:
184
  values = [0, 1]
185
  labels = ["Terjangkau", "Belum Terjangkau"]
 
188
  den = float(den)
189
  values = [max(num, 0), max(den - num, 0)]
190
  labels = ["Terjangkau", "Belum Terjangkau"]
191
+
192
  fig = px.pie(values=values, names=labels, title=title, hole=0.3)
193
  tmp = tempfile.mktemp(suffix=".png")
194
  try:
 
203
  # ============================================================
204
  DATA_INFO = ""
205
  df_all_raw = None
 
 
206
 
207
+ meta_kab_df = None # kab_key -> (Jml_Kecamatan, Jml_DesaKel, Jml_SD, Jml_SMP)
208
+ meta_sma_df = None # prov_key -> Jml_SMA
209
+
210
+ prov_col_glob = None
211
+ kab_col_glob = None
212
+ kew_col_glob = None
213
+ jenis_col_glob = None
214
+ subjenis_col_glob = None
215
+ nama_col_glob = None
216
 
217
  try:
218
  fp = Path(DATA_FILE)
 
318
  # --- META SMA per provinsi ---
319
  try:
320
  meta_sma_raw = pd.read_excel(META_SMA_FILE)
 
321
  col_prov_sma = pick_col(meta_sma_raw, [
322
  "Provinsi", "provinsi", "PROVINSI", "NAMA_PROVINSI", "Nama Provinsi",
323
  "nm_prov", "nm_provinsi", "prov"
 
388
  # ============================================================
389
  # 5) INTI: HITUNG COVERAGE & GAP
390
  # ============================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  def compute_gap_verification(df_filtered: pd.DataFrame, kew_value: str) -> pd.DataFrame:
 
 
 
 
 
392
  if df_filtered is None or len(df_filtered) == 0:
393
  return pd.DataFrame()
394
 
 
450
  lambda r: safe_pct(r["Sampel_Umum"], r.get("Pop_Kec_DesaKel", np.nan)), axis=1
451
  )
452
 
453
+ # GAP (kekurangan sampel) -> asumsi target = 100% populasi
454
  merged["Gap_Sekolah"] = merged.apply(
455
+ lambda r: max(int(math.ceil(r["Pop_SD_SMP"] - r["Sampel_Sekolah_Total"]))
456
+ if pd.notna(r["Pop_SD_SMP"]) else 0, 0),
457
  axis=1
458
  )
459
  merged["Gap_Umum"] = merged.apply(
460
+ lambda r: max(int(math.ceil(r["Pop_Kec_DesaKel"] - r["Sampel_Umum"]))
461
+ if pd.notna(r["Pop_Kec_DesaKel"]) else 0, 0),
462
  axis=1
463
  )
464
 
 
481
  if ("PROV" in kew_norm):
482
  if meta_sma_df is None:
483
  return pd.DataFrame({"Info": ["Meta SMA tidak tersedia."]})
 
484
  if prov_col_glob is None:
485
  return pd.DataFrame({"Info": ["Kolom provinsi tidak ditemukan di DM."]})
486
 
 
491
 
492
  tmp["prov_key"] = tmp[prov_col_glob].apply(norm_prov_label)
493
 
494
+ # IMPORTANT: start dari sampel (biar tidak munculin provinsi lain dari meta)
495
  g_total = tmp.groupby("prov_key").size().rename("Sampel_Total").reset_index()
496
+
497
  tmp_sek = tmp[tmp["_dataset"] == "sekolah"].copy() if "_dataset" in tmp.columns else tmp.copy()
498
  g_sma = tmp_sek.groupby("prov_key").size().rename("Sampel_SMA").reset_index()
499
 
500
  merged = (
501
+ g_total
502
+ .merge(g_sma, on="prov_key", how="left")
503
+ .merge(meta_sma_df[["prov_key", "Provinsi_Label", "Jml_SMA"]], on="prov_key", how="left")
504
  )
505
 
506
+ merged["Sampel_SMA"] = merged["Sampel_SMA"].fillna(0).astype(int)
 
507
 
508
  merged["Coverage_SMA_%"] = merged.apply(
509
  lambda r: safe_pct(r["Sampel_SMA"], r.get("Jml_SMA", np.nan)), axis=1
510
  )
511
  merged["Kekurangan Sampel SMA"] = merged.apply(
512
+ lambda r: max(int(math.ceil(r["Jml_SMA"] - r["Sampel_SMA"]))
513
+ if pd.notna(r["Jml_SMA"]) else 0, 0),
514
  axis=1
515
  )
516
 
517
  out = pd.DataFrame({
518
+ "Provinsi": merged["Provinsi_Label"].fillna(merged["prov_key"]),
519
+ "Sampel Total (Prov)": merged["Sampel_Total"].fillna(0).astype(int),
520
  "Sampel SMA (di DM)": merged["Sampel_SMA"],
521
  "Populasi SMA (Meta)": merged["Jml_SMA"],
522
  "Coverage SMA (%)": merged["Coverage_SMA_%"],
 
529
 
530
 
531
  # ============================================================
532
+ # 6) LLM REPORT (GAP)
533
  # ============================================================
534
  def build_context_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: str) -> str:
535
  wilayah = kab if kab and kab != "(Semua)" else (prov if prov and prov != "(Semua)" else "NASIONAL")
 
538
  lines.append(f"Kewenangan: {kew}")
539
  lines.append(f"Jumlah baris verifikasi: {len(verif_df)}")
540
 
 
541
  gap_cols = [c for c in verif_df.columns if "Kekurangan" in c]
542
  for gc in gap_cols:
543
+ total_gap = int(pd.to_numeric(verif_df[gc], errors="coerce").fillna(0).sum())
544
+ lines.append(f"Total {gc}: {total_gap}")
 
 
 
545
 
546
+ # top prioritas (ambil kolom gap pertama)
547
  if gap_cols:
548
  gc = gap_cols[0]
549
+ t = verif_df.copy()
550
+ t[gc] = pd.to_numeric(t[gc], errors="coerce").fillna(0)
551
+ keycol = "Kab/Kota" if "Kab/Kota" in t.columns else ("Provinsi" if "Provinsi" in t.columns else t.columns[0])
552
+ top = t.sort_values(gc, ascending=False).head(10)
553
+
554
+ lines.append("\nTop prioritas (gap terbesar):")
555
+ for _, r in top.iterrows():
556
+ lines.append(f"- {r[keycol]}: {gc}={int(r[gc])}")
 
 
557
 
558
  return "\n".join(lines)
559
 
560
  def rule_based_gap_report(verif_df: pd.DataFrame, prov: str, kab: str, kew: str) -> str:
561
  if verif_df is None or verif_df.empty:
562
  return "Tidak ada data verifikasi yang dapat dilaporkan."
 
563
 
564
+ wilayah = kab if kab and kab != "(Semua)" else (prov if prov and prov != "(Semua)" else "NASIONAL")
565
  lines = []
566
  lines.append("## Ringkasan Kekurangan Sampel IPLM (Rule-based)\n")
567
  lines.append(f"Wilayah: {wilayah}")
 
570
 
571
  gap_cols = [c for c in verif_df.columns if "Kekurangan" in c]
572
  if not gap_cols:
573
+ lines.append("Kolom kekurangan sampel tidak ditemukan.")
574
  return "\n".join(lines)
575
 
576
  for gc in gap_cols:
 
578
  lines.append(f"- Total {gc}: **{total_gap}** unit yang perlu dilengkapi.")
579
 
580
  lines.append(
581
+ "\nRekomendasi operasional: prioritaskan pengumpulan data pada wilayah dengan gap terbesar, "
582
+ "dan pastikan konsistensi penamaan provinsi/kab-kota agar pencocokan dengan meta tidak gagal."
 
583
  )
584
  return "\n".join(lines)
585
 
 
615
  try:
616
  resp = client.chat_completion(
617
  model=LLM_MODEL_NAME,
618
+ messages=[
619
+ {"role": "system", "content": system_prompt},
620
+ {"role": "user", "content": user_prompt},
621
+ ],
622
  max_tokens=900,
623
  temperature=0.2,
624
  top_p=0.9,
 
640
  # ============================================================
641
  def generate_word_report_gap(verif_df: pd.DataFrame, prov: str, kab: str, kew: str, analysis_text: str):
642
  wilayah = kab if kab and kab != "(Semua)" else (prov if prov and prov != "(Semua)" else "NASIONAL")
643
+
644
  doc = Document()
645
  doc.add_heading(f"Laporan Kekurangan Sampel IPLM – {wilayah}", level=1)
 
646
  doc.add_paragraph(f"Kewenangan: {kew}")
647
  doc.add_paragraph(f"Jumlah unit analisis: {len(verif_df)}")
648
 
 
649
  doc.add_heading("Tabel Verifikasi Coverage & Kekurangan Sampel", level=2)
650
  view = verif_df.copy()
651
  if len(view) > 200:
 
662
  for i, c in enumerate(view.columns):
663
  r[i].text = str(row[c])
664
 
 
665
  doc.add_heading("Ringkasan Visual (Opsional)", level=2)
666
  if not HAS_KALEIDO:
667
  doc.add_paragraph("Grafik pie tidak dibuat karena 'kaleido' tidak tersedia di server.")
668
  else:
 
669
  pie_made = False
670
+ # Ringkas sekolah kab/kota
671
  if "Sampel Sekolah (Total)" in verif_df.columns and "Populasi Sekolah (SD+SMP)" in verif_df.columns:
672
  samp = pd.to_numeric(verif_df["Sampel Sekolah (Total)"], errors="coerce").fillna(0).sum()
673
  pop = pd.to_numeric(verif_df["Populasi Sekolah (SD+SMP)"], errors="coerce").fillna(0).sum()
 
676
  doc.add_picture(img, width=Inches(5))
677
  pie_made = True
678
 
679
+ # Ringkas SMA provinsi
680
  if (not pie_made) and ("Sampel SMA (di DM)" in verif_df.columns and "Populasi SMA (Meta)" in verif_df.columns):
681
  samp = pd.to_numeric(verif_df["Sampel SMA (di DM)"], errors="coerce").fillna(0).sum()
682
  pop = pd.to_numeric(verif_df["Populasi SMA (Meta)"], errors="coerce").fillna(0).sum()
 
704
  def run_core(prov_value, kab_value, kew_value):
705
  if df_all_raw is None or df_all_raw.empty:
706
  empty = pd.DataFrame()
707
+ return (
708
+ empty, empty,
709
+ None, None, None,
710
+ "Data DM tidak terbaca.",
711
+ "Tidak ada analisis."
712
+ )
713
 
714
  df = df_all_raw.copy()
715
 
 
727
 
728
  if len(df) == 0:
729
  empty = pd.DataFrame()
730
+ return (
731
+ empty, empty,
732
+ None, None, None,
733
+ "Tidak ada data untuk kombinasi filter yang dipilih.",
734
+ "Tidak ada analisis."
735
+ )
736
 
737
  # hitung verifikasi gap
738
  verif_df = compute_gap_verification(df, kew_value)
739
 
740
+ # detail subset untuk UI (ringkas)
741
  cols = []
742
  for c in [prov_col_glob, kab_col_glob, nama_col_glob, kew_col_glob, jenis_col_glob, subjenis_col_glob, "_dataset", "KEW_NORM"]:
743
  if c and c in df.columns and c not in cols:
744
  cols.append(c)
745
  detail_df = df[cols].copy() if cols else df.copy()
746
 
747
+ # simpan file download
748
  tmpdir = tempfile.mkdtemp()
749
+ rekap_excel_path = os.path.join(tmpdir, "Rekap_Kekurangan_Sampel_IPLM.xlsx")
750
+ raw_dm_path = os.path.join(tmpdir, "DM_Subset_Raw.xlsx")
751
 
752
+ # 1) rekap excel (verif + detail ringkas)
753
+ with pd.ExcelWriter(rekap_excel_path, engine="openpyxl") as w:
754
  verif_df.to_excel(w, sheet_name="Verifikasi_Gap", index=False)
755
  detail_df.to_excel(w, sheet_name="Detail_Subset_DM", index=False)
756
 
757
+ # 2) raw dm subset (SEMUA kolom DM hasil filter user)
758
+ df.to_excel(raw_dm_path, index=False)
759
+
760
+ # 3) analisis LLM
761
  analysis_text = generate_llm_gap_report(verif_df, prov_value, kab_value, kew_value)
762
 
763
+ # 4) word report
764
+ word_path = generate_word_report_gap(verif_df, prov_value, kab_value, kew_value, analysis_text)
765
 
766
  msg = f"OK. Subset DM: {len(df)} baris | Verifikasi: {len(verif_df)} baris."
767
+ return (
768
+ verif_df,
769
+ detail_df,
770
+ rekap_excel_path,
771
+ raw_dm_path,
772
+ word_path,
773
+ msg,
774
+ analysis_text
775
+ )
776
 
777
  def on_prov_change(prov_value):
778
  return gr.update(choices=get_kab_choices_for_prov(prov_value), value="(Semua)")
 
819
  analysis_out = gr.Markdown()
820
 
821
  with gr.Row():
822
+ rekap_excel_out = gr.File(label="Download Rekap (Verifikasi + Detail) (.xlsx)")
823
+ raw_dm_out = gr.File(label="Download Data Mentah Subset DM (.xlsx)")
824
+ word_out = gr.File(label="Download Laporan Word (.docx)")
825
 
826
  run_btn.click(
827
  fn=run_core,
828
  inputs=[dd_prov, dd_kab, dd_kew],
829
+ outputs=[verif_out, detail_out, rekap_excel_out, raw_dm_out, word_out, msg_out, analysis_out],
830
  )
831
 
832
  demo.launch()