Spaces:
Sleeping
Sleeping
File size: 11,044 Bytes
4edbd5c 50c7bda 579547b 4edbd5c 579547b 4edbd5c 50c7bda 4edbd5c 50c7bda 4edbd5c 9a43d06 4edbd5c 50c7bda |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 |
# app.py β RAG luwes untuk IPLM
import os, re, json, hashlib
from pathlib import Path
import gradio as gr
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
# ========== Configuration ==========
# All settings are overridable via environment variables.
DATA_PATH = Path(os.getenv("DATA_PATH", "IPLM_QnA_Chatbot.jsonl"))  # JSONL knowledge base
EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")  # sentence-embedding model id
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/gemma-2b-it") # local & lightweight
TOP_K = int(os.getenv("TOP_K", "5"))  # number of retrieval hits per query
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.4"))  # LLM sampling temperature
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "320"))  # max new tokens for LLM generation
THRESHOLD = float(os.getenv("THRESHOLD", "0.62")) # similarity cutoff; raised slightly to be more trustworthy
# ========== Prompt (more natural phrasing) ==========
# System instructions sent verbatim to the local LLM (kept in Indonesian,
# since it is user-facing model input): answer only from the supplied
# context, in natural Indonesian, with a fixed output format and an explicit
# "data not available" fallback sentence that answer_query also checks for.
SYSTEM_PROMPT = """
Kamu adalah asisten pustakawan Perpustakaan Nasional RI untuk topik IPLM (Indeks Pembangunan Literasi Masyarakat).
Tugasmu:
- Jawab hanya berdasarkan KONTEKS yang diberikan (jangan menambah fakta baru).
- Tulis dengan bahasa Indonesia yang alami, ramah, dan mudah dipahami publik.
- Jelaskan dengan contoh singkat bila membantu.
- Jika konteks tidak cukup, katakan dengan jelas apa yang belum tersedia dan berikan langkah/arah yang bisa dilakukan.
Format jawaban:
1) Paragraf inti (1β3 kalimat) sesuai gaya diminta pengguna.
2) Jika perlu, tambahkan poin-poin ringkas (maks 4 bullet) untuk memudahkan.
3) Jika benar-benar tidak ada datanya di konteks, tulis: "Maaf, datanya belum tersedia di dasar informasi kami."
"""
# ========== Utilities ==========
def norm(s):
    """Coerce *s* to a string, trim it, and collapse runs of whitespace to one space.

    None (or any falsy value) becomes "".
    """
    text = str(s or "").strip()
    return re.sub(r"\s+", " ", text)
def load_jsonl_with_variants(path: Path):
    """
    Load QA items from a JSONL file.

    Supported schema per line:
    - {"question": "...", "answer": "...", "q_variants": [...], "followups": [...], "source": "..."}
    Optional keys: q_variants, followups, source; "q"/"a" are accepted as
    aliases for "question"/"answer". If q_variants is absent, only the
    canonical question is indexed.

    Robustness: blank lines, lines that are not valid JSON, and records
    missing question or answer are skipped — one bad line must not abort
    loading the whole knowledge base.
    """
    items = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed lines instead of crashing the app at startup.
                continue
            q = obj.get("question") or obj.get("q")
            a = obj.get("answer") or obj.get("a")
            if not (q and a):
                continue
            qv = obj.get("q_variants") or []
            if not isinstance(qv, list):
                # A single string variant is tolerated and wrapped in a list.
                qv = [qv]
            variants = [norm(q)] + [norm(x) for x in qv if x]
            followups = obj.get("followups") or []
            if not isinstance(followups, list):
                followups = []
            items.append({
                "question": norm(q),
                "answer": norm(a),
                "q_variants": variants,
                "followups": followups,
                "source": norm(obj.get("source") or "")
            })
    return items
# ========== Indexer/Retriever ==========
class FAQIndex:
    """Embedding-based retriever over QA rows.

    Each row may expose several phrasings ("q_variants"); every variant is
    embedded separately and mapped back to its owning row at query time.
    """

    def __init__(self, emb_model: str):
        self.model_name = emb_model
        self.model = None   # SentenceTransformer, loaded lazily in build()
        self.rows = []      # one entry per QA pair
        self.flat_q = []    # every query variant, flattened across rows
        self.parent = []    # flat_q position -> index of the owning row
        self.nn = None      # fitted NearestNeighbors index
        self.emb = None     # embedding matrix aligned with flat_q

    def build(self, rows):
        """Embed all query variants of *rows* and fit the cosine NN index."""
        self.rows = rows
        self.model = SentenceTransformer(self.model_name)
        pairs = [
            (variant, row_i)
            for row_i, row in enumerate(rows)
            for variant in row["q_variants"]
        ]
        self.flat_q = [variant for variant, _ in pairs]
        self.parent = [row_i for _, row_i in pairs]
        self.emb = self.model.encode(
            self.flat_q,
            normalize_embeddings=True,
            convert_to_numpy=True,
            show_progress_bar=False
        )
        self.nn = NearestNeighbors(
            n_neighbors=min(15, len(self.flat_q)), metric="cosine"
        ).fit(self.emb)

    def retrieve(self, query: str, top_k=TOP_K):
        """Return up to *top_k* hits, deduplicated by canonical question and
        sorted by similarity score (descending). Empty index -> []."""
        if not self.flat_q:
            return []
        query_vec = self.model.encode(
            [query], normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False
        )
        dist, indices = self.nn.kneighbors(query_vec, n_neighbors=min(top_k, len(self.flat_q)))
        candidates = []
        for flat_i, distance in zip(indices[0], dist[0]):
            row = self.rows[self.parent[int(flat_i)]]
            candidates.append({
                "match_q": self.flat_q[int(flat_i)],
                # cosine distance -> similarity
                "score": float(1.0 - distance),
                "question": row["question"],
                "answer": row["answer"],
                "followups": row.get("followups") or [],
                "source": row.get("source") or ""
            })
        # Several variants can hit the same row: keep only the best score
        # per canonical question.
        best = {}
        for cand in candidates:
            key = cand["question"]
            if key not in best or cand["score"] > best[key]["score"]:
                best[key] = cand
        return sorted(best.values(), key=lambda c: -c["score"])[:top_k]
# ========== Local LLM (opsional rephrasing/merging) ==========
_local_pipe = None  # lazily-initialised HF text-generation pipeline (process-wide singleton)
def call_local_llm(prompt: str):
    """
    Generate text from *prompt* with a local Hugging Face model.

    The model is loaded on first use and cached in the module-level
    ``_local_pipe`` singleton. The returned text typically echoes the
    prompt (the caller trims it with safe_cut()).

    Best-effort by design: if transformers/torch or the model itself are
    unavailable, the function returns the prompt tagged with
    "[LLM unavailable]" so the caller's marker-cut yields an empty answer
    and its rule-based fallback kicks in.
    """
    global _local_pipe
    try:
        # Imported lazily so the app still starts where these are absent.
        from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
        import torch
        if _local_pipe is None:
            tok = AutoTokenizer.from_pretrained(LOCAL_MODEL)
            mdl = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL, torch_dtype=torch.float32)
            _local_pipe = pipeline("text-generation", model=mdl, tokenizer=tok, device=-1)  # CPU
        out = _local_pipe(
            prompt,
            max_new_tokens=MAX_TOKENS,
            do_sample=True,
            temperature=TEMPERATURE,
            pad_token_id=_local_pipe.tokenizer.eos_token_id
        )
        return out[0]["generated_text"]
    except Exception:
        # Deliberate broad catch: any failure (import, download, OOM, ...)
        # degrades to the tagged prompt; the caller handles it.
        return f"[LLM unavailable] {prompt}"
# ========== Orchestration ==========
# Maps the answer-style names shown in the UI radio button to a one-line
# Indonesian style instruction injected into the LLM prompt. The same keys
# drive the rule-based polishing in answer_query.
STYLE_GUIDE = {
    "Formal": "Nada formal, jelas, dan bernuansa kebijakan publik.",
    "Santai": "Nada bersahabat dan ringan, hindari jargon teknis.",
    "Ringkas": "Jawaban sangat singkat (1β2 kalimat) namun informatif.",
    "Naratif": "Gaya bercerita singkat agar mudah dibayangkan."
}
def craft_prompt(context_bullets, question, style):
    """Assemble the full LLM prompt: system rules, style rule, context
    bullets, and the user's question, ending with the generation marker
    that safe_cut() later uses to strip the echoed prompt."""
    style_rule = STYLE_GUIDE.get(style, STYLE_GUIDE["Formal"])
    bullet_lines = [f"- {c}" for c in context_bullets if c.strip()]
    ctx = "\n".join(bullet_lines)
    return (
        f"{SYSTEM_PROMPT}\n"
        f"GAYA JAWABAN: {style_rule}\n"
        "KONTEKS:\n"
        f"{ctx}\n"
        "PERTANYAAN PENGGUNA:\n"
        f"{question}\n"
        "TULIS JAWABAN SEKARANG:\n"
    )
def merge_context(hits):
    """Return the answers of up to the five best hits as context bullets.

    Note: the upper bound is fixed at 5 regardless of TOP_K; fewer hits
    simply yield fewer bullets.
    """
    return [h["answer"] for h in hits[:5]]
def safe_cut(text, marker="TULIS JAWABAN SEKARANG:"):
    """If the generator echoed the prompt, keep only the part after the
    first occurrence of *marker*; otherwise return *text* trimmed."""
    _, found, after = text.partition(marker)
    return (after if found else text).strip()
def render_followups(hits, max_items=4):
    """Collect up to *max_items* unique, normalised follow-up suggestions,
    walking hits in rank order so the best hits' suggestions come first."""
    seen = set()
    out = []
    candidates = (f for h in hits for f in (h.get("followups") or []))
    for cand in candidates:
        cand = norm(cand)
        if not cand or cand in seen:
            continue
        seen.add(cand)
        out.append(cand)
        if len(out) >= max_items:
            break
    return out
# ========== Build index ==========
# Module-level side effect: load the JSONL knowledge base and embed it once
# at import/startup time. Raises if DATA_PATH is missing — intentional, the
# app is useless without its data.
faq = FAQIndex(EMB_MODEL)
faq.build(load_jsonl_with_variants(DATA_PATH))
# ========== Gradio Callback ==========
def _polish_strong_answer(base, style):
    # Light rule-based rephrasing of a high-confidence answer (no LLM).
    # Guard on `base` fixes an IndexError the old code had when the stored
    # answer was empty and style was "Santai" (base[0] on "").
    if style == "Santai" and base:
        return f"Singkatnya, {base[0].lower()}{base[1:]}"
    if style == "Naratif":
        return f"Bayangkan kita menilai literasi di daerah. {base}"
    # "Ringkas", "Formal" and anything else: use the stored answer verbatim.
    return base
def _followups_suffix(hits):
    # "Coba juga" block built from follow-up suggestions, or "" if none.
    fups = render_followups(hits)
    if not fups:
        return ""
    return "\n\nCoba juga:\n" + "\n".join([f"- {x}" for x in fups])
def answer_query(msg, chat_history, style, show_sources):
    """Main chat callback.

    Args:
        msg: the user's message (normalised here).
        chat_history: Gradio chat history (unused, required by the interface).
        style: one of STYLE_GUIDE's keys.
        show_sources: whether to append matched-question / source metadata.

    Returns the answer string. High-confidence hits (score >= THRESHOLD) are
    answered directly with light rule-based polishing; otherwise the top
    answers are merged into a context and the local LLM rephrases them.
    """
    msg = norm(msg)
    if not msg:
        return "Silakan tulis pertanyaan tentang IPLM."
    hits = faq.retrieve(msg, TOP_K)
    if not hits:
        return "Maaf, datanya belum tersedia di dasar informasi kami."
    # Strong match: use its answer directly, lightly polished.
    top = hits[0]
    if top["score"] >= THRESHOLD:
        final = _polish_strong_answer(top["answer"], style)
        if show_sources:
            meta = f"\n\nβ Cocokkan dengan: β{top['question']}β β’ keyakinan ~{top['score']:.2f}"
            if top.get("source"):
                meta += f" β’ sumber: {top['source']}"
            final += meta
        return final + _followups_suffix(hits)
    # Score not convincing: merge context and let the LLM phrase the answer.
    ctx = merge_context(hits)
    raw = call_local_llm(craft_prompt(ctx, msg, style))
    ans = safe_cut(raw)
    # Guard against an empty or "not available"-style LLM reply; parentheses
    # make the (previously implicit) precedence of the old check explicit.
    if not ans or ("Maaf" in ans and "tidak" in ans and "tersedia" in ans):
        # Simple fallback: first context bullet, or the canned apology.
        ans = ctx[0] if ctx else "Maaf, datanya belum tersedia di dasar informasi kami."
    if show_sources:
        src_lines = []
        for h in hits[:3]:
            s = f'β’ β{h["question"]}β (keyakinan ~{h["score"]:.2f})'
            if h.get("source"):
                s += f' β sumber: {h["source"]}'
            src_lines.append(s)
        if src_lines:
            ans += "\n\nRujukan terdekat:\n" + "\n".join(src_lines)
    return ans + _followups_suffix(hits)
# ========== UI ==========
with gr.Blocks(title="π IPLM Chatbot (luwes)") as demo:
    gr.Markdown("## π IPLM Chatbot\nTanya apa saja tentang IPLM. Jawaban berbasis data JSONL, disajikan dengan bahasa yang lebih luwes.")
    with gr.Row():
        style = gr.Radio(choices=list(STYLE_GUIDE.keys()), value="Formal", label="Gaya jawaban")
        show_sources = gr.Checkbox(value=True, label="Tampilkan rujukan terdekat")
    # BUG FIX: the old code wrapped answer_query in a lambda reading
    # style.value / show_sources.value. Component .value is the STATIC
    # initial value, so user selections were silently ignored. Passing the
    # components via additional_inputs makes Gradio feed their live values
    # into answer_query(msg, history, style, show_sources) on every call.
    chat = gr.ChatInterface(
        fn=answer_query,
        additional_inputs=[style, show_sources],
        title="IPLM Chatbot",
        description="Jawaban hanya berdasarkan data JSONL, namun ditulis dengan gaya bahasa yang lebih natural.",
        examples=[
            "Sederhananya, apa itu IPLM?",
            "Gimana cara hitung nilai IPLM biar jadi angka 0β100?",
            "Bedanya dimensi kepatuhan sama kinerja apa ya?",
            "Kalau anggaran BOS, yang dihitung bagian mana?",
            "Siapa yang ngumpulin data di daerah dan gimana verifikasinya?"
        ],
        cache_examples=False
    )
if __name__ == "__main__":
    demo.launch()
|