# app.py - a flexible RAG chatbot for IPLM
import os, re, json
from pathlib import Path
import gradio as gr
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer

# ========== Configuration ==========
DATA_PATH = Path(os.getenv("DATA_PATH", "IPLM_QnA_Chatbot.jsonl"))
EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/gemma-2b-it")  # local & lightweight
TOP_K = int(os.getenv("TOP_K", "5"))
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.4"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "320"))
THRESHOLD = float(os.getenv("THRESHOLD", "0.62"))  # raised slightly so direct matches are more trustworthy
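# Every setting above can be overridden through environment variables.
# Shell example (values here are illustrative, not recommended defaults):
#   DATA_PATH=data/iplm.jsonl TOP_K=8 THRESHOLD=0.7 python app.py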
# ========== Prompt (kept in Indonesian to match the bot's audience) ==========
# In short, it instructs the model to answer only from the given context,
# write natural public-friendly Indonesian, and state clearly when the
# context does not cover the question.
SYSTEM_PROMPT = """
Kamu adalah asisten pustakawan Perpustakaan Nasional RI untuk topik IPLM (Indeks Pembangunan Literasi Masyarakat).
Tugasmu:
- Jawab hanya berdasarkan KONTEKS yang diberikan (jangan menambah fakta baru).
- Tulis dengan bahasa Indonesia yang alami, ramah, dan mudah dipahami publik.
- Jelaskan dengan contoh singkat bila membantu.
- Jika konteks tidak cukup, katakan dengan jelas apa yang belum tersedia dan berikan langkah/arah yang bisa dilakukan.
Format jawaban:
1) Paragraf inti (1-3 kalimat) sesuai gaya yang diminta pengguna.
2) Jika perlu, tambahkan poin-poin ringkas (maks 4 bullet) untuk memudahkan.
3) Jika benar-benar tidak ada datanya di konteks, tulis: "Maaf, datanya belum tersedia di dasar informasi kami."
"""
# ========== Utilities ==========
def norm(s):
    # Collapse runs of whitespace and strip; tolerates None.
    return re.sub(r"\s+", " ", str(s or "").strip())

def load_jsonl_with_variants(path: Path):
    """
    Supported schema:
      {"question": "...", "answer": "...", "q_variants": [...], "followups": [...], "source": "..."}
    The fields q_variants, followups, and source are optional.
    If q_variants is absent, only the canonical question is indexed.
    """
    items = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)
            q = obj.get("question") or obj.get("q")
            a = obj.get("answer") or obj.get("a")
            if not (q and a):
                continue
            qv = obj.get("q_variants") or []
            if not isinstance(qv, list):
                qv = [qv]
            variants = [norm(q)] + [norm(x) for x in qv if x]
            followups = obj.get("followups") or []
            if not isinstance(followups, list):
                followups = []
            items.append({
                "question": norm(q),
                "answer": norm(a),
                "q_variants": variants,
                "followups": followups,
                "source": norm(obj.get("source") or "")
            })
    return items
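# One example line in the expected JSONL format (record content is
# hypothetical and only illustrates the schema; the real data ships in
# IPLM_QnA_Chatbot.jsonl):
# {"question": "Apa itu IPLM?",
#  "answer": "IPLM adalah Indeks Pembangunan Literasi Masyarakat ...",
#  "q_variants": ["IPLM itu apa?", "Definisi IPLM"],
#  "followups": ["Bagaimana cara menghitung IPLM?"],
#  "source": "Perpusnas RI"}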
# ========== Indexer/Retriever ==========
class FAQIndex:
    def __init__(self, emb_model: str):
        self.model_name = emb_model
        self.model = None
        self.rows = []    # each row = one QA pair
        self.flat_q = []  # every question variant, flattened
        self.parent = []  # maps each flat_q entry -> index of its parent row
        self.nn = None
        self.emb = None

    def build(self, rows):
        self.rows = rows
        self.model = SentenceTransformer(self.model_name)
        self.flat_q, self.parent = [], []
        for i, r in enumerate(rows):
            for qv in r["q_variants"]:
                self.flat_q.append(qv)
                self.parent.append(i)
        self.emb = self.model.encode(
            self.flat_q,
            normalize_embeddings=True,
            convert_to_numpy=True,
            show_progress_bar=False
        )
        self.nn = NearestNeighbors(
            n_neighbors=min(15, len(self.flat_q)), metric="cosine"
        ).fit(self.emb)

    def retrieve(self, query: str, top_k=TOP_K):
        if not self.flat_q:
            return []
        qv = self.model.encode(
            [query], normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False
        )
        d, idx = self.nn.kneighbors(qv, n_neighbors=min(top_k, len(self.flat_q)))
        # Embeddings are L2-normalized, so cosine distance = 1 - cosine similarity.
        sims = 1.0 - d[0]
        hits = []
        for ix, s in zip(idx[0], sims):
            parent_i = self.parent[int(ix)]
            base = self.rows[parent_i]
            hits.append({
                "match_q": self.flat_q[int(ix)],
                "score": float(s),
                "question": base["question"],
                "answer": base["answer"],
                "followups": base.get("followups") or [],
                "source": base.get("source") or ""
            })
        # Deduplicate by canonical question, keeping the best score per question.
        # Note: several variants can map to one row, so fewer than top_k unique
        # QAs may come back.
        best = {}
        for h in hits:
            key = h["question"]
            if key not in best or h["score"] > best[key]["score"]:
                best[key] = h
        hits_dedup = sorted(best.values(), key=lambda x: -x["score"])[:top_k]
        return hits_dedup
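# Minimal build/retrieve sketch (the toy row below is hypothetical and only
# illustrates the contract; the app builds the real index further down):
#
#   toy = [{"question": "Apa itu IPLM?", "answer": "Indeks literasi masyarakat.",
#           "q_variants": ["Apa itu IPLM?", "IPLM itu apa?"],
#           "followups": [], "source": ""}]
#   ix = FAQIndex(EMB_MODEL)
#   ix.build(toy)
#   ix.retrieve("definisi IPLM", top_k=1)  # -> one hit dict with "score"/"answer"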
# ========== Local LLM (optional rephrasing/merging) ==========
_local_pipe = None

def call_local_llm(prompt: str):
    """
    If the environment has no local model, this function can be disabled in
    favor of the answer templates alone (rule-based rephrasing, no LLM).
    """
    global _local_pipe
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
        import torch
        if _local_pipe is None:
            tok = AutoTokenizer.from_pretrained(LOCAL_MODEL)
            mdl = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL, torch_dtype=torch.float32)
            _local_pipe = pipeline("text-generation", model=mdl, tokenizer=tok, device=-1)
        out = _local_pipe(
            prompt,
            max_new_tokens=MAX_TOKENS,
            do_sample=True,
            temperature=TEMPERATURE,
            pad_token_id=_local_pipe.tokenizer.eos_token_id
        )
        return out[0]["generated_text"]
    except Exception:
        # Fallback: if the LLM fails, return the prompt behind a marker; the
        # caller's safe_cut() then yields an empty answer, which triggers the
        # rule-based summary in answer_query().
        return f"[LLM unavailable] {prompt}"
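# A minimal rule-based fallback in the spirit of the docstring above. This
# helper is an illustrative sketch (hypothetical; not wired into the app by
# default): it skips generation entirely and stitches the top context bullets.
def rephrase_without_llm(context_bullets):
    body = " ".join(b for b in context_bullets[:2] if b and b.strip())
    return body or "Maaf, datanya belum tersedia di dasar informasi kami."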
# ========== Orchestration ==========
STYLE_GUIDE = {
    "Formal": "Nada formal, jelas, dan bernuansa kebijakan publik.",
    "Santai": "Nada bersahabat dan ringan, hindari jargon teknis.",
    "Ringkas": "Jawaban sangat singkat (1-2 kalimat) namun informatif.",
    "Naratif": "Gaya bercerita singkat agar mudah dibayangkan."
}

def craft_prompt(context_bullets, question, style):
    style_rule = STYLE_GUIDE.get(style, STYLE_GUIDE["Formal"])
    ctx = "\n".join([f"- {c}" for c in context_bullets if c.strip()])
    return f"""{SYSTEM_PROMPT}
GAYA JAWABAN: {style_rule}
KONTEKS:
{ctx}
PERTANYAAN PENGGUNA:
{question}
TULIS JAWABAN SEKARANG:
"""

def merge_context(hits):
    # Use the top 3-5 retrieved answers as context bullets.
    bullets = []
    for h in hits[:5]:
        bullets.append(h["answer"])
    return bullets
def safe_cut(text, marker="TULIS JAWABAN SEKARANG:"):
    # If the pipeline echoes prompt+answer, keep only the part after the marker.
    if marker in text:
        return text.split(marker, 1)[-1].strip()
    return text.strip()
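# Example of what safe_cut does (strings are hypothetical):
#   safe_cut("...KONTEKS...\nTULIS JAWABAN SEKARANG:\nIPLM mengukur ...")
#   -> "IPLM mengukur ..."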
def render_followups(hits, max_items=4):
    # Collect deduplicated follow-up suggestions from the best hits.
    seen, out = set(), []
    for h in hits:
        for f in h.get("followups") or []:
            f = norm(f)
            if f and f not in seen:
                out.append(f)
                seen.add(f)
            if len(out) >= max_items:
                break
        if len(out) >= max_items:
            break
    return out
# ========== Build index ==========
faq = FAQIndex(EMB_MODEL)
faq.build(load_jsonl_with_variants(DATA_PATH))
# ========== Gradio callback ==========
def answer_query(msg, chat_history, style, show_sources):
    msg = norm(msg)
    if not msg:
        return "Silakan tulis pertanyaan tentang IPLM."
    hits = faq.retrieve(msg, TOP_K)
    if not hits:
        return "Maaf, datanya belum tersedia di dasar informasi kami."
    # If one hit is very strong, use its answer directly, with a light polish.
    top = hits[0]
    if top["score"] >= THRESHOLD:
        base = top["answer"]
        # Light polish without the LLM.
        if style == "Ringkas":
            final = base
        elif style == "Santai":
            final = f"Singkatnya, {base[0].lower()}{base[1:]}"
        elif style == "Naratif":
            final = f"Bayangkan kita menilai literasi di daerah. {base}"
        else:
            final = base
        if show_sources:
            meta = f"\n\nCocok dengan: “{top['question']}” • keyakinan ~{top['score']:.2f}"
            if top.get("source"):
                meta += f" • sumber: {top['source']}"
            final += meta
        # Append follow-up suggestions.
        fups = render_followups(hits)
        if fups:
            final += "\n\nCoba juga:\n" + "\n".join([f"- {x}" for x in fups])
        return final
    # Otherwise merge the retrieved context and ask the LLM to phrase a
    # natural answer.
    ctx = merge_context(hits)
    prompt = craft_prompt(ctx, msg, style)
    raw = call_local_llm(prompt)
    ans = safe_cut(raw)
    # Guard: if the LLM hallucinates or drifts off-script, fall back to a
    # rule-based summary of the context (parentheses make the intended
    # precedence explicit).
    if not ans or ("Maaf" in ans and "tidak" in ans and "tersedia" in ans):
        ans = ctx[0] if ctx else "Maaf, datanya belum tersedia di dasar informasi kami."
    if show_sources:
        src_lines = []
        for h in hits[:3]:
            s = f'• “{h["question"]}” (keyakinan ~{h["score"]:.2f})'
            if h.get("source"):
                s += f' – sumber: {h["source"]}'
            src_lines.append(s)
        if src_lines:
            ans += "\n\nRujukan terdekat:\n" + "\n".join(src_lines)
    # Append follow-up suggestions.
    fups = render_followups(hits)
    if fups:
        ans += "\n\nCoba juga:\n" + "\n".join([f"- {x}" for x in fups])
    return ans
# ========== UI ==========
with gr.Blocks(title="IPLM Chatbot (luwes)") as demo:
    gr.Markdown("## IPLM Chatbot\nTanya apa saja tentang IPLM. Jawaban berbasis data JSONL, disajikan dengan bahasa yang lebih luwes.")
    with gr.Row():
        style = gr.Radio(choices=list(STYLE_GUIDE.keys()), value="Formal", label="Gaya jawaban")
        show_sources = gr.Checkbox(value=True, label="Tampilkan rujukan terdekat")
    # Pass the controls via additional_inputs so their *current* values reach
    # the callback; a lambda reading style.value would freeze the initial value.
    chat = gr.ChatInterface(
        fn=answer_query,
        additional_inputs=[style, show_sources],
        title="IPLM Chatbot",
        description="Jawaban hanya berdasarkan data JSONL, namun ditulis dengan gaya bahasa yang lebih natural.",
        examples=[
            "Sederhananya, apa itu IPLM?",
            "Gimana cara hitung nilai IPLM biar jadi angka 0-100?",
            "Bedanya dimensi kepatuhan sama kinerja apa ya?",
            "Kalau anggaran BOS, yang dihitung bagian mana?",
            "Siapa yang ngumpulin data di daerah dan gimana verifikasinya?"
        ],
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch()