# app.py - flexible RAG for IPLM
import os, re, json
from pathlib import Path
import gradio as gr
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer

# ========== Configuration ==========
DATA_PATH   = Path(os.getenv("DATA_PATH", "IPLM_QnA_Chatbot.jsonl"))
EMB_MODEL   = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/gemma-2b-it")  # local & lightweight
TOP_K       = int(os.getenv("TOP_K", "5"))
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.4"))
MAX_TOKENS  = int(os.getenv("MAX_TOKENS", "320"))
THRESHOLD   = float(os.getenv("THRESHOLD", "0.62"))  # raised slightly so direct answers are more trustworthy

# ========== Prompt (more natural tone) ==========
SYSTEM_PROMPT = """
Kamu adalah asisten pustakawan Perpustakaan Nasional RI untuk topik IPLM (Indeks Pembangunan Literasi Masyarakat).
Tugasmu:
- Jawab hanya berdasarkan KONTEKS yang diberikan (jangan menambah fakta baru).
- Tulis dengan bahasa Indonesia yang alami, ramah, dan mudah dipahami publik.
- Jelaskan dengan contoh singkat bila membantu.
- Jika konteks tidak cukup, katakan dengan jelas apa yang belum tersedia dan berikan langkah/arah yang bisa dilakukan.

Format jawaban:
1) Paragraf inti (1–3 kalimat) sesuai gaya diminta pengguna.
2) Jika perlu, tambahkan poin-poin ringkas (maks 4 bullet) untuk memudahkan.
3) Jika benar-benar tidak ada datanya di konteks, tulis: "Maaf, datanya belum tersedia di dasar informasi kami."
"""

# ========== Utilities ==========
def norm(s): return re.sub(r"\s+"," ",str(s or "").strip())

def load_jsonl_with_variants(path: Path):
    """
    Mendukung skema:
    - {"question": "...", "answer": "...", "q_variants": [...], "followups": [...], "source": "..."}
    Kolom opsional: q_variants, followups, source
    Jika q_variants tidak ada, pakai question saja.
    """
    items = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip(): 
                continue
            obj = json.loads(line)
            q = obj.get("question") or obj.get("q")
            a = obj.get("answer")   or obj.get("a")
            if not (q and a):
                continue
            qv = obj.get("q_variants") or []
            if not isinstance(qv, list):
                qv = [qv]
            variants = [norm(q)] + [norm(x) for x in qv if x]
            followups = obj.get("followups") or []
            if not isinstance(followups, list):
                followups = []
            items.append({
                "question": norm(q),
                "answer": norm(a),
                "q_variants": variants,
                "followups": followups,
                "source": norm(obj.get("source") or "")
            })
    return items
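
# An illustrative record for IPLM_QnA_Chatbot.jsonl (made-up content, schema as above):
# {"question": "Apa itu IPLM?",
#  "answer": "IPLM adalah Indeks Pembangunan Literasi Masyarakat ...",
#  "q_variants": ["IPLM itu apa?", "Apa kepanjangan IPLM?"],
#  "followups": ["Bagaimana cara menghitung IPLM?"],
#  "source": "Perpusnas"}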

# ========== Indexer/Retriever ==========
class FAQIndex:
    def __init__(self, emb_model: str):
        self.model_name = emb_model
        self.model = None
        self.rows = []            # each row = one QA pair
        self.flat_q = []          # flat list of every query variant
        self.parent = []          # maps each flat_q entry -> index of its parent row
        self.nn = None
        self.emb = None

    def build(self, rows):
        self.rows = rows
        self.model = SentenceTransformer(self.model_name)
        self.flat_q, self.parent = [], []
        for i, r in enumerate(rows):
            for qv in r["q_variants"]:
                self.flat_q.append(qv)
                self.parent.append(i)
        self.emb = self.model.encode(
            self.flat_q, 
            normalize_embeddings=True, 
            convert_to_numpy=True, 
            show_progress_bar=False
        )
        self.nn = NearestNeighbors(
            n_neighbors=min(15, len(self.flat_q)), metric="cosine"
        ).fit(self.emb)

    def retrieve(self, query: str, top_k=TOP_K):
        if not self.flat_q:
            return []
        qv = self.model.encode(
            [query], normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False
        )
        d, idx = self.nn.kneighbors(qv, n_neighbors=min(top_k, len(self.flat_q)))
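        # cosine distance from kneighbors -> similarity; valid because the embeddings are L2-normalized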
        sims = 1.0 - d[0]
        hits = []
        for ix, s in zip(idx[0], sims):
            parent_i = self.parent[int(ix)]
            base = self.rows[parent_i]
            hits.append({
                "match_q": self.flat_q[int(ix)],
                "score": float(s),
                "question": base["question"],
                "answer": base["answer"],
                "followups": base.get("followups") or [],
                "source": base.get("source") or ""
            })
        # deduplicate by canonical question, keep best score
        best = {}
        for h in hits:
            key = h["question"]
            if key not in best or h["score"] > best[key]["score"]:
                best[key] = h
        hits_dedup = sorted(best.values(), key=lambda x: -x["score"])[:top_k]
        return hits_dedup
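
# Illustrative usage of FAQIndex outside Gradio (scores below are made-up examples):
#   idx = FAQIndex(EMB_MODEL)
#   idx.build(load_jsonl_with_variants(DATA_PATH))
#   idx.retrieve("Apa itu IPLM?", top_k=3)
#   -> [{"match_q": "...", "score": 0.83, "question": "...", "answer": "...",
#        "followups": [...], "source": "..."}, ...]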

# ========== Local LLM (optional rephrasing/merging) ==========
_local_pipe = None
def call_local_llm(prompt: str):
    """
    Jika lingkungan tidak punya model lokal, Anda bisa mematikan fungsi ini
    dan langsung pakai template jawaban tanpa LLM (rule-based rephrase).
    """
    global _local_pipe
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
        import torch
        if _local_pipe is None:
            tok = AutoTokenizer.from_pretrained(LOCAL_MODEL)
            mdl = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL, torch_dtype=torch.float32)
            _local_pipe = pipeline("text-generation", model=mdl, tokenizer=tok, device=-1)
        out = _local_pipe(
            prompt, 
            max_new_tokens=MAX_TOKENS, 
            do_sample=True, 
            temperature=TEMPERATURE,
            pad_token_id=_local_pipe.tokenizer.eos_token_id
        )
        return out[0]["generated_text"]
    except Exception:
        # fallback: if the LLM fails, return the prompt itself (the caller trims it via safe_cut)
        return f"[LLM unavailable] {prompt}"

# ========== Orchestration ==========
STYLE_GUIDE = {
    "Formal": "Nada formal, jelas, dan bernuansa kebijakan publik.",
    "Santai": "Nada bersahabat dan ringan, hindari jargon teknis.",
    "Ringkas": "Jawaban sangat singkat (1–2 kalimat) namun informatif.",
    "Naratif": "Gaya bercerita singkat agar mudah dibayangkan."
}

def craft_prompt(context_bullets, question, style):
    style_rule = STYLE_GUIDE.get(style, STYLE_GUIDE["Formal"])
    ctx = "\n".join([f"- {c}" for c in context_bullets if c.strip()])
    return f"""{SYSTEM_PROMPT}

GAYA JAWABAN: {style_rule}

KONTEKS:
{ctx}

PERTANYAAN PENGGUNA:
{question}

TULIS JAWABAN SEKARANG:
"""

def merge_context(hits):
    # Take up to the top 5 answers as context bullets
    bullets = []
    for h in hits[:5]:
        bullets.append(h["answer"])
    return bullets

def safe_cut(text, marker="TULIS JAWABAN SEKARANG:"):
    # If the pipeline returns prompt+answer, keep only the part after the marker
    if marker in text:
        return text.split(marker, 1)[-1].strip()
    return text.strip()
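
# Illustrative: safe_cut("...KONTEKS...\nTULIS JAWABAN SEKARANG:\nIPLM mengukur ...") -> "IPLM mengukur ..."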

def render_followups(hits, max_items=4):
    # Collect follow-up suggestions from the best hits
    seen, out = set(), []
    for h in hits:
        for f in h.get("followups") or []:
            f = norm(f)
            if f and f not in seen:
                out.append(f)
                seen.add(f)
            if len(out) >= max_items:
                break
        if len(out) >= max_items:
            break
    return out

# ========== Build index ==========
faq = FAQIndex(EMB_MODEL)
faq.build(load_jsonl_with_variants(DATA_PATH))
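
# Optional startup sanity check: set the ad-hoc DEBUG_RETRIEVAL env var to print the
# top matches for a sample query and eyeball the similarity scores.
if os.getenv("DEBUG_RETRIEVAL"):
    for _h in faq.retrieve("Apa itu IPLM?", 3):
        print(f"{_h['score']:.2f}  {_h['question']}")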

# ========== Gradio Callback ==========
def answer_query(msg, chat_history, style, show_sources):
    msg = norm(msg)
    if not msg:
        return "Silakan tulis pertanyaan tentang IPLM."

    hits = faq.retrieve(msg, TOP_K)
    if not hits:
        return "Maaf, datanya belum tersedia di dasar informasi kami."

    # If there is a very strong hit, use its answer directly with light polishing
    top = hits[0]
    if top["score"] >= THRESHOLD:
        base = top["answer"]
        # Light polishing without the LLM
        if style == "Ringkas":
            final = base
        elif style == "Santai":
            final = f"Singkatnya, {base[0].lower()}{base[1:]}"
        elif style == "Naratif":
            final = f"Bayangkan kita menilai literasi di daerah. {base}"
        else:
            final = base

        if show_sources:
            meta = f"\n\n- Cocokkan dengan: “{top['question']}” • keyakinan ~{top['score']:.2f}"
            if top.get("source"):
                meta += f" • sumber: {top['source']}"
            final += meta
        # Add follow-up suggestions
        fups = render_followups(hits)
        if fups:
            final += "\n\nCoba juga:\n" + "\n".join([f"- {x}" for x in fups])
        return final

    # If the score is not strong enough, merge the context and ask the LLM to phrase a natural answer
    ctx = merge_context(hits)
    prompt = craft_prompt(ctx, msg, style)
    raw = call_local_llm(prompt)
    ans = safe_cut(raw)

    # Guard: if the LLM hallucinates or drifts off track, fall back to a rule-based summary
    if not ans or ("Maaf" in ans and "tidak" in ans and "tersedia" in ans):
        # simple summary taken straight from the context
        ans = ctx[0] if ctx else "Maaf, datanya belum tersedia di dasar informasi kami."

    if show_sources:
        src_lines = []
        for h in hits[:3]:
            s = f'• “{h["question"]}” (keyakinan ~{h["score"]:.2f})'
            if h.get("source"):
                s += f' - sumber: {h["source"]}'
            src_lines.append(s)
        if src_lines:
            ans += "\n\nRujukan terdekat:\n" + "\n".join(src_lines)

    # Add follow-up suggestions
    fups = render_followups(hits)
    if fups:
        ans += "\n\nCoba juga:\n" + "\n".join([f"- {x}" for x in fups])

    return ans

# ========== UI ==========
with gr.Blocks(title="📚 IPLM Chatbot (luwes)") as demo:
    gr.Markdown("## 📚 IPLM Chatbot\nTanya apa saja tentang IPLM. Jawaban berbasis data JSONL, disajikan dengan bahasa yang lebih luwes.")
    with gr.Row():
        style = gr.Radio(choices=list(STYLE_GUIDE.keys()), value="Formal", label="Gaya jawaban")
        show_sources = gr.Checkbox(value=True, label="Tampilkan rujukan terdekat")
    chat = gr.ChatInterface(
        fn=answer_query,
        additional_inputs=[style, show_sources],
        title="IPLM Chatbot",
        description="Jawaban hanya berdasarkan data JSONL, namun ditulis dengan gaya bahasa yang lebih natural.",
        examples=[
            ["Sederhananya, apa itu IPLM?", "Formal", True],
            ["Gimana cara hitung nilai IPLM biar jadi angka 0–100?", "Formal", True],
            ["Bedanya dimensi kepatuhan sama kinerja apa ya?", "Formal", True],
            ["Kalau anggaran BOS, yang dihitung bagian mana?", "Formal", True],
            ["Siapa yang ngumpulin data di daerah dan gimana verifikasinya?", "Formal", True]
        ],
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch()