irhamni committed on
Commit
4edbd5c
·
verified ·
1 Parent(s): 5809478

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -81
app.py CHANGED
@@ -1,5 +1,5 @@
1
- # app.py — versi super simpel ala ChatGPT
2
- import os, re, json, pickle, hashlib
3
  from pathlib import Path
4
  import gradio as gr
5
  import numpy as np
@@ -7,87 +7,288 @@ from sklearn.neighbors import NearestNeighbors
7
  from sentence_transformers import SentenceTransformer
8
 
9
  # ========== Konfigurasi ==========
10
- DATA_PATH = Path(os.getenv("DATA_PATH", "IPLM_QnA_Chatbot.jsonl"))
11
- EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
12
- LOCAL_MODEL= os.getenv("LOCAL_MODEL", "google/gemma-2b-it") # model lokal gratis & ringan
13
- TOP_K = int(os.getenv("TOP_K", "4"))
14
- TEMPERATURE= float(os.getenv("TEMPERATURE", "0.2"))
15
- MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
16
- THRESHOLD = float(os.getenv("THRESHOLD", "0.6"))
17
-
18
- SYSTEM_PROMPT = (
19
- "You are an Indonesian librarian assistant. "
20
- "Jawab singkat, akurat, dan sopan. "
21
- "Jawab HANYA berdasarkan konteks yang diberikan. "
22
- "Jika tidak ada jawabannya, balas persis: Data tidak tersedia."
23
- )
24
-
25
- # ========== Utils ==========
 
 
 
 
 
 
 
 
26
  def norm(s): return re.sub(r"\s+"," ",str(s or "").strip())
27
- def dataset_hash(rows):
28
- m=hashlib.md5()
29
- for r in rows: m.update((r["question"]+"|"+r["answer"]).encode())
30
- return m.hexdigest()
31
-
32
- def load_jsonl(path:Path):
33
- rows=[]
34
- with path.open("r",encoding="utf-8") as f:
 
 
35
  for line in f:
36
- if not line.strip(): continue
37
- obj=json.loads(line)
38
- q=obj.get("question") or obj.get("q")
39
- a=obj.get("answer") or obj.get("a")
40
- if q and a: rows.append({"question":norm(q),"answer":norm(a)})
41
- return rows
42
-
43
- # ========== Retriever ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  class FAQIndex:
45
- def __init__(self): self.rows=None; self.model=None; self.nn=None; self.emb=None
46
- def build(self,rows):
47
- self.rows=rows
48
- self.model=SentenceTransformer(EMB_MODEL)
49
- qs=[r["question"] for r in rows]
50
- self.emb=self.model.encode(qs,normalize_embeddings=True,convert_to_numpy=True,show_progress_bar=False)
51
- self.nn=NearestNeighbors(n_neighbors=min(10,len(qs)),metric="cosine").fit(self.emb)
52
- def retrieve(self,query,top_k=TOP_K):
53
- qv=self.model.encode([query],normalize_embeddings=True,convert_to_numpy=True,show_progress_bar=False)
54
- d,i=self.nn.kneighbors(qv,n_neighbors=min(top_k,len(self.rows)))
55
- sims=1.0-d[0]
56
- return [{"question":self.rows[int(ix)]["question"],"answer":self.rows[int(ix)]["answer"],"score":float(s)} for ix,s in zip(i[0],sims)]
57
-
58
- faq=FAQIndex()
59
- faq.build(load_jsonl(DATA_PATH))
60
-
61
- # ========== Local LLM ==========
62
- _local_pipe=None
63
- def call_local(prompt):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  global _local_pipe
65
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
66
- import torch
67
- if _local_pipe is None:
68
- tok=AutoTokenizer.from_pretrained(LOCAL_MODEL)
69
- mdl=AutoModelForCausalLM.from_pretrained(LOCAL_MODEL,torch_dtype=torch.float32)
70
- _local_pipe=pipeline("text-generation",model=mdl,tokenizer=tok,device=-1)
71
- out=_local_pipe(prompt,max_new_tokens=MAX_TOKENS,do_sample=True,temperature=TEMPERATURE)
72
- return out[0]["generated_text"]
73
-
74
- # ========== Orchestrator ==========
75
- def answer_query(msg,history):
76
- hits=faq.retrieve(msg,TOP_K)
77
- if not hits: return "Data tidak tersedia."
78
- if hits[0]["score"]>=THRESHOLD:
79
- return hits[0]["answer"]
80
- ctx="\n".join([f"- {h['answer']}" for h in hits])
81
- prompt=f"{SYSTEM_PROMPT}\n\nKONTEKS:\n{ctx}\n\nPERTANYAAN: {msg}\n\nJAWAB:"
82
- return call_local(prompt)
83
-
84
- # ========== UI Chat Only ==========
85
- demo=gr.ChatInterface(
86
- fn=answer_query,
87
- title="πŸ“š IPLM Chatbot",
88
- description="Tanya apa saja tentang IPLM. Jawaban hanya berdasarkan data JSONL.",
89
- examples=["Apa itu IPLM?","Bagaimana menghitung IPLM?","Apa saja dimensi IPLM?"]
90
- )
91
-
92
- if __name__=="__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  demo.launch()
 
1
+ # app.py — RAG luwes untuk IPLM
2
+ import os, re, json, hashlib
3
  from pathlib import Path
4
  import gradio as gr
5
  import numpy as np
 
7
  from sentence_transformers import SentenceTransformer
8
 
9
  # ========== Konfigurasi ==========
10
# Runtime configuration. Every value can be overridden through an environment
# variable of the same name; the second argument is the built-in default.
DATA_PATH = Path(os.environ.get("DATA_PATH", "IPLM_QnA_Chatbot.jsonl"))
EMB_MODEL = os.environ.get("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
# Local, lightweight generation model.
LOCAL_MODEL = os.environ.get("LOCAL_MODEL", "google/gemma-2b-it")
TOP_K = int(os.environ.get("TOP_K", "5"))
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.4"))
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "320"))
# Raised slightly so that directly returned answers are more trustworthy.
THRESHOLD = float(os.environ.get("THRESHOLD", "0.62"))
17
+
18
+ # ========== Prompt (lebih natural) ==========
19
# System prompt placed at the top of every LLM request. It restricts the model
# to the supplied context and fixes the answer format. The text is Indonesian
# on purpose: it is sent to the model verbatim.
SYSTEM_PROMPT = """
Kamu adalah asisten pustakawan Perpustakaan Nasional RI untuk topik IPLM (Indeks Pembangunan Literasi Masyarakat).
Tugasmu:
- Jawab hanya berdasarkan KONTEKS yang diberikan (jangan menambah fakta baru).
- Tulis dengan bahasa Indonesia yang alami, ramah, dan mudah dipahami publik.
- Jelaskan dengan contoh singkat bila membantu.
- Jika konteks tidak cukup, katakan dengan jelas apa yang belum tersedia dan berikan langkah/arah yang bisa dilakukan.

Format jawaban:
1) Paragraf inti (1–3 kalimat) sesuai gaya diminta pengguna.
2) Jika perlu, tambahkan poin-poin ringkas (maks 4 bullet) untuk memudahkan.
3) Jika benar-benar tidak ada datanya di konteks, tulis: "Maaf, datanya belum tersedia di dasar informasi kami."
"""
32
+
33
+ # ========== Utilitas ==========
34
def norm(s):
    """Return ``s`` as a string with every whitespace run collapsed to one space.

    Falsy input (``None``, ``""``, ``0``) yields an empty string; leading and
    trailing whitespace is removed.
    """
    text = str(s) if s else ""
    return re.sub(r"\s+", " ", text.strip())
35
+
36
def load_jsonl_with_variants(path: Path):
    """Load the Q&A knowledge base from a JSON Lines file.

    Each non-blank line is expected to be a JSON object such as::

        {"question": "...", "answer": "...", "q_variants": [...],
         "followups": [...], "source": "..."}

    ``q`` / ``a`` are accepted as aliases for ``question`` / ``answer``.
    ``q_variants``, ``followups`` and ``source`` are optional; a scalar given
    for ``q_variants`` or ``followups`` is treated as a one-element list.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        A list of dicts with the keys ``question``, ``answer``,
        ``q_variants`` (canonical question first, then the distinct
        non-empty variants), ``followups`` and ``source``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    def as_list(value):
        # Accept a missing value, a single scalar, or a proper list.
        if not value:
            return []
        return value if isinstance(value, list) else [value]

    items = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)
            if not isinstance(obj, dict):
                # Valid JSON but not an object (e.g. a bare list): nothing to index.
                continue
            # Normalise before the emptiness check so whitespace-only fields
            # are rejected instead of being stored as "".
            question = norm(obj.get("question") or obj.get("q"))
            answer = norm(obj.get("answer") or obj.get("a"))
            if not (question and answer):
                continue
            variants = [question]
            for raw in as_list(obj.get("q_variants")):
                variant = norm(raw)
                # Skip blanks and repeats so they are not embedded twice.
                if variant and variant not in variants:
                    variants.append(variant)
            followups = [x for x in as_list(obj.get("followups")) if x]
            items.append({
                "question": question,
                "answer": answer,
                "q_variants": variants,
                "followups": followups,
                "source": norm(obj.get("source") or ""),
            })
    return items
68
+
69
+ # ========== Indexer/Retriever ==========
70
class FAQIndex:
    """Nearest-neighbour index over every question variant of the FAQ rows.

    All variants of all rows are embedded into one flat list; ``parent`` maps
    each flat entry back to the row it came from, so a match on any phrasing
    resolves to the same question/answer pair.
    """

    # Neighbours fetched per requested result, so that several variants of
    # the same row cannot crowd distinct rows out of the final list.
    OVERFETCH_FACTOR = 3
    # Lower bound on fetched neighbours (same value ``build`` uses as the
    # index default).
    MIN_CANDIDATES = 15

    def __init__(self, emb_model: str):
        """Store the embedding model name; nothing is loaded until ``build``."""
        self.model_name = emb_model
        self.model = None
        self.rows = []      # one entry per Q&A row
        self.flat_q = []    # every question variant of every row
        self.parent = []    # flat_q position -> index of the owning row
        self.nn = None
        self.emb = None

    def build(self, rows):
        """Embed all question variants of ``rows`` and fit the neighbour index.

        Args:
            rows: Sequence of dicts that each provide ``q_variants`` (list of
                strings) plus the fields later read by ``retrieve``.
        """
        self.rows = rows
        self.model = SentenceTransformer(self.model_name)
        self.flat_q, self.parent = [], []
        for i, r in enumerate(rows):
            for qv in r["q_variants"]:
                self.flat_q.append(qv)
                self.parent.append(i)
        if not self.flat_q:
            # Nothing to index; ``retrieve`` returns [] for an empty index.
            self.emb = None
            self.nn = None
            return
        self.emb = self.model.encode(
            self.flat_q,
            normalize_embeddings=True,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        self.nn = NearestNeighbors(
            n_neighbors=min(self.MIN_CANDIDATES, len(self.flat_q)),
            metric="cosine",
        ).fit(self.emb)

    def retrieve(self, query: str, top_k=TOP_K):
        """Return up to ``top_k`` distinct rows most similar to ``query``.

        Args:
            query: User question.
            top_k: Maximum number of distinct rows to return.

        Returns:
            A list of dicts sorted by descending ``score`` (1 - cosine
            distance). Each dict holds ``match_q`` (the variant that
            matched), ``score``, ``question``, ``answer``, ``followups`` and
            ``source``. Empty when the index is empty or ``top_k`` < 1.
        """
        if not self.flat_q or self.nn is None or top_k < 1:
            return []
        qv = self.model.encode(
            [query], normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False
        )
        # Fetch more variant-level neighbours than requested: de-duplication
        # below may merge several of them into a single row.
        n_candidates = min(
            len(self.flat_q),
            max(top_k * self.OVERFETCH_FACTOR, self.MIN_CANDIDATES),
        )
        d, idx = self.nn.kneighbors(qv, n_neighbors=n_candidates)
        sims = 1.0 - d[0]

        # Keep only the best-scoring variant per canonical question.
        best = {}
        for ix, s in zip(idx[0], sims):
            flat_i = int(ix)
            base = self.rows[self.parent[flat_i]]
            score = float(s)
            key = base["question"]
            if key in best and best[key]["score"] >= score:
                continue
            best[key] = {
                "match_q": self.flat_q[flat_i],
                "score": score,
                "question": base["question"],
                "answer": base["answer"],
                "followups": base.get("followups") or [],
                "source": base.get("source") or "",
            }
        return sorted(best.values(), key=lambda h: -h["score"])[:top_k]
126
+
127
+ # ========== Local LLM (opsional rephrasing/merging) ==========
128
# Lazily created text-generation pipeline, shared by all calls.
_local_pipe = None


def call_local_llm(prompt: str):
    """Generate a completion for ``prompt`` with the local model.

    The pipeline for ``LOCAL_MODEL`` is created on first use and cached in
    the module-level ``_local_pipe``.

    Args:
        prompt: Full prompt text.

    Returns:
        The pipeline's ``generated_text`` for the first result. If anything
        fails (missing libraries, model load, generation), the string
        ``"[LLM unavailable] "`` followed by ``prompt`` is returned instead;
        the failure is logged rather than raised, and the caller is expected
        to cut the echoed prompt away.
    """
    global _local_pipe
    try:
        if _local_pipe is None:
            # Heavy imports are only needed to construct the pipeline.
            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
            import torch

            tok = AutoTokenizer.from_pretrained(LOCAL_MODEL)
            mdl = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL, torch_dtype=torch.float32)
            _local_pipe = pipeline("text-generation", model=mdl, tokenizer=tok, device=-1)
        out = _local_pipe(
            prompt,
            max_new_tokens=MAX_TOKENS,
            do_sample=True,
            temperature=TEMPERATURE,
            pad_token_id=_local_pipe.tokenizer.eos_token_id,
        )
        return out[0]["generated_text"]
    except Exception:
        # Best-effort boundary: keep the chat alive, but leave a trace of
        # why the model was unavailable.
        import logging

        logging.getLogger(__name__).exception("Local LLM call failed")
        return f"[LLM unavailable] {prompt}"
153
+
154
+ # ========== Orchestration ==========
155
# Answer styles selectable in the UI, mapped to the instruction inserted into
# the LLM prompt. Keys double as the radio-button labels; the values are sent
# to the model verbatim, hence Indonesian.
STYLE_GUIDE = {
    "Formal": "Nada formal, jelas, dan bernuansa kebijakan publik.",
    "Santai": "Nada bersahabat dan ringan, hindari jargon teknis.",
    "Ringkas": "Jawaban sangat singkat (1–2 kalimat) namun informatif.",
    "Naratif": "Gaya bercerita singkat agar mudah dibayangkan.",
}
161
+
162
def craft_prompt(context_bullets, question, style):
    """Assemble the full LLM prompt.

    The prompt consists of the system prompt, the style instruction for
    ``style`` (the "Formal" rule when the style is unknown), the non-blank
    context strings as ``- `` bullets, the user question and the closing
    answer marker.
    """
    fallback_rule = STYLE_GUIDE["Formal"]
    style_rule = STYLE_GUIDE.get(style, fallback_rule)
    ctx = "\n".join(f"- {bullet}" for bullet in context_bullets if bullet.strip())
    sections = [
        f"{SYSTEM_PROMPT}",
        "",
        f"GAYA JAWABAN: {style_rule}",
        "",
        "KONTEKS:",
        ctx,
        "",
        "PERTANYAAN PENGGUNA:",
        f"{question}",
        "",
        "TULIS JAWABAN SEKARANG:",
        "",  # the prompt ends with a newline after the marker
    ]
    return "\n".join(sections)
177
+
178
def merge_context(hits, max_items=5):
    """Return the answers of the leading hits for use as prompt context.

    Args:
        hits: Retrieval results, best first; each must provide ``"answer"``.
        max_items: Maximum number of answers to take (default 5, the value
            that was previously hard-coded).

    Returns:
        A list with the ``"answer"`` of each of the first ``max_items`` hits.
    """
    return [hit["answer"] for hit in hits[:max_items]]
184
+
185
def safe_cut(text, marker="TULIS JAWABAN SEKARANG:"):
    """Strip an echoed prompt from generated text.

    When ``marker`` occurs in ``text``, everything up to and including its
    first occurrence is dropped. The result is always whitespace-stripped.
    """
    _, found, tail = text.partition(marker)
    remainder = tail if found else text
    return remainder.strip()
190
+
191
def render_followups(hits, max_items=4):
    """Collect distinct follow-up suggestions from ``hits`` in hit order.

    Each follow-up is normalised with ``norm``; blanks and repeats are
    skipped. Collection stops as soon as ``max_items`` entries are gathered.
    """
    collected = []
    known = set()
    for hit in hits:
        for raw in hit.get("followups") or []:
            suggestion = norm(raw)
            if suggestion and suggestion not in known:
                known.add(suggestion)
                collected.append(suggestion)
            if len(collected) >= max_items:
                return collected
        if len(collected) >= max_items:
            return collected
    return collected
205
+
206
+ # ========== Build index ==========
207
# Build the retrieval index once, at import time: read the JSONL rows from
# DATA_PATH, then embed them with the configured embedding model.
_faq_rows = load_jsonl_with_variants(DATA_PATH)
faq = FAQIndex(emb_model=EMB_MODEL)
faq.build(_faq_rows)
209
+
210
+ # ========== Gradio Callback ==========
211
# Reply used when nothing relevant can be offered.
_NO_DATA_MESSAGE = "Maaf, datanya belum tersedia di dasar informasi kami."


def _lower_first(text):
    """Lower-case the first letter of ``text`` unless it opens with an acronym."""
    words = text.split(None, 1)
    if not words:
        return text
    first_word = words[0]
    if len(first_word) > 1 and first_word.isupper():
        # Keep acronyms such as "IPLM" intact instead of producing "iPLM".
        return text
    return text[0].lower() + text[1:]


def _style_direct_answer(base, style):
    """Apply the light, rule-based polish for ``style`` to a stored answer."""
    if style == "Santai":
        return f"Singkatnya, {_lower_first(base)}"
    if style == "Naratif":
        return f"Bayangkan kita menilai literasi di daerah. {base}"
    # "Ringkas", "Formal" and unknown styles return the answer unchanged.
    return base


def _with_followups(text, hits):
    """Append the "Coba juga" follow-up list to ``text`` when there is one."""
    fups = render_followups(hits)
    if fups:
        text += "\n\nCoba juga:\n" + "\n".join(f"- {x}" for x in fups)
    return text


def answer_query(msg, chat_history, style, show_sources):
    """Produce the chatbot reply for one user message.

    Args:
        msg: User message.
        chat_history: Conversation so far (supplied by the UI, not used).
        style: One of the ``STYLE_GUIDE`` keys.
        show_sources: Whether to append the matched question(s), the
            similarity score and the source to the reply.

    Returns:
        The reply text. When the best hit scores at least ``THRESHOLD`` its
        stored answer is returned with rule-based polish; otherwise the top
        answers are passed to the local LLM as context. Follow-up
        suggestions are appended in both cases when available.
    """
    msg = norm(msg)
    if not msg:
        return "Silakan tulis pertanyaan tentang IPLM."

    hits = faq.retrieve(msg, TOP_K)
    if not hits:
        return _NO_DATA_MESSAGE

    # Strong match: use the stored answer directly, without the LLM.
    top = hits[0]
    if top["score"] >= THRESHOLD:
        final = _style_direct_answer(top["answer"], style)
        if show_sources:
            meta = f"\n\n— Cocokkan dengan: “{top['question']}” • keyakinan ~{top['score']:.2f}"
            if top.get("source"):
                meta += f" • sumber: {top['source']}"
            final += meta
        return _with_followups(final, hits)

    # Weak match: let the LLM phrase an answer from the merged context.
    ctx = merge_context(hits)
    raw = call_local_llm(craft_prompt(ctx, msg, style))
    ans = safe_cut(raw)

    # Guard: an empty result (including the LLM-unavailable fallback, which
    # is cut down to "") or an apology-style refusal falls back to the best
    # stored answer.
    if not ans or ("Maaf" in ans and "tidak" in ans and "tersedia" in ans):
        ans = ctx[0] if ctx else _NO_DATA_MESSAGE

    if show_sources:
        src_lines = []
        for h in hits[:3]:
            s = f'• “{h["question"]}” (keyakinan ~{h["score"]:.2f})'
            if h.get("source"):
                s += f' — sumber: {h["source"]}'
            src_lines.append(s)
        if src_lines:
            ans += "\n\nRujukan terdekat:\n" + "\n".join(src_lines)

    return _with_followups(ans, hits)
272
+
273
+ # ========== UI ==========
274
# Gradio UI: a style selector and a "show sources" toggle above the chat.
with gr.Blocks(title="📚 IPLM Chatbot (luwes)") as demo:
    gr.Markdown("## 📚 IPLM Chatbot\nTanya apa saja tentang IPLM. Jawaban berbasis data JSONL, disajikan dengan bahasa yang lebih luwes.")
    with gr.Row():
        style = gr.Radio(choices=list(STYLE_GUIDE.keys()), value="Formal", label="Gaya jawaban")
        show_sources = gr.Checkbox(value=True, label="Tampilkan rujukan terdekat")
    chat = gr.ChatInterface(
        # The controls are passed as additional inputs so that each request
        # receives their CURRENT values. Reading `style.value` inside a
        # lambda only ever yields the construction-time default.
        fn=answer_query,
        additional_inputs=[style, show_sources],
        title="IPLM Chatbot",
        description="Jawaban hanya berdasarkan data JSONL, namun ditulis dengan gaya bahasa yang lebih natural.",
        # With additional inputs, each example row is
        # [message, style, show_sources].
        examples=[
            ["Sederhananya, apa itu IPLM?", "Formal", True],
            ["Gimana cara hitung nilai IPLM biar jadi angka 0–100?", "Formal", True],
            ["Bedanya dimensi kepatuhan sama kinerja apa ya?", "Formal", True],
            ["Kalau anggaran BOS, yang dihitung bagian mana?", "Formal", True],
            ["Siapa yang ngumpulin data di daerah dan gimana verifikasinya?", "Formal", True],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()