irhamni committed on
Commit
579547b
·
verified ·
1 Parent(s): 0e42f0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -140
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py — IPLM Chatbot (UI sederhana ala GPT)
2
  import os, re, json, pickle, hashlib
3
  from pathlib import Path
4
  import gradio as gr
@@ -6,160 +6,88 @@ import numpy as np
6
  from sklearn.neighbors import NearestNeighbors
7
  from sentence_transformers import SentenceTransformer
8
 
9
- # =================== Konfigurasi lewat ENV ===================
10
  DATA_PATH = Path(os.getenv("DATA_PATH", "IPLM_QnA_Chatbot.jsonl"))
11
  EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
12
- LOCAL_MODEL= os.getenv("LOCAL_MODEL", "microsoft/phi-2") # model lokal (CPU)
13
  TOP_K = int(os.getenv("TOP_K", "4"))
14
  TEMPERATURE= float(os.getenv("TEMPERATURE", "0.2"))
15
  MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
16
- THRESHOLD = float(os.getenv("THRESHOLD", "0.60")) # ambil jawaban langsung jika skor >= threshold
17
- SHOW_SOURCES = os.getenv("SHOW_SOURCES", "false").lower() == "true" # set true jika ingin tampilkan sumber terdekat
18
 
19
  SYSTEM_PROMPT = (
20
- "You are an Indonesian librarian assistant. Jawab singkat, akurat, dan sopan. "
21
- "Jawab HANYA berdasarkan konteks yang diberikan. Jika tidak ada jawabannya, "
22
- "balas persis: Data tidak tersedia."
 
23
  )
24
 
25
- # =================== Utilitas ===================
26
- def norm(s: str) -> str:
27
- if s is None: return ""
28
- return re.sub(r"\s+", " ", str(s).strip())
29
-
30
- def dataset_hash(rows) -> str:
31
- m = hashlib.md5()
32
- for r in rows:
33
- m.update((norm(r.get("question","")) + "|" + norm(r.get("answer",""))).encode("utf-8"))
34
  return m.hexdigest()
35
 
36
- def load_jsonl(path: Path):
37
- if not path.exists():
38
- raise FileNotFoundError(f"JSONL tidak ditemukan: {path.resolve()}")
39
- rows = []
40
- with path.open("r", encoding="utf-8") as f:
41
  for line in f:
42
  if not line.strip(): continue
43
- obj = json.loads(line)
44
- q = obj.get("question") or obj.get("pertanyaan") or obj.get("q")
45
- a = obj.get("answer") or obj.get("jawaban") or obj.get("a")
46
- if q and a: rows.append({"question": norm(q), "answer": norm(a)})
47
- if not rows:
48
- raise ValueError("JSONL kosong atau tidak ada pasangan 'question'/'answer'.")
49
- # dedup by question
50
- seen, uniq = set(), []
51
- for r in rows:
52
- if r["question"] in seen: continue
53
- seen.add(r["question"]); uniq.append(r)
54
- return uniq
55
 
56
- # =================== Retriever ===================
57
  class FAQIndex:
58
- def __init__(self):
59
- self.rows=None; self.model=None; self.emb=None; self.nn=None
60
-
61
- def build(self, rows, force=False):
62
- cache_emb = Path("embeddings.pkl")
63
- cache_meta = Path("meta.json")
64
- self.rows = rows
65
- if not force and cache_emb.exists() and cache_meta.exists():
66
- try:
67
- meta = json.loads(cache_meta.read_text(encoding="utf-8"))
68
- if meta.get("hash")==dataset_hash(rows) and meta.get("emb_model")==EMB_MODEL:
69
- cached = pickle.loads(cache_emb.read_bytes())
70
- self.emb, self.nn = cached["emb"], cached["nn"]
71
- if self.model is None: self.model = SentenceTransformer(EMB_MODEL)
72
- return
73
- except Exception:
74
- pass
75
- self.model = SentenceTransformer(EMB_MODEL)
76
- qs = [r["question"] for r in rows]
77
- self.emb = self.model.encode(qs, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False)
78
- self.nn = NearestNeighbors(n_neighbors=min(10, len(qs)), metric="cosine").fit(self.emb)
79
- cache_emb.write_bytes(pickle.dumps({"emb": self.emb, "nn": self.nn}))
80
- cache_meta.write_text(json.dumps({"hash": dataset_hash(rows), "emb_model": EMB_MODEL}, ensure_ascii=False))
81
-
82
- def retrieve(self, query: str, top_k: int):
83
- if not query.strip(): return []
84
- qv = self.model.encode([query], normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False)
85
- dists, idxs = self.nn.kneighbors(qv, n_neighbors=min(top_k, len(self.rows)))
86
- sims = 1.0 - dists[0]
87
- out = []
88
- for i, sim in zip(idxs[0], sims):
89
- r = self.rows[int(i)]
90
- out.append({"question": r["question"], "answer": r["answer"], "score": float(sim)})
91
- return out
92
-
93
- # =================== Local LLM (CPU) ===================
94
- _local_pipe = None
95
- def generate_with_local(prompt: str, temperature=TEMPERATURE, max_tokens=MAX_TOKENS):
96
  global _local_pipe
97
- try:
98
- if _local_pipe is None:
99
- import torch
100
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
101
- tok = AutoTokenizer.from_pretrained(LOCAL_MODEL)
102
- model = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL, torch_dtype=torch.float32)
103
- _local_pipe = pipeline("text-generation", model=model, tokenizer=tok, device=-1) # CPU
104
- outs = _local_pipe(prompt, do_sample=True, temperature=float(temperature),
105
- max_new_tokens=int(max_tokens), return_full_text=False)
106
- if isinstance(outs, list) and outs and "generated_text" in outs[0]:
107
- return outs[0]["generated_text"]
108
- return str(outs)
109
- except Exception as e:
110
- return f"❌ Gagal menjalankan model lokal: {e}"
111
-
112
- # =================== RAG (deterministic generatif bila perlu) ===================
113
- def build_context(hits):
114
- return "\n\n".join([f"[DOC {i}] {h['answer']}" for i, h in enumerate(hits, 1)])
115
-
116
- def answer_query(user_msg: str) -> str:
117
- hits = faq.retrieve(user_msg, top_k=TOP_K)
118
- if not hits:
119
- return "Data tidak tersedia."
120
-
121
- # Deterministic: kalau yakin pakai jawaban sumber
122
- if hits[0]["score"] >= THRESHOLD:
123
- result = hits[0]['answer']
124
- if SHOW_SOURCES:
125
- bullets = "\n".join([f"- ({h['score']:.2f}) {h['question']}" for h in hits])
126
- result += f"\n\n**Sumber terdekat:**\n{bullets}"
127
- return result
128
-
129
- # Jika kurang yakin → rangkum dengan LLM lokal
130
- context = build_context(hits)
131
- prompt = (
132
- f"SISTEM: {SYSTEM_PROMPT}\n\n"
133
- f"KONTEKS:\n{context}\n\n"
134
- f"PERTANYAAN:\n{user_msg}\n\n"
135
- "Instruksi: Jawab singkat dan HANYA berdasarkan KONTEKS di atas. "
136
- "Jika tidak ada jawabannya, balas persis: Data tidak tersedia."
137
- )
138
- result = generate_with_local(prompt, temperature=TEMPERATURE, max_tokens=MAX_TOKENS)
139
- if SHOW_SOURCES:
140
- bullets = "\n".join([f"- ({h['score']:.2f}) {h['question']}" for h in hits])
141
- result += f"\n\n**Sumber terdekat (lokal):**\n{bullets}"
142
- return result
143
-
144
- # =================== Load data & index ===================
145
- faq = FAQIndex()
146
- _rows = load_jsonl(DATA_PATH)
147
- faq.build(_rows, force=False)
148
-
149
- # =================== UI minimal ===================
150
- def chat_fn(message, history):
151
- return answer_query(message)
152
-
153
- with gr.Blocks(title="IPLM Chatbot") as demo:
154
- gr.Markdown("### 📚 IPLM Chatbot\nTanya apa saja tentang **IPLM**. (UI sengaja disederhanakan)")
155
- gr.ChatInterface(
156
- fn=chat_fn,
157
- title="",
158
- description="",
159
- examples=["Apa itu IPLM?", "Bagaimana menghitung IPLM?", "Apa saja dimensi IPLM?"],
160
- cache_examples=False,
161
- autofocus=True,
162
- )
163
 
164
- if __name__ == "__main__":
165
  demo.launch()
 
1
+ # app.py — versi super simpel ala ChatGPT
2
  import os, re, json, pickle, hashlib
3
  from pathlib import Path
4
  import gradio as gr
 
6
  from sklearn.neighbors import NearestNeighbors
7
  from sentence_transformers import SentenceTransformer
8
 
9
+ # ========== Konfigurasi ==========
10
  DATA_PATH = Path(os.getenv("DATA_PATH", "IPLM_QnA_Chatbot.jsonl"))
11
  EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
12
+ LOCAL_MODEL= os.getenv("LOCAL_MODEL", "google/gemma-2b-it") # model lokal gratis & ringan
13
  TOP_K = int(os.getenv("TOP_K", "4"))
14
  TEMPERATURE= float(os.getenv("TEMPERATURE", "0.2"))
15
  MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
16
+ THRESHOLD = float(os.getenv("THRESHOLD", "0.6"))
 
17
 
18
  SYSTEM_PROMPT = (
19
+ "You are an Indonesian librarian assistant. "
20
+ "Jawab singkat, akurat, dan sopan. "
21
+ "Jawab HANYA berdasarkan konteks yang diberikan. "
22
+ "Jika tidak ada jawabannya, balas persis: Data tidak tersedia."
23
  )
24
 
25
+ # ========== Utils ==========
26
def norm(s):
    """Collapse whitespace runs in *s* to single spaces and trim both ends.

    ``None`` becomes the empty string. Every other value is converted with
    ``str()`` first, so falsy non-None values such as ``0`` keep their text
    instead of being erased.
    """
    text = "" if s is None else str(s)
    return re.sub(r"\s+", " ", text.strip())
27
def dataset_hash(rows):
    """Return an MD5 hex digest that fingerprints the Q/A pairs in *rows*.

    Args:
        rows: iterable of dicts with string values under the keys
            ``"question"`` and ``"answer"``.

    Returns:
        A 32-character hexadecimal string.

    Raises:
        KeyError: if a row lacks ``"question"`` or ``"answer"``.

    MD5 is used only as a change-detection fingerprint here, not for
    security. Digests are only comparable with other digests produced by
    this same function.
    """
    digest = hashlib.md5()
    for row in rows:
        for field in (row["question"], row["answer"]):
            payload = field.encode("utf-8")
            # Length-prefix every field so field and record boundaries are
            # unambiguous: plain concatenation lets different datasets feed
            # identical bytes (e.g. "a|b" + "c|d" versus "a|bc|d").
            digest.update(f"{len(payload)}:".encode("ascii"))
            digest.update(payload)
    return digest.hexdigest()
31
 
32
def load_jsonl(path: Path):
    """Read question/answer pairs from a JSON Lines file.

    Each non-blank line is parsed as JSON. A line contributes a row only
    when it is a JSON object with a truthy question (key ``"question"`` or
    ``"q"``) and a truthy answer (key ``"answer"`` or ``"a"``). Both values
    are passed through ``norm`` before being stored.

    Args:
        path: location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in file order;
        empty when no line qualifies.

    Raises:
        OSError: if the file cannot be opened (propagated from ``path.open``).
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)
            # A line can be valid JSON without being an object (list, bare
            # string, number, null). Those have no key lookup, so skip them
            # instead of failing on ``.get``.
            if not isinstance(obj, dict):
                continue
            q = obj.get("question") or obj.get("q")
            a = obj.get("answer") or obj.get("a")
            if q and a:
                rows.append({"question": norm(q), "answer": norm(a)})
    return rows
 
 
 
 
 
 
 
42
 
43
# ========== Retriever ==========
class FAQIndex:
    """Nearest-neighbour lookup over the questions of a FAQ dataset.

    ``build`` embeds every question with the sentence-transformer named by
    ``EMB_MODEL`` and fits a cosine-distance ``NearestNeighbors`` index on
    those embeddings. ``retrieve`` embeds a query the same way and returns
    the closest stored rows with a similarity score.
    """

    def __init__(self):
        self.rows = None    # list of {"question", "answer"} dicts, set by build()
        self.model = None   # SentenceTransformer instance, set by build()
        self.nn = None      # fitted NearestNeighbors index, set by build()
        self.emb = None     # embeddings of the stored questions, set by build()

    def build(self, rows):
        """Embed the ``"question"`` of every row and fit the neighbour index.

        Args:
            rows: list of dicts holding ``"question"`` and ``"answer"``.
        """
        self.rows = rows
        self.model = SentenceTransformer(EMB_MODEL)
        qs = [r["question"] for r in rows]
        self.emb = self.model.encode(
            qs,
            normalize_embeddings=True,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        self.nn = NearestNeighbors(
            n_neighbors=min(10, len(qs)),
            metric="cosine",
        ).fit(self.emb)

    def retrieve(self, query, top_k=TOP_K):
        """Return up to *top_k* stored rows closest to *query*.

        Args:
            query: user text to look up.
            top_k: maximum number of rows to return.

        Returns:
            A list of ``{"question", "answer", "score"}`` dicts in the order
            produced by ``kneighbors``; ``score`` is ``1 - cosine distance``.
            The list is empty when *query* is missing or whitespace-only, or
            when *top_k* is below 1.
        """
        # Nothing meaningful to search for: let the caller take its
        # "no hits" path instead of embedding an empty string.
        if not query or not query.strip():
            return []
        # Do not ask the index for zero or fewer neighbours.
        if top_k < 1:
            return []
        qv = self.model.encode(
            [query],
            normalize_embeddings=True,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        d, i = self.nn.kneighbors(qv, n_neighbors=min(top_k, len(self.rows)))
        sims = 1.0 - d[0]  # cosine distance -> cosine similarity
        hits = []
        for ix, s in zip(i[0], sims):
            row = self.rows[int(ix)]
            hits.append(
                {"question": row["question"], "answer": row["answer"], "score": float(s)}
            )
        return hits
57
+
58
+ faq=FAQIndex()
59
+ faq.build(load_jsonl(DATA_PATH))
60
+
61
+ # ========== Local LLM ==========
62
_local_pipe = None  # text-generation pipeline, created on the first call_local()


def call_local(prompt):
    """Generate a completion for *prompt* with the local model on CPU.

    The tokenizer, model and pipeline for ``LOCAL_MODEL`` are created on the
    first call and reused afterwards through the module-level ``_local_pipe``.

    Args:
        prompt: full prompt text handed to the text-generation pipeline.

    Returns:
        Only the newly generated text, with surrounding whitespace removed.
    """
    global _local_pipe
    if _local_pipe is None:
        # Heavy imports are only needed once, when the pipeline is built.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

        tok = AutoTokenizer.from_pretrained(LOCAL_MODEL)
        mdl = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL, torch_dtype=torch.float32)
        _local_pipe = pipeline("text-generation", model=mdl, tokenizer=tok, device=-1)

    gen_kwargs = {
        "max_new_tokens": MAX_TOKENS,
        # Without this the pipeline returns prompt + completion, so the
        # system prompt and retrieved context would be echoed to the user.
        "return_full_text": False,
    }
    if TEMPERATURE > 0:
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = TEMPERATURE
    else:
        # Sampling needs a strictly positive temperature; a configured
        # value of 0 (or below) is treated as greedy decoding.
        gen_kwargs["do_sample"] = False

    out = _local_pipe(prompt, **gen_kwargs)
    return out[0]["generated_text"].strip()
73
+
74
+ # ========== Orchestrator ==========
75
def answer_query(msg, history):
    """Answer one chat message from the FAQ index, using the local LLM as fallback.

    Args:
        msg: the user's message text.
        history: earlier chat turns; accepted because this function is used
            as the ``gr.ChatInterface`` callback, but not read here.

    Returns:
        The stored answer of the best hit when its score reaches
        ``THRESHOLD``; otherwise text generated by ``call_local`` from the
        retrieved answers; or ``"Data tidak tersedia."`` when the message is
        blank or nothing is retrieved.
    """
    # A blank message has nothing to look up; skip the embedding and any
    # costly local generation.
    if not msg or not msg.strip():
        return "Data tidak tersedia."

    hits = faq.retrieve(msg, TOP_K)
    if not hits:
        return "Data tidak tersedia."

    best = hits[0]
    if best["score"] >= THRESHOLD:
        # Confident match: return the stored answer verbatim.
        return best["answer"]

    # Low-confidence match: let the model answer from the retrieved answers.
    ctx = "\n".join(f"- {h['answer']}" for h in hits)
    prompt = f"{SYSTEM_PROMPT}\n\nKONTEKS:\n{ctx}\n\nPERTANYAAN: {msg}\n\nJAWAB:"
    return call_local(prompt)
83
+
84
+ # ========== UI Chat Only ==========
85
+ demo=gr.ChatInterface(
86
+ fn=answer_query,
87
+ title="📚 IPLM Chatbot",
88
+ description="Tanya apa saja tentang IPLM. Jawaban hanya berdasarkan data JSONL.",
89
+ examples=["Apa itu IPLM?","Bagaimana menghitung IPLM?","Apa saja dimensi IPLM?"]
90
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ if __name__=="__main__":
93
  demo.launch()