Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
import os, re, json, pickle, hashlib, requests
|
| 3 |
from pathlib import Path
|
| 4 |
import gradio as gr
|
|
@@ -8,31 +7,35 @@ from sklearn.neighbors import NearestNeighbors
|
|
| 8 |
from sentence_transformers import SentenceTransformer
|
| 9 |
|
| 10 |
# =================== Config ===================
|
| 11 |
-
DATA_PATH = Path(os.getenv("DATA_PATH", "IPLM_QnA_Chatbot.jsonl")) #
|
| 12 |
CACHE_EMB = Path("embeddings.pkl")
|
| 13 |
CACHE_META = Path("meta.json")
|
| 14 |
|
| 15 |
-
# Embedding model
|
| 16 |
EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
| 17 |
|
| 18 |
-
# LLM endpoint (
|
| 19 |
HF_CHAT_URL = os.getenv("HF_CHAT_URL", "https://api-inference.huggingface.co/v1/chat/completions")
|
| 20 |
HF_TOKEN = os.getenv("HF_TOKEN", "")
|
| 21 |
-
LLM_MODEL = os.getenv("LLM_MODEL", "
|
| 22 |
|
| 23 |
TOP_K_DEFAULT = int(os.getenv("TOP_K_DEFAULT", "4"))
|
| 24 |
TEMPERATURE_DEFAULT = float(os.getenv("TEMPERATURE_DEFAULT", "0.3"))
|
| 25 |
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
|
| 26 |
|
| 27 |
-
SYSTEM_PROMPT = os.getenv(
|
| 28 |
-
"
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
)
|
| 32 |
|
| 33 |
# =================== Utils ===================
|
| 34 |
def norm(s: str) -> str:
|
| 35 |
-
if s is None:
|
|
|
|
| 36 |
s = str(s).strip()
|
| 37 |
s = re.sub(r"\s+", " ", s)
|
| 38 |
return s
|
|
@@ -50,20 +53,21 @@ def load_jsonl(path: Path) -> list:
|
|
| 50 |
with path.open("r", encoding="utf-8") as f:
|
| 51 |
for line in f:
|
| 52 |
line = line.strip()
|
| 53 |
-
if not line:
|
|
|
|
| 54 |
obj = json.loads(line)
|
| 55 |
-
#
|
| 56 |
q = obj.get("question") or obj.get("pertanyaan") or obj.get("q")
|
| 57 |
-
a = obj.get("answer")
|
| 58 |
if q and a:
|
| 59 |
rows.append({"question": norm(q), "answer": norm(a)})
|
| 60 |
if not rows:
|
| 61 |
raise ValueError("JSONL kosong atau tidak mengandung pasangan 'question'/'answer'.")
|
| 62 |
-
#
|
| 63 |
seen = set()
|
| 64 |
uniq = []
|
| 65 |
for r in rows:
|
| 66 |
-
if r["question"] in seen:
|
| 67 |
continue
|
| 68 |
seen.add(r["question"])
|
| 69 |
uniq.append(r)
|
|
@@ -79,32 +83,46 @@ class FAQIndex:
|
|
| 79 |
|
| 80 |
def build(self, rows: list, force=False):
|
| 81 |
self.rows = rows
|
| 82 |
-
#
|
| 83 |
if not force and CACHE_EMB.exists() and CACHE_META.exists():
|
| 84 |
try:
|
| 85 |
meta = json.loads(CACHE_META.read_text(encoding="utf-8"))
|
| 86 |
if meta.get("hash") == dataset_hash(rows) and meta.get("emb_model") == EMB_MODEL:
|
| 87 |
cached = pickle.loads(CACHE_EMB.read_bytes())
|
| 88 |
self.emb = cached["emb"]
|
| 89 |
-
self.nn
|
| 90 |
if self.model is None:
|
| 91 |
self.model = SentenceTransformer(EMB_MODEL)
|
| 92 |
return
|
| 93 |
except Exception:
|
| 94 |
pass
|
| 95 |
-
#
|
| 96 |
self.model = SentenceTransformer(EMB_MODEL)
|
| 97 |
-
# encode
|
| 98 |
qas = [f"Q: {r['question']}\nA: {r['answer']}" for r in rows]
|
| 99 |
-
self.emb = self.model.encode(
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
CACHE_EMB.write_bytes(pickle.dumps({"emb": self.emb, "nn": self.nn}))
|
| 102 |
-
CACHE_META.write_text(
|
|
|
|
|
|
|
| 103 |
|
| 104 |
def retrieve(self, query: str, top_k: int = TOP_K_DEFAULT):
|
| 105 |
if not query.strip():
|
| 106 |
return []
|
| 107 |
-
q_vec = self.model.encode(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
dists, idxs = self.nn.kneighbors(q_vec, n_neighbors=min(top_k, len(self.rows)))
|
| 109 |
sims = 1.0 - dists[0]
|
| 110 |
out = []
|
|
@@ -116,7 +134,7 @@ class FAQIndex:
|
|
| 116 |
# =================== LLM Caller ===================
|
| 117 |
def call_hf_chat(messages, temperature=TEMPERATURE_DEFAULT, max_tokens=MAX_TOKENS):
|
| 118 |
if not HF_TOKEN:
|
| 119 |
-
#
|
| 120 |
return "โ ๏ธ HF_TOKEN belum diatur. Buka Settings โ Secrets dan tambahkan HF_TOKEN agar LLM aktif."
|
| 121 |
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
|
| 122 |
payload = {
|
|
@@ -124,7 +142,7 @@ def call_hf_chat(messages, temperature=TEMPERATURE_DEFAULT, max_tokens=MAX_TOKEN
|
|
| 124 |
"messages": messages,
|
| 125 |
"temperature": float(temperature),
|
| 126 |
"max_tokens": int(max_tokens),
|
| 127 |
-
"stream": False
|
| 128 |
}
|
| 129 |
r = requests.post(HF_CHAT_URL, headers=headers, json=payload, timeout=90)
|
| 130 |
try:
|
|
@@ -148,7 +166,14 @@ def rag_answer(user_msg, top_k=TOP_K_DEFAULT, temperature=TEMPERATURE_DEFAULT):
|
|
| 148 |
context = build_context(hits)
|
| 149 |
messages = [
|
| 150 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 151 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
]
|
| 153 |
out = call_hf_chat(messages, temperature=float(temperature), max_tokens=MAX_TOKENS)
|
| 154 |
bullets = "\n".join([f"- ({h['score']:.2f}) {h['question']}" for h in hits])
|
|
@@ -165,8 +190,10 @@ def upload_jsonl(file_obj):
|
|
| 165 |
return gr.update(value="Tidak ada file.")
|
| 166 |
tmp = Path(file_obj.name)
|
| 167 |
tmp.replace(DATA_PATH)
|
| 168 |
-
if CACHE_EMB.exists():
|
| 169 |
-
|
|
|
|
|
|
|
| 170 |
global rows, faq
|
| 171 |
rows = load_jsonl(DATA_PATH)
|
| 172 |
faq = FAQIndex()
|
|
@@ -175,19 +202,26 @@ def upload_jsonl(file_obj):
|
|
| 175 |
|
| 176 |
# =================== UI ===================
|
| 177 |
with gr.Blocks(title="RAG + LLM (JSONL)") as demo:
|
| 178 |
-
gr.Markdown(
|
| 179 |
-
|
|
|
|
|
|
|
| 180 |
with gr.Row():
|
| 181 |
with gr.Column(scale=2):
|
| 182 |
chat = gr.ChatInterface(
|
| 183 |
fn=lambda msg, hist, k, t: rag_answer(msg, top_k=int(k), temperature=float(t)),
|
| 184 |
additional_inputs=[
|
| 185 |
gr.Slider(1, 10, value=TOP_K_DEFAULT, step=1, label="Top-K dokumen"),
|
| 186 |
-
gr.Slider(0.0, 1.0, value=TEMPERATURE_DEFAULT, step=0.05, label="Temperatur")
|
| 187 |
],
|
| 188 |
title="Asisten Perpustakaan (RAG)",
|
| 189 |
description="Jawab *berdasarkan konteks* dari dokumen JSONL Anda.",
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
)
|
| 192 |
with gr.Column(scale=1):
|
| 193 |
gr.Markdown("### ๐ Perbarui Basis Data")
|
|
@@ -195,5 +229,6 @@ with gr.Blocks(title="RAG + LLM (JSONL)") as demo:
|
|
| 195 |
out = gr.Textbox(label="Status", interactive=False)
|
| 196 |
uploader.change(fn=upload_jsonl, inputs=uploader, outputs=out)
|
| 197 |
gr.Markdown("Set **HF_TOKEN** di Settings โ Secrets untuk mengaktifkan LLM.")
|
|
|
|
| 198 |
if __name__ == "__main__":
|
| 199 |
demo.launch()
|
|
|
|
|
|
|
| 1 |
import os, re, json, pickle, hashlib, requests
|
| 2 |
from pathlib import Path
|
| 3 |
import gradio as gr
|
|
|
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
|
| 9 |
# =================== Config ===================
DATA_PATH = Path(os.getenv("DATA_PATH", "IPLM_QnA_Chatbot.jsonl"))  # your JSONL Q&A file
CACHE_EMB = Path("embeddings.pkl")  # pickled cache: embeddings + fitted NN index
CACHE_META = Path("meta.json")      # cache-validation metadata (dataset hash, model name)

# Embedding model for retrieval (fast & accurate)
EMB_MODEL = os.getenv("EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# LLM endpoint (Inference API / TGI-compatible / OpenAI-compatible route)
HF_CHAT_URL = os.getenv("HF_CHAT_URL", "https://api-inference.huggingface.co/v1/chat/completions")
HF_TOKEN = os.getenv("HF_TOKEN", "")
LLM_MODEL = os.getenv("LLM_MODEL", "Qwen/Qwen2.5-7B-Instruct")  # public (non-gated) default

TOP_K_DEFAULT = int(os.getenv("TOP_K_DEFAULT", "4"))
TEMPERATURE_DEFAULT = float(os.getenv("TEMPERATURE_DEFAULT", "0.3"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))

# Overridable via the SYSTEM_PROMPT env var; the default (partly Indonesian)
# instructs the model to answer concisely and ONLY from the provided context.
SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    (
        "You are an Indonesian librarian assistant. Jawab ringkas, akurat, dan sopan. "
        "Gunakan HANYA informasi dari konteks yang diberikan. Jika konteks tidak memuat jawabannya, "
        "katakan bahwa data tidak tersedia di basis pengetahuan."
    ),
)
|
| 34 |
|
| 35 |
# =================== Utils ===================
|
| 36 |
def norm(s: str) -> str:
    """Normalize a value to a single-spaced, trimmed string.

    ``None`` maps to the empty string; any other value is coerced with
    ``str`` first, stripped, and internal whitespace runs are collapsed
    to one space each.
    """
    if s is None:
        return ""
    collapsed = re.sub(r"\s+", " ", str(s).strip())
    return collapsed
|
|
|
|
| 53 |
with path.open("r", encoding="utf-8") as f:
|
| 54 |
for line in f:
|
| 55 |
line = line.strip()
|
| 56 |
+
if not line:
|
| 57 |
+
continue
|
| 58 |
obj = json.loads(line)
|
| 59 |
+
# dukung berbagai nama key
|
| 60 |
q = obj.get("question") or obj.get("pertanyaan") or obj.get("q")
|
| 61 |
+
a = obj.get("answer") or obj.get("jawaban") or obj.get("a")
|
| 62 |
if q and a:
|
| 63 |
rows.append({"question": norm(q), "answer": norm(a)})
|
| 64 |
if not rows:
|
| 65 |
raise ValueError("JSONL kosong atau tidak mengandung pasangan 'question'/'answer'.")
|
| 66 |
+
# buang duplikat berdasarkan question
|
| 67 |
seen = set()
|
| 68 |
uniq = []
|
| 69 |
for r in rows:
|
| 70 |
+
if r["question"] in seen:
|
| 71 |
continue
|
| 72 |
seen.add(r["question"])
|
| 73 |
uniq.append(r)
|
|
|
|
| 83 |
|
| 84 |
def build(self, rows: list, force=False):
    """Build (or load from the on-disk cache) the embedding matrix and NN index.

    rows: list of {"question": str, "answer": str} dicts.
    force: when True, bypass the cache and re-embed everything.
    """
    self.rows = rows
    # Try to reuse the cache: it is valid only when both the dataset hash and
    # the embedding model name recorded in meta.json match the current ones.
    if not force and CACHE_EMB.exists() and CACHE_META.exists():
        try:
            meta = json.loads(CACHE_META.read_text(encoding="utf-8"))
            if meta.get("hash") == dataset_hash(rows) and meta.get("emb_model") == EMB_MODEL:
                # NOTE(review): pickle.loads on a local cache file — acceptable
                # for files this app wrote itself, but unpickling is unsafe on
                # untrusted input; confirm the cache path is never user-supplied.
                cached = pickle.loads(CACHE_EMB.read_bytes())
                self.emb = cached["emb"]
                self.nn = cached["nn"]
                # The encoder is still needed at query time even on a cache hit.
                if self.model is None:
                    self.model = SentenceTransformer(EMB_MODEL)
                return
        except Exception:
            # Corrupt/incompatible cache: fall through and rebuild from scratch.
            pass
    # Build anew.
    self.model = SentenceTransformer(EMB_MODEL)
    # Encode the concatenated Q+A text so retrieval can match on either part.
    qas = [f"Q: {r['question']}\nA: {r['answer']}" for r in rows]
    self.emb = self.model.encode(
        qas,
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=False,
    )
    # Cosine metric on normalized vectors; kneighbors later returns
    # distance = 1 - cosine similarity.
    self.nn = NearestNeighbors(
        n_neighbors=min(10, len(qas)), metric="cosine"
    ).fit(self.emb)
    # Persist embeddings + fitted index, and the metadata used to validate them.
    CACHE_EMB.write_bytes(pickle.dumps({"emb": self.emb, "nn": self.nn}))
    CACHE_META.write_text(
        json.dumps({"hash": dataset_hash(rows), "emb_model": EMB_MODEL}, ensure_ascii=False)
    )
|
| 116 |
|
| 117 |
def retrieve(self, query: str, top_k: int = TOP_K_DEFAULT):
|
| 118 |
if not query.strip():
|
| 119 |
return []
|
| 120 |
+
q_vec = self.model.encode(
|
| 121 |
+
[query],
|
| 122 |
+
normalize_embeddings=True,
|
| 123 |
+
convert_to_numpy=True,
|
| 124 |
+
show_progress_bar=False,
|
| 125 |
+
)
|
| 126 |
dists, idxs = self.nn.kneighbors(q_vec, n_neighbors=min(top_k, len(self.rows)))
|
| 127 |
sims = 1.0 - dists[0]
|
| 128 |
out = []
|
|
|
|
| 134 |
# =================== LLM Caller ===================
|
| 135 |
def call_hf_chat(messages, temperature=TEMPERATURE_DEFAULT, max_tokens=MAX_TOKENS):
|
| 136 |
if not HF_TOKEN:
|
| 137 |
+
# izinkan fallback non-LLM agar Space tetap hidup
|
| 138 |
return "โ ๏ธ HF_TOKEN belum diatur. Buka Settings โ Secrets dan tambahkan HF_TOKEN agar LLM aktif."
|
| 139 |
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
|
| 140 |
payload = {
|
|
|
|
| 142 |
"messages": messages,
|
| 143 |
"temperature": float(temperature),
|
| 144 |
"max_tokens": int(max_tokens),
|
| 145 |
+
"stream": False,
|
| 146 |
}
|
| 147 |
r = requests.post(HF_CHAT_URL, headers=headers, json=payload, timeout=90)
|
| 148 |
try:
|
|
|
|
| 166 |
context = build_context(hits)
|
| 167 |
messages = [
|
| 168 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 169 |
+
{
|
| 170 |
+
"role": "user",
|
| 171 |
+
"content": (
|
| 172 |
+
f"KONTEKS:\n{context}\n\n"
|
| 173 |
+
f"PERTANYAAN:\n{user_msg}\n\n"
|
| 174 |
+
"Instruksi: Jawab berbasis KONTEKS. Jika tidak ada di konteks, jawab 'Data tidak tersedia.' "
|
| 175 |
+
),
|
| 176 |
+
},
|
| 177 |
]
|
| 178 |
out = call_hf_chat(messages, temperature=float(temperature), max_tokens=MAX_TOKENS)
|
| 179 |
bullets = "\n".join([f"- ({h['score']:.2f}) {h['question']}" for h in hits])
|
|
|
|
| 190 |
return gr.update(value="Tidak ada file.")
|
| 191 |
tmp = Path(file_obj.name)
|
| 192 |
tmp.replace(DATA_PATH)
|
| 193 |
+
if CACHE_EMB.exists():
|
| 194 |
+
CACHE_EMB.unlink()
|
| 195 |
+
if CACHE_META.exists():
|
| 196 |
+
CACHE_META.unlink()
|
| 197 |
global rows, faq
|
| 198 |
rows = load_jsonl(DATA_PATH)
|
| 199 |
faq = FAQIndex()
|
|
|
|
| 202 |
|
| 203 |
# =================== UI ===================
|
| 204 |
with gr.Blocks(title="RAG + LLM (JSONL)") as demo:
|
| 205 |
+
gr.Markdown(
|
| 206 |
+
"# ๐ RAG + LLM โ dari JSONL Q&A\n"
|
| 207 |
+
"Masukkan pertanyaan โ sistem mengambil Q&A paling relevan โ LLM merangkum/menjawab berdasarkan konteks."
|
| 208 |
+
)
|
| 209 |
with gr.Row():
|
| 210 |
with gr.Column(scale=2):
|
| 211 |
chat = gr.ChatInterface(
|
| 212 |
fn=lambda msg, hist, k, t: rag_answer(msg, top_k=int(k), temperature=float(t)),
|
| 213 |
additional_inputs=[
|
| 214 |
gr.Slider(1, 10, value=TOP_K_DEFAULT, step=1, label="Top-K dokumen"),
|
| 215 |
+
gr.Slider(0.0, 1.0, value=TEMPERATURE_DEFAULT, step=0.05, label="Temperatur"),
|
| 216 |
],
|
| 217 |
title="Asisten Perpustakaan (RAG)",
|
| 218 |
description="Jawab *berdasarkan konteks* dari dokumen JSONL Anda.",
|
| 219 |
+
# Harus list-of-lists karena ada additional_inputs
|
| 220 |
+
examples=[
|
| 221 |
+
["Apa itu IPLM?", TOP_K_DEFAULT, TEMPERATURE_DEFAULT],
|
| 222 |
+
["Bagaimana perhitungan TGM?", TOP_K_DEFAULT, TEMPERATURE_DEFAULT],
|
| 223 |
+
["Apa saja tahap pengolahan data?", TOP_K_DEFAULT, TEMPERATURE_DEFAULT],
|
| 224 |
+
],
|
| 225 |
)
|
| 226 |
with gr.Column(scale=1):
|
| 227 |
gr.Markdown("### ๐ Perbarui Basis Data")
|
|
|
|
| 229 |
out = gr.Textbox(label="Status", interactive=False)
|
| 230 |
uploader.change(fn=upload_jsonl, inputs=uploader, outputs=out)
|
| 231 |
gr.Markdown("Set **HF_TOKEN** di Settings โ Secrets untuk mengaktifkan LLM.")
|
| 232 |
+
|
| 233 |
# Script entry point: launch the Gradio app (Spaces runs this module directly).
if __name__ == "__main__":
    demo.launch()
|