# semanticsphrase/app.py — Hugging Face Space source.
# (Removed scraped page metadata: uploader "allbibek", verified commit 1d95413 "Update app.py".)
import gradio as gr
from sentence_transformers import SentenceTransformer, CrossEncoder
from supabase import create_client
import os
from dotenv import load_dotenv
from google import genai
import pandas as pd
import time
import math
# --- Environment, model, and Supabase client initialisation ------------------
load_dotenv()

# Gemini is optional: when the key is absent, query expansion falls back to a
# local heuristic instead of calling the LLM.
GOOGLE_API_KEY = os.getenv("GEMINI_API")
if GOOGLE_API_KEY:
    gemini_client = genai.Client(api_key=GOOGLE_API_KEY)
else:
    print("⚠️ Peringatan: GOOGLE_API_KEY tidak ditemukan, Gemini akan dinonaktifkan.")
    gemini_client = None

# Bi-encoder for dense retrieval, cross-encoder for second-stage reranking.
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Log the models the Gemini account can access (debug aid at startup).
print("--- Daftar Model yang Tersedia ---")
if gemini_client:
    for m in gemini_client.models.list():
        print(f"Model: {m.name} | Name: {m.display_name}")
print("----------------------------------")

# === Supabase ===
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")
supabase = create_client(supabase_url, supabase_key)
def expand_query(query: str, num_variations: int = 3) -> str:
    """Expand a KBLI search query into multiple phrasings via the Gemini API.

    Asks Gemini for ``num_variations`` rewrites of the query (formal, casual,
    and keyword-oriented) and concatenates them onto the original query to
    widen dense-retrieval recall.

    Falls back to a local template ("Kegiatan usaha yang berkaitan dengan …")
    when the Gemini client is unavailable, the API call fails, or no
    variations can be parsed from the response.

    Note: no caching is performed — every call with a non-trivial query hits
    the API (the previous docstring claimed caching that was never implemented).
    """
    if not query.strip():
        return query
    if gemini_client is None:
        return f"Kegiatan usaha yang berkaitan dengan {query}"

    # Build the "Variasi N: ..." template from num_variations instead of
    # hard-coding three lines, so the parameter actually takes effect.
    format_lines = "\n".join(f"Variasi {i}: ..." for i in range(1, num_variations + 1))
    prompt = f"""
Anda adalah ahli dalam sistem pencarian KBLI (Klasifikasi Baku Lapangan Usaha Indonesia) 2020.
Tugas Anda adalah membuat {num_variations} variasi dari kueri berikut
untuk meningkatkan hasil pencarian.
Kueri pengguna: "{query}"
Buatkan {num_variations} variasi kueri yang:
1. Menggunakan bahasa formal atau teknis (mis. istilah industri).
2. Menggunakan bahasa sehari-hari.
3. Mengandung kata kunci relevan lain.
Format keluaran HARUS seperti ini:
{format_lines}
"""
    try:
        # Call Gemini.
        response = gemini_client.models.generate_content(
            model="gemini-3.1-flash-lite",
            contents=prompt,
        )
        text_output = response.text.strip()

        # Parse "Variasi N: <text>" lines out of the model's reply.
        variations = []
        for line in text_output.splitlines():
            if line.lower().startswith("variasi"):
                parts = line.split(":", 1)
                if len(parts) > 1:
                    variations.append(parts[1].strip())

        if not variations:
            print("[Gemini Warning] Tidak ada variasi ditemukan. Gunakan fallback.")
            return f"Kegiatan usaha yang berkaitan dengan {query}"

        # Join the original query with all variations into one search string.
        expanded = query + ". " + " ".join(variations)
        print(f"[Gemini Expand] {query} -> {expanded}")
        return expanded
    except Exception as e:
        print(f"[Gemini Error] {e}. Menggunakan fallback lokal.")
        return f"Kegiatan usaha yang berkaitan dengan {query}"
def get_embedding(text: str):
    """Return a normalized embedding (as a plain list) for Gemini-expanded text.

    Empty input short-circuits to an empty list without touching the model.
    """
    if not text:
        return []
    enriched = expand_query(text)
    return embedder.encode(enriched, normalize_embeddings=True).tolist()
# ==========================================
# ABLATION STUDY
# ==========================================
# Helper Function
def apply_sigmoid(logit):
    """Map a raw reranker logit to a probability in (0, 1).

    Numerically stable form: the naive 1 / (1 + exp(-x)) raises OverflowError
    for x below about -709, so branch on the sign and only ever call exp()
    with a non-positive argument.
    """
    if logit >= 0:
        return 1 / (1 + math.exp(-logit))
    z = math.exp(logit)
    return z / (1 + z)
def bm25_only(query: str, match_count: int = 50):
    """Lexical / full-text search only (no vectors, no Gemini, no reranker).

    Note: requires the 'search_kbli_lexical' RPC to exist in Supabase.
    """
    payload = {"query_text": query, "match_count": match_count}
    rows = supabase.rpc("search_kbli_lexical", payload).execute().data or []
    return {"results": rows[:10]}
def dense_only(query: str, match_count: int = 50):
    """Semantic vector search only (no BM25, no Gemini, no reranker)."""
    # Encode the raw query — expansion is deliberately skipped here.
    vec = embedder.encode(query, normalize_embeddings=True).tolist()
    payload = {"query_embedding": vec, "match_count": match_count}
    rows = supabase.rpc("search_kbli", payload).execute().data or []
    return {"results": rows[:10]}
def semantic_no_gemini(query: str, match_count: int = 50):
    """Semantic vector retrieval + reranker, WITHOUT Gemini query expansion."""
    expanded = query  # deliberately bypass Gemini
    vec = embedder.encode(expanded, normalize_embeddings=True).tolist()
    resp = supabase.rpc(
        "search_kbli",
        {"query_embedding": vec, "match_count": match_count},
    ).execute()
    candidates = resp.data or []
    if not candidates:
        return {"results": []}

    # Rerank each (query, judul + deskripsi) pair with the cross-encoder.
    pairs = [(expanded, c["judul"] + " " + c["deskripsi"]) for c in candidates]
    try:
        scores = reranker.predict(pairs)
    except Exception as e:
        print("Reranker error:", e)
        by_sim = sorted(candidates, key=lambda x: x.get("similarity", 0), reverse=True)
        return {"results": by_sim[:10]}

    # Min-max normalise the logits, then blend with cosine similarity 60/40.
    raw = [float(s) for s in scores]
    lo, hi = min(raw), max(raw)
    spread = hi - lo
    for cand, score in zip(candidates, raw):
        cand["rerank_score"] = score
        cand["rerank_norm"] = (score - lo) / spread if spread > 1e-9 else 0.0
        cand["hybrid_score"] = 0.6 * cand.get("similarity", 0.0) + 0.4 * cand["rerank_norm"]

    ranked = sorted(candidates, key=lambda x: x["hybrid_score"], reverse=True)
    return {"results": ranked[:10]}
def hybrid_search_no_gemini(query: str, match_count: int = 50):
    """Hybrid (BM25 + dense) retrieval + reranker, WITHOUT Gemini expansion."""
    vec = embedder.encode(query, normalize_embeddings=True).tolist()
    resp = supabase.rpc(
        "search_kbli_hybrid",
        {
            "query_text": query,
            "query_embedding": vec,
            "match_count": match_count,
            # Ignored by the SQL function but required by the RPC signature.
            "lexical_weight": 0.4,
            "dense_weight": 0.6,
        },
    ).execute()
    candidates = resp.data or []
    if not candidates:
        return {"results": []}

    pairs = [(query, c["judul"] + " " + c["deskripsi"]) for c in candidates]
    try:
        scores = reranker.predict(pairs)
    except Exception as e:
        print("Reranker error:", e)
        by_sim = sorted(candidates, key=lambda x: x.get("similarity", 0), reverse=True)
        return {"results": by_sim[:10]}

    # The reranker is the sole judge here: each logit is squashed through a
    # sigmoid and that probability alone decides the ordering.
    for cand, logit in zip(candidates, scores):
        cand["rerank_score"] = float(logit)
        cand["final_score"] = apply_sigmoid(float(logit))

    ranked = sorted(candidates, key=lambda x: x["final_score"], reverse=True)
    return {"results": ranked[:10]}
# ==========================================
# CORE APPS
# ==========================================
def fn_semantic(query: str, match_count: int = 50):
    """Dense retrieval with Gemini query expansion + cross-encoder reranking."""
    expanded = expand_query(query)
    vec = embedder.encode(expanded, normalize_embeddings=True).tolist()
    resp = supabase.rpc(
        "search_kbli",
        {"query_embedding": vec, "match_count": match_count},
    ).execute()
    candidates = resp.data or []
    if not candidates:
        return {"results": []}

    print("=== Candidates BEFORE rerank (top 10) ===")
    for c in candidates[:10]:
        print(c.get("kode"), c.get("judul")[:80], "sim=", c.get("similarity"))

    pairs = [(expanded, c["judul"] + " " + c["deskripsi"]) for c in candidates]
    try:
        scores = reranker.predict(pairs)
    except Exception as e:
        print("Reranker error:", e)
        by_sim = sorted(candidates, key=lambda x: x.get("similarity", 0), reverse=True)
        return {"results": by_sim[:10]}

    for c, s in zip(candidates, scores):
        c["rerank_score"] = float(s)

    print("=== Candidates AFTER rerank (top 10) ===")
    for c in candidates[:10]:
        print(c.get("kode"), c.get("judul")[:80], "sim=", c.get("similarity"), "rerank=", c.get("rerank_score"))

    # Min-max normalise the rerank logits, then blend 60% cosine similarity
    # with 40% normalised rerank score (single pass over the candidates).
    vals = [c["rerank_score"] for c in candidates]
    lo, hi = min(vals), max(vals)
    spread = hi - lo
    for c in candidates:
        c["rerank_norm"] = (c["rerank_score"] - lo) / spread if spread > 1e-9 else 0.0
        c["hybrid_score"] = 0.6 * c.get("similarity", 0.0) + 0.4 * c["rerank_norm"]

    ranked = sorted(candidates, key=lambda x: x["hybrid_score"], reverse=True)
    return {"results": ranked[:10]}
def hybrid_search(query: str, match_count: int = 50):
    """Full pipeline: Gemini expansion → hybrid DB retrieval → reranking."""
    # Stage 0: query expansion. The EXPANDED text feeds the dense embedding;
    # the ORIGINAL text feeds the lexical side to keep tsvector noise low.
    expanded = expand_query(query)
    vec = embedder.encode(expanded, normalize_embeddings=True).tolist()

    # Stage 1: hybrid retrieval in Supabase (tsvector + pgvector).
    resp = supabase.rpc(
        "search_kbli_hybrid",
        {
            "query_text": query,           # lexical match (tsvector)
            "query_embedding": vec,        # dense match (pgvector)
            "match_count": match_count,
            "lexical_weight": 0.2,         # lexical weight (tunable for ablation)
            "dense_weight": 0.8,           # dense weight
        },
    ).execute()
    candidates = resp.data or []
    if not candidates:
        return {"results": []}

    print("=== Candidates dari Hybrid DB BEFORE rerank (top 10) ===")
    for c in candidates[:10]:
        # 'similarity' here is already the DB-side lexical+dense blend.
        print(c.get("kode"), c.get("judul")[:80], "hybrid_db_sim=", c.get("similarity"))

    # Stage 2: cross-encoder reranking against the expanded query.
    pairs = [(expanded, c["judul"] + " " + c["deskripsi"]) for c in candidates]
    try:
        scores = reranker.predict(pairs)
    except Exception as e:
        print("Reranker error:", e)
        by_sim = sorted(candidates, key=lambda x: x.get("similarity", 0), reverse=True)
        return {"results": by_sim[:10]}

    # Min-max normalise the logits; final score is an equal blend of the DB
    # hybrid similarity and the normalised rerank score.
    vals = [float(s) for s in scores]
    lo, hi = min(vals), max(vals)
    spread = hi - lo
    for c, s in zip(candidates, vals):
        c["rerank_score"] = s
        c["rerank_norm"] = (s - lo) / spread if spread > 1e-9 else 0.0
        c["final_score"] = (0.5 * c.get("similarity", 0.0)) + (0.5 * c["rerank_norm"])

    print("=== Candidates AFTER Cross-Encoder rerank (top 10) ===")
    ranked = sorted(candidates, key=lambda x: x["final_score"], reverse=True)
    for c in ranked[:10]:
        print(c.get("kode"), c.get("judul")[:80], "final_score=", c.get("final_score"), "rerank=", c.get("rerank_score"))

    return {"results": ranked[:10]}
def search_kbli(text: str):
if not text:
return {"embedding": [], "results": []}
embedding = get_embedding(text)
response = supabase.rpc(
"search_kbli",
{"query_embedding": embedding, "match_count": 25}
).execute()
results = response.data if response.data else []
if not results:
return "<p>Tidak ditemukan hasil.</p>"
html = """
<style>
.kbli-item {
border: 1px solid #ddd;
border-radius: 8px;
padding: 10px;
margin-bottom: 8px;
transition: background 0.2s ease;
}
.kbli-item:hover {
background: #f9fafb;
}
.kbli-title {
font-weight: 600;
margin: 0;
}
.kbli-desc {
font-size: 13px;
color: #4b5563;
margin-top: 4px;
}
details {
margin-top: 16px;
border: 1px solid #ddd;
border-radius: 6px;
padding: 8px;
}
details summary {
cursor: pointer;
font-weight: 600;
color: #2563eb;
}
@media (prefers-color-scheme: dark) {
.kbli-item { border: 1px solid #374151; }
.kbli-item:hover { background: #1f2937; }
.kbli-title { color: #f3f4f6; }
.kbli-desc { color: #d1d5db; }
.kbli-item:hover .kbli-title { color: #93c5fd; }
.kbli-item:hover .kbli-desc { color: #e5e7eb; }
details { border: 1px solid #374151; }
details summary { color: #60a5fa; }
}
</style>
<div>
"""
# Top 10 == // for r in results
top_10 = results[:10]
for r in top_10:
html += f"""
<div class="kbli-item">
<p class="kbli-title">{r['kode']}{r['judul']}</p>
<p class="kbli-desc">{r['deskripsi']}</p>
</div>
"""
# Expandable for
others = results[10:]
if others:
html += "<details><summary>Lihat hasil lainnya</summary><div style='margin-top:10px;'>"
for r in others:
html += f"""
<div class="kbli-item">
<p class="kbli-title">{r['kode']}{r['judul']}</p>
<p class="kbli-desc">{r['deskripsi']}</p>
</div>
"""
html += "</div></details>"
# End
html += "</div>"
return html
def calculate_mrr(retrieved_kodes, relevant_kodes_set):
    """Reciprocal rank of the first relevant code, or 0.0 if none was retrieved."""
    for rank, kode in enumerate(retrieved_kodes, start=1):
        if kode in relevant_kodes_set:
            return 1.0 / rank
    return 0.0
def calculate_recall(retrieved_kodes, relevant_kodes_set, k=10):
    """Fraction of the relevant codes that appear in the top-k retrieved list."""
    if not relevant_kodes_set:
        return 0.0
    hits = relevant_kodes_set.intersection(retrieved_kodes[:k])
    return len(hits) / len(relevant_kodes_set)
def calculate_ndcg(retrieved_kodes, relevance_dict, k=10):
    """nDCG@k with log2 position discounting; 0.0 when the ideal DCG is zero."""
    dcg = sum(
        relevance_dict.get(kode, 0) / math.log2(pos + 2)
        for pos, kode in enumerate(retrieved_kodes[:k])
    )
    ideal = sorted(relevance_dict.values(), reverse=True)[:k]
    idcg = sum(rel / math.log2(pos + 2) for pos, rel in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0
def run_evaluation(file_obj, scenario):
    """Batch-evaluate a retrieval scenario against an uploaded ground-truth CSV.

    The CSV must contain columns query_id, query, kode_kbli, relevance. Each
    unique query is run through the selected scenario; MRR@10, Recall@10,
    nDCG@10 and latency are computed per query, then averaged. Two report
    files are written to the working directory (Excel summary + retrieval CSV).

    Returns a 4-tuple (summary dict, per-query DataFrame, excel path, csv
    path). On a missing upload, all non-message slots are None so the four
    Gradio output components each still receive a value (the original early
    return yielded only 3 values for 4 outputs, breaking the handler).
    """
    if file_obj is None:
        return "Peringatan: Silakan unggah file ground_truth.csv terlebih dahulu.", None, None, None

    df = pd.read_csv(file_obj.name)
    queries = df.groupby('query_id').first()['query'].to_dict()
    # query_id -> {kode_kbli (str): relevance grade}
    ground_truth = {
        q_id: dict(zip(group['kode_kbli'].astype(str), group['relevance']))
        for q_id, group in df.groupby('query_id')
    }

    # Dispatch table instead of an if/elif chain; unknown scenarios fall back
    # to empty results, matching the original behaviour.
    scenario_fns = {
        "BM25 Only (Lexical)": bm25_only,
        "Dense Only (Semantic)": dense_only,
        "Semantic + Reranker (No Gemini)": semantic_no_gemini,
        "Semantic + Reranker (With Gemini)": fn_semantic,
        "Hybrid + Reranker (No Gemini)": hybrid_search_no_gemini,
        "Hybrid + Reranker (With Gemini)": hybrid_search,
    }

    results_list = []
    retrieval_rows = []
    for q_id, query_text in queries.items():
        fn = scenario_fns.get(scenario)
        start_time = time.perf_counter()  # start latency timer
        response = fn(query_text, match_count=50) if fn else {"results": []}
        latency = time.perf_counter() - start_time

        retrieved_kodes = [str(r.get('kode')) for r in response.get("results", [])]
        for rank, kode in enumerate(retrieved_kodes, start=1):
            retrieval_rows.append({
                "query_id": q_id,
                "query": query_text,
                "scenario": scenario,
                "rank": rank,
                "kode_kbli": kode
            })

        rel_dict = ground_truth.get(q_id, {})
        relevant_kodes_set = {k for k, r in rel_dict.items() if r > 0}
        results_list.append({
            "Query ID": q_id,
            "Query Text": query_text,
            "MRR@10": round(calculate_mrr(retrieved_kodes, relevant_kodes_set), 4),
            "Recall@10": round(calculate_recall(retrieved_kodes, relevant_kodes_set, k=10), 4),
            "nDCG@10": round(calculate_ndcg(retrieved_kodes, rel_dict, k=10), 4),
            "Latency (sec)": round(latency, 4)  # per-query latency
        })
        if "With Gemini" in scenario:
            time.sleep(1)  # avoid Gemini API rate limits

    results_df = pd.DataFrame(results_list)
    # Aggregate averages for the summary panel.
    summary = {
        "Skenario": scenario,
        "Total Query": len(queries),
        "Avg MRR@10": round(results_df["MRR@10"].mean(), 4),
        "Avg Recall@10": round(results_df["Recall@10"].mean(), 4),
        "Avg nDCG@10": round(results_df["nDCG@10"].mean(), 4),
        "Avg Latency (sec)": round(results_df["Latency (sec)"].mean(), 4)
    }

    # Export the per-query table (Excel) and the raw retrieval rows (CSV).
    safe_scenario_name = scenario.replace(" ", "_").replace("(", "").replace(")", "").replace("+", "plus")
    output_filename = f"Evaluasi_{safe_scenario_name}.xlsx"
    results_df.to_excel(output_filename, index=False)
    retrieval_df = pd.DataFrame(retrieval_rows)
    retrieval_filename = f"retrieval_results_{safe_scenario_name}.csv"
    retrieval_df.to_csv(retrieval_filename, index=False)
    return summary, results_df, output_filename, retrieval_filename
# --- Gradio UI wiring --------------------------------------------------------
with gr.Blocks(css="""
.title {font-size: 22px; font-weight: 700; color: #111827; margin-bottom: 4px;}
.desc {font-size: 14px; color: #6b7280; margin-bottom: 16px;}
button.gr-button {
border-radius: 6px;
}
button.gr-button-primary, button.gr-button-secondary {
border-radius: 6px;
}
.btn-row {display: flex; gap: 8px;}
.btn-row > * {flex: 1;}
.btn-row-search {display: flex; gap: 8px;}
.btn-row-search > * {flex: none;}
@media (max-width: 640px) {
.btn-row, .btn-row-search {flex-direction: column-reverse;}
.btn-row > button,
.btn-row-search > button {
width: 100% !important;
flex: none;
}
}
""") as demo:
    gr.Markdown("<div class='title'>Semantic KBLI Search</div>")
    gr.Markdown("<div class='desc'>Cari kode KBLI dengan semantic search (Embedding + Matching)</div>")

    # Tab 1: raw embedding inspection.
    with gr.Tab("Embedding Only"):
        with gr.Row():
            with gr.Column(scale=1):
                inp1 = gr.Textbox(label="Masukkan teks")
                with gr.Row(elem_classes="btn-row"):
                    btn_clear1 = gr.Button("Clear", variant="secondary")
                    btn_submit1 = gr.Button("Submit", variant="primary")
            with gr.Column(scale=1):
                out1 = gr.JSON(label="Embedding Vector")
        inp1.submit(get_embedding, inp1, out1, api_name="get_embedding")
        btn_clear1.click(lambda: ("", None), None, [inp1, out1])
        btn_submit1.click(get_embedding, inp1, out1, api_name="get_embedding")

    # Tab 2: semantic retrieval (expansion + rerank) as JSON.
    with gr.Tab("Embedding Fine-tuned"):
        with gr.Row():
            with gr.Column(scale=1):
                inp2 = gr.Textbox(label="Masukkan teks")
                with gr.Row(elem_classes="btn-row"):
                    btn_clear2 = gr.Button("Clear", variant="secondary")
                    btn_submit2 = gr.Button("Submit", variant="primary")
            with gr.Column(scale=1):
                out2 = gr.JSON(label="Embedding Vector")
        inp2.submit(fn_semantic, inp2, out2, api_name="fn_semantic")
        btn_clear2.click(lambda: ("", None), None, [inp2, out2])
        btn_submit2.click(fn_semantic, inp2, out2, api_name="fn_semantic")

    # Tab 3: human-readable HTML search results.
    with gr.Tab("Search KBLI"):
        inp3 = gr.Textbox(label="Masukkan teks")
        with gr.Row(elem_classes="btn-row-search"):
            btn_clear3 = gr.Button("Clear", variant="secondary")
            btn_submit3 = gr.Button("Submit", variant="primary")
        out3 = gr.HTML(label="Hasil Pencarian Semantic")
        inp3.submit(search_kbli, inp3, out3, api_name="search_kbli")
        btn_clear3.click(lambda: ("", None), None, [inp3, out3])
        btn_submit3.click(search_kbli, inp3, out3, api_name="search_kbli")

    # Tab 4: the full hybrid pipeline.
    with gr.Tab("Hybrid Search (Final)"):
        with gr.Row():
            with gr.Column(scale=1):
                inp4 = gr.Textbox(label="Masukkan teks")
                with gr.Row(elem_classes="btn-row"):
                    btn_clear4 = gr.Button("Clear", variant="secondary")
                    btn_submit4 = gr.Button("Submit", variant="primary")
            with gr.Column(scale=1):
                out4 = gr.JSON(label="Hasil Hybrid Search")
        inp4.submit(hybrid_search, inp4, out4, api_name="hybrid_search")
        btn_clear4.click(lambda: ("", None), None, [inp4, out4])
        btn_submit4.click(hybrid_search, inp4, out4, api_name="hybrid_search")

    # Tab 5: one button per ablation variant, each exposed as an API endpoint.
    with gr.Tab("Ablation Endpoints (API)"):
        gr.Markdown("### Individual Model")
        with gr.Row():
            with gr.Column(scale=1):
                inp5 = gr.Textbox(label="Masukkan kueri teks")
                with gr.Row(elem_classes="btn-row"):
                    btn_bm25 = gr.Button("BM25 Only", variant="primary")
                    btn_dense = gr.Button("Dense Only", variant="primary")
                with gr.Row(elem_classes="btn-row"):
                    btn_sem_no_gem = gr.Button("Semantic (No Gemini)", variant="primary")
                    btn_hyb_no_gem = gr.Button("Hybrid (No Gemini)", variant="primary")
                with gr.Row():
                    btn_clear5 = gr.Button("Clear", variant="secondary")
            with gr.Column(scale=1):
                out5 = gr.JSON(label="Hasil Pencarian Ablation")
        btn_clear5.click(lambda: ("", None), None, [inp5, out5])
        btn_bm25.click(bm25_only, inputs=[inp5], outputs=[out5], api_name="bm25_only")
        btn_dense.click(dense_only, inputs=[inp5], outputs=[out5], api_name="dense_only")
        btn_sem_no_gem.click(semantic_no_gemini, inputs=[inp5], outputs=[out5], api_name="semantic_no_gemini")
        btn_hyb_no_gem.click(hybrid_search_no_gemini, inputs=[inp5], outputs=[out5], api_name="hybrid_search_no_gemini")
        inp5.submit(dense_only, inputs=[inp5], outputs=[out5])

    # Tab 6: batch evaluation over an uploaded ground-truth CSV.
    with gr.Tab("Ablation Study"):
        gr.Markdown("### Metrics & Latency")
        gr.Markdown("Unggah file `ground_truth.csv` Anda untuk menjalankan *batch testing* dan membandingkan skenario.")
        with gr.Row():
            with gr.Column(scale=1):
                eval_file = gr.File(label="Upload ground_truth.csv", file_types=[".csv"])
                eval_scenario = gr.Dropdown(
                    choices=[
                        "BM25 Only (Lexical)",
                        "Dense Only (Semantic)",
                        "Semantic + Reranker (No Gemini)",
                        "Semantic + Reranker (With Gemini)",
                        "Hybrid + Reranker (No Gemini)",
                        "Hybrid + Reranker (With Gemini)"
                    ],
                    label="Pilih Skenario Evaluasi"
                )
                btn_run_eval = gr.Button("Jalankan Evaluasi Otomatis", variant="primary")
            with gr.Column(scale=1):
                eval_summary = gr.JSON(label="Ringkasan Skor Rata-rata & Latensi")
                eval_download = gr.File(label="Download Laporan (Excel)")
                eval_retrieval_download = gr.File(label="Download Retrieval Results (CSV)")
        eval_table = gr.Dataframe(label="Detail Per-Kueri")
        btn_run_eval.click(
            run_evaluation,
            inputs=[eval_file, eval_scenario],
            outputs=[eval_summary, eval_table, eval_download, eval_retrieval_download]
        )

if __name__ == "__main__":
    demo.queue().launch(show_error=True)