import gradio as gr from sentence_transformers import SentenceTransformer, CrossEncoder from supabase import create_client import os from dotenv import load_dotenv from google import genai import pandas as pd import time import math load_dotenv() GOOGLE_API_KEY = os.getenv("GEMINI_API") if not GOOGLE_API_KEY: print("⚠️ Peringatan: GOOGLE_API_KEY tidak ditemukan, Gemini akan dinonaktifkan.") gemini_client = None else: gemini_client = genai.Client(api_key=GOOGLE_API_KEY) embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") print("--- Daftar Model yang Tersedia ---") if gemini_client: for m in gemini_client.models.list(): print(f"Model: {m.name} | Name: {m.display_name}") print("----------------------------------") # === Supabase === supabase_url = os.getenv("SUPABASE_URL") supabase_key = os.getenv("SUPABASE_KEY") supabase = create_client(supabase_url, supabase_key) def expand_query(query: str, num_variations: int = 3) -> str: """ Memperluas query menggunakan Gemini 2.5 API resmi. - Otomatis fallback jika API error. - Cache hasil untuk menghemat pemanggilan API. """ if not query.strip(): return query if gemini_client is None: return f"Kegiatan usaha yang berkaitan dengan {query}" prompt = f""" Anda adalah ahli dalam sistem pencarian KBLI (Klasifikasi Baku Lapangan Usaha Indonesia) 2020. Tugas Anda adalah membuat {num_variations} variasi dari kueri berikut untuk meningkatkan hasil pencarian. Kueri pengguna: "{query}" Buatkan {num_variations} variasi kueri yang: 1. Menggunakan bahasa formal atau teknis (mis. istilah industri). 2. Menggunakan bahasa sehari-hari. 3. Mengandung kata kunci relevan lain. Format keluaran HARUS seperti ini: Variasi 1: ... Variasi 2: ... Variasi 3: ... """ try: # Panggil Gemini response = gemini_client.models.generate_content( # model="gemini-2.5-flash", # # model="gemini-robotics-er-1.5-preview", model="gemini-3.1-flash-lite", contents=prompt, ) text_output = response.text.strip() variations = [] for line in text_output.splitlines(): if line.lower().startswith("variasi"): parts = line.split(":", 1) if len(parts) > 1: variations.append(parts[1].strip()) if not variations: print("[Gemini Warning] Tidak ada variasi ditemukan. Gunakan fallback.") return f"Kegiatan usaha yang berkaitan dengan {query}" # Gabungkan hasil expanded = query + ". " + " ".join(variations) print(f"[Gemini Expand] {query} -> {expanded}") return expanded except Exception as e: print(f"[Gemini Error] {e}. Menggunakan fallback lokal.") return f"Kegiatan usaha yang berkaitan dengan {query}" def get_embedding(text: str): """ Menghasilkan embedding vector dari teks menggunakan model SentenceTransformer. """ if not text: return [] expanded_text = expand_query(text) embedding = embedder.encode(expanded_text, normalize_embeddings=True).tolist() return embedding # ========================================== # ABLATION STUDY # ========================================== # Helper Function def apply_sigmoid(logit): return 1 / (1 + math.exp(-logit)) def bm25_only(query: str, match_count: int = 50): """Hanya Lexical / Full-Text Search (Tanpa Vector, Tanpa Gemini, Tanpa Reranker)""" # Catatan: Pastikan Anda sudah membuat RPC 'search_kbli_lexical' di Supabase response = supabase.rpc( "search_kbli_lexical", {"query_text": query, "match_count": match_count} ).execute() candidates = response.data or [] return {"results": candidates[:10]} def dense_only(query: str, match_count: int = 50): """Hanya Semantic Vector (Tanpa BM25, Tanpa Gemini, Tanpa Reranker)""" # Gunakan query asli (tanpa expand) embedding_q = embedder.encode(query, normalize_embeddings=True).tolist() response = supabase.rpc( "search_kbli", {"query_embedding": embedding_q, "match_count": match_count} ).execute() candidates = response.data or [] return {"results": candidates[:10]} def semantic_no_gemini(query: str, match_count: int = 50): """Semantic Vector + Reranker (TANPA Gemini Query Expansion)""" expanded = query # Bypass Gemini embedding_q = embedder.encode(expanded, normalize_embeddings=True).tolist() response = supabase.rpc( "search_kbli", {"query_embedding": embedding_q, "match_count": match_count} ).execute() candidates = response.data or [] if not candidates: return {"results": []} pairs = [(expanded, c["judul"] + " " + c["deskripsi"]) for c in candidates] try: scores = reranker.predict(pairs) except Exception as e: print("Reranker error:", e) return {"results": sorted(candidates, key=lambda x: x.get("similarity", 0), reverse=True)[:10]} rerank_vals = [float(s) for s in scores] rmin, rmax = min(rerank_vals), max(rerank_vals) for c, s in zip(candidates, rerank_vals): c["rerank_score"] = s if rmax - rmin > 1e-9: c["rerank_norm"] = (s - rmin) / (rmax - rmin) else: c["rerank_norm"] = 0.0 sim = c.get("similarity", 0.0) c["hybrid_score"] = 0.6 * sim + 0.4 * c["rerank_norm"] candidates = sorted(candidates, key=lambda x: x["hybrid_score"], reverse=True) return {"results": candidates[:10]} def hybrid_search_no_gemini(query: str, match_count: int = 50): """Hybrid (BM25 + Dense) + Reranker (TANPA Gemini Query Expansion)""" expanded = query embedding_q = embedder.encode(expanded, normalize_embeddings=True).tolist() response = supabase.rpc( "search_kbli_hybrid", { "query_text": query, "query_embedding": embedding_q, "match_count": match_count, "lexical_weight": 0.4, # Diabaikan di SQL, tapi wajib dikirim "dense_weight": 0.6 # Diabaikan di SQL, tapi wajib dikirim } ).execute() candidates = response.data or [] if not candidates: return {"results": []} pairs = [(query, c["judul"] + " " + c["deskripsi"]) for c in candidates] try: scores = reranker.predict(pairs) except Exception as e: print("Reranker error:", e) return {"results": sorted(candidates, key=lambda x: x.get("similarity", 0), reverse=True)[:10]} # Reranker sebagai Hakim Tunggal for c, s in zip(candidates, scores): c["rerank_score"] = float(s) # Gunakan Sigmoid agar nilainya menjadi probabilitas pasti c["final_score"] = apply_sigmoid(float(s)) # Urutkan berdasarkan keputusan mutlak dari Reranker candidates = sorted(candidates, key=lambda x: x["final_score"], reverse=True) return {"results": candidates[:10]} # ========================================== # CORE APPS # ========================================== def fn_semantic(query: str, match_count: int = 50): expanded = expand_query(query) embedding_q = embedder.encode(expanded, normalize_embeddings=True).tolist() response = supabase.rpc( "search_kbli", {"query_embedding": embedding_q, "match_count": match_count} ).execute() candidates = response.data or [] if not candidates: return {"results": []} print("=== Candidates BEFORE rerank (top 10) ===") for c in candidates[:10]: print(c.get("kode"), c.get("judul")[:80], "sim=", c.get("similarity")) pairs = [(expanded, c["judul"] + " " + c["deskripsi"]) for c in candidates] try: scores = reranker.predict(pairs) except Exception as e: print("Reranker error:", e) return {"results": sorted(candidates, key=lambda x: x.get("similarity", 0), reverse=True)[:10]} for c, s in zip(candidates, scores): c["rerank_score"] = float(s) print("=== Candidates AFTER rerank (top 10) ===") for c in candidates[:10]: print(c.get("kode"), c.get("judul")[:80], "sim=", c.get("similarity"), "rerank=", c.get("rerank_score")) rerank_vals = [c["rerank_score"] for c in candidates] rmin, rmax = min(rerank_vals), max(rerank_vals) for c in candidates: if rmax - rmin > 1e-9: c["rerank_norm"] = (c["rerank_score"] - rmin) / (rmax - rmin) else: c["rerank_norm"] = 0.0 for c in candidates: sim = c.get("similarity", 0.0) c["hybrid_score"] = 0.6 * sim + 0.4 * c["rerank_norm"] candidates = sorted(candidates, key=lambda x: x["hybrid_score"], reverse=True) return {"results": candidates[:10]} def hybrid_search(query: str, match_count: int = 50): # 1. Query Expansion expanded = expand_query(query) # 2. Embedding # Kita encode query yang sudah di-expand untuk pencarian dense embedding_q = embedder.encode(expanded, normalize_embeddings=True).tolist() # 3. Panggil Hybrid Search di Supabase # Kita kirimkan query ASLI untuk Lexical (agar tidak terlalu banyak noise kata), # dan query EXPANDED untuk Dense (embedding). response = supabase.rpc( "search_kbli_hybrid", { "query_text": query, # Untuk Lexical Match (tsvector) "query_embedding": embedding_q, # Untuk Dense Match (pgvector) "match_count": match_count, "lexical_weight": 0.2, # Bobot Lexical (bisa disesuaikan untuk Ablation Study) "dense_weight": 0.8 # Bobot Dense } ).execute() candidates = response.data or [] if not candidates: return {"results": []} print("=== Candidates dari Hybrid DB BEFORE rerank (top 10) ===") for c in candidates[:10]: # Tampilkan similarity yang sekarang merupakan gabungan Lexical & Dense print(c.get("kode"), c.get("judul")[:80], "hybrid_db_sim=", c.get("similarity")) # 4. Reranking dengan Cross-Encoder # Evaluasi kecocokan antara query asli dengan dokumen kandidat pairs = [(expanded, c["judul"] + " " + c["deskripsi"]) for c in candidates] try: scores = reranker.predict(pairs) except Exception as e: print("Reranker error:", e) return {"results": sorted(candidates, key=lambda x: x.get("similarity", 0), reverse=True)[:10]} # 5. Normalisasi skor Reranker & Kalkulasi Final Score rerank_vals = [float(s) for s in scores] rmin, rmax = min(rerank_vals), max(rerank_vals) for c, s in zip(candidates, rerank_vals): c["rerank_score"] = s # Normalisasi Min-Max if rmax - rmin > 1e-9: c["rerank_norm"] = (s - rmin) / (rmax - rmin) else: c["rerank_norm"] = 0.0 # Skor Final Sistem Neural IR Anda (gabungan Stage 1: Hybrid Retrieval + Stage 2: Reranking) # Anda bisa menyesuaikan bobot ini nanti db_hybrid_sim = c.get("similarity", 0.0) c["final_score"] = (0.5 * db_hybrid_sim) + (0.5 * c["rerank_norm"]) print("=== Candidates AFTER Cross-Encoder rerank (top 10) ===") # Urutkan berdasarkan final_score candidates = sorted(candidates, key=lambda x: x["final_score"], reverse=True) for c in candidates[:10]: print(c.get("kode"), c.get("judul")[:80], "final_score=", c.get("final_score"), "rerank=", c.get("rerank_score")) # Kembalikan 10 teratas (sesuai logika asli Anda) return {"results": candidates[:10]} def search_kbli(text: str): if not text: return {"embedding": [], "results": []} embedding = get_embedding(text) response = supabase.rpc( "search_kbli", {"query_embedding": embedding, "match_count": 25} ).execute() results = response.data if response.data else [] if not results: return "
Tidak ditemukan hasil.
" html = """{r['kode']} – {r['judul']}
{r['deskripsi']}
{r['kode']} – {r['judul']}
{r['deskripsi']}