# -*- coding: utf-8 -*- import torch import pandas as pd import faiss import gradio as gr from sentence_transformers import SentenceTransformer from transformers import AutoTokenizer, AutoModelForCausalLM from rank_bm25 import BM25Okapi # ========================= # LOAD DATA # ========================= df = pd.read_csv("chunks.csv") index = faiss.read_index("faiss_index.index") # ========================= # EMBEDDING MODEL # ========================= device = "cuda" if torch.cuda.is_available() else "cpu" embed_model = SentenceTransformer( "sentence-transformers/all-MiniLM-L6-v2", device=device ) # ========================= # BM25 # ========================= corpus = [text.split() for text in df["text"].tolist()] bm25 = BM25Okapi(corpus) # ========================= # HYBRID SEARCH (DIKECILKAN) # ========================= def hybrid_search(query, top_k=3, alpha=0.6): bm25_scores = bm25.get_scores(query.split()) query_emb = embed_model.encode( query, convert_to_tensor=True, normalize_embeddings=True ) sem_scores, indices = index.search( query_emb.cpu().numpy().reshape(1, -1), top_k ) # ambil indeks global idxs = indices[0] df_temp = df.iloc[idxs].copy() df_temp["bm25"] = [bm25_scores[i] for i in idxs] df_temp["semantic"] = sem_scores[0] return df_temp # ========================= # LLM (BAHASALAB LOCAL) # ========================= MODEL_ID = "Bahasalab/Bahasa-4b-chat-v2" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) llm = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype=torch.float16 ) # ========================= # PROMPT # ========================= def build_prompt(query, context_df): context = "\n\n".join( f"(Hal {row['page_number']}) {row['text'][:500]}" for _, row in context_df.iterrows() ) return f""" Jawab hanya berdasarkan konteks berikut. Jika tidak ada, jawab: Informasi tidak ditemukan. KONTEKS: {context} PERTANYAAN: {query} JAWABAN: """ # ========================= # GENERATE # ========================= def ask(query): results = hybrid_search(query) if results.empty: return "Informasi tidak ditemukan." prompt = build_prompt(query, results) inputs = tokenizer(prompt, return_tensors="pt") inputs = {k: v.to(device) for k, v in inputs.items()} with torch.no_grad(): output = llm.generate( **inputs, max_new_tokens=80, # diperkecil biar lebih cepat temperature=0.3, do_sample=False ) decoded = tokenizer.decode(output[0], skip_special_tokens=True) if "JAWABAN:" in decoded: answer = decoded.split("JAWABAN:")[-1].strip() else: answer = decoded.strip() sources = { f"{r['source_file']} (Hal {r['page_number']})" for _, r in results.iterrows() } return answer + "\n\n๐ Sumber:\n" + "\n".join(f"- {s}" for s in sources) # ========================= # CHATBOT # ========================= def chatbot_fn(message, history): if history is None: history = [] if not message.strip(): history.append({ "role": "assistant", "content": "Silakan masukkan pertanyaan." }) return "", history history.append({ "role": "user", "content": message }) answer = ask(message) history.append({ "role": "assistant", "content": answer }) return "", history def clear_chat(): return [], "" # ========================= # UI # ========================= custom_css = """ /* ===== FORCE DARK GLOBAL ===== */ :root { color-scheme: dark !important; } html, body, #root, .gradio-container { background-color: #0f172a !important; color: #e5e7eb !important; margin: 0; padding: 0; } /* Hilangkan efek light mode */ * { background-color: transparent; } /* Container utama */ .gradio-container { background-color: #0f172a !important; max-width: 100% !important; } /* Wrapper */ .block-container { max-width: 1200px; margin: auto; padding: 20px; background-color: #0f172a !important; } /* ===== CHATBOT ===== */ [data-testid="chatbot"], [data-testid="chatbot"] *, .gr-chatbot, .gr-chatbot * { background-color: #111827 !important; } /* Scroll area */ .overflow-y-auto { background-color: #111827 !important; } /* Bubble */ .message.bot { background-color: #1f2937 !important; color: #e5e7eb !important; border-radius: 12px; } .message.user { background-color: #2563eb !important; color: white !important; border-radius: 12px; } /* Textbox */ textarea { background-color: #1f2937 !important; color: white !important; border-radius: 12px !important; } /* Input wrapper */ .gr-textbox, .gr-input-container { background-color: #0f172a !important; } /* Label */ .gr-textbox > label { color: #cbd5e1 !important; } /* Button */ button { background: linear-gradient(135deg, #2563eb, #1d4ed8) !important; color: white !important; border-radius: 10px !important; padding: 10px 16px; } /* Text */ h1, h2, h3, p, label { color: #e5e7eb !important; } /* Header */ #hero { text-align: center; margin-bottom: 20px; } #hero p { color: #9ca3af; } /* Hilangkan outline */ *:focus { outline: none !important; } /* ===== HILANGKAN HEADER GRADIO ===== */ footer, .gradio-container .footer, #footer, .built-with, a[href*="gradio.app"], button[aria-label="Use via API"], button[aria-label="Settings"] { display: none !important; } """ # ========================= # UI (FORCE DARK THEME) # ========================= with gr.Blocks( css=custom_css, theme=gr.themes.Base( primary_hue="blue", neutral_hue="slate" ) ) as demo: gr.HTML("""
Chatbot yang dirancang untuk membantu pengguna mendapatkan informasi seputar bangunan gedung, tata ruang, dan regulasi Dinas Cipta Karya dan Tata Ruang DKI Jakarta.