File size: 4,395 Bytes
8926a46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b6b5fa
8926a46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c828795
8926a46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1iEIEymiaYLbhMx5NqPutAwwI2rMCyIOQ
"""

import pandas as pd
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
import sys

# --- 1. Model and data configuration ---

# IndoBERT-Large-P1 (stronger than the MiniLM previously used); pulled
# from the Hugging Face hub on first run.
MODEL_NAME = "indobenchmark/indobert-large-p1"
try:
    print(f"Memuat Model Kuat: {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    print("Model berhasil dimuat.")
except Exception as e:
    # The app cannot function without the model, so abort immediately.
    print(f"FATAL ERROR: Gagal memuat model IndoBERT-Large. Detail: {e}")
    sys.exit(1)

# Mean-pooling helper (required because IndoBERT is not an SBERT model).
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring padding.

    model_output: model forward output; element [0] holds the token
        embeddings with shape (batch, seq_len, hidden).
    attention_mask: (batch, seq_len) tensor — 1 for real tokens, 0 for pad.
    Returns a (batch, hidden) tensor of masked mean embeddings.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = (embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for an all-padding sequence.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

# Sentence-embedding helper built on top of mean_pooling.
def get_embeddings(texts):
    """Return L2-normalised sentence embeddings for *texts* as a numpy array."""
    batch = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    # Unit-length vectors make cosine similarity a plain dot product.
    return F.normalize(pooled, p=2, dim=1).numpy()


# --- CSV data loading ---
FILE_PATH = "perpustakaan_faq.csv"
df_faq = None

try:
    print(f"Mencoba memuat file: {FILE_PATH}...")
    df_faq = pd.read_csv(FILE_PATH)
    # Normalise the CSV column names to the 'question'/'answer' pair the
    # rest of the script expects.
    df_faq = df_faq.rename(columns={'user_query': 'question', 'chatbot_response': 'answer'})
    print(f"File '{FILE_PATH}' berhasil dimuat dengan {len(df_faq)} baris data.")

except Exception as e:
    # A missing/unreadable CSV is fatal; exit instead of serving an empty bot.
    print(f"\nFATAL ERROR: Gagal memuat data CSV. Pastikan file ada di path. Detail: {e}")
    sys.exit(1)

# --- 2. Data preparation (pre-computation of FAQ embeddings) ---

faq_embeddings = None
if df_faq is not None and not df_faq.empty:
    print("Menghitung embeddings untuk data FAQ...")
    faq_questions = df_faq['question'].tolist()
    # Embed every FAQ question once up front so each user query needs
    # only a single forward pass at request time.
    faq_embeddings = get_embeddings(faq_questions)
    print("Penghitungan embeddings selesai.")
else:
    print("FATAL ERROR: Data tidak ditemukan atau kosong.")
    sys.exit(1)


# --- 3. Chatbot logic (retrieval) ---

# FIX: similarity threshold lowered to 0.60 so near-matches still get answered.
def library_chatbot(user_query, threshold=0.60):
    """Answer a user question by retrieving the most similar FAQ entry.

    user_query: free-text question from the Gradio textbox.
    threshold: minimum cosine similarity required to return a stored answer.
    Returns the matched FAQ answer, or a polite fallback message that
    reports the best similarity score found.
    """
    # Guard against empty AND whitespace-only input (the original guard
    # let "   " fall through to retrieval).
    if not user_query or not user_query.strip():
        return "Halo! Silakan ajukan pertanyaan seputar perpustakaan."

    user_embedding = get_embeddings([user_query])
    # cosine_similarity returns shape (1, num_faq); take the single row.
    similarities = cosine_similarity(user_embedding, faq_embeddings)[0]
    best_match_index = int(np.argmax(similarities))
    max_similarity = float(similarities[best_match_index])

    if max_similarity >= threshold:
        # .iloc makes the lookup positional, staying correct even if the
        # DataFrame index is ever not a default RangeIndex.
        return df_faq.iloc[best_match_index]['answer']

    return (
        "Mohon maaf, saya belum dapat menemukan jawaban yang spesifik untuk pertanyaan Anda "
        "dalam data yang saya miliki saat ini (Skor Kemiripan Tertinggi: {:.4f}). "
        "Silakan ajukan pertanyaan dengan kata kunci yang berbeda.".format(max_similarity)
    )


# --- 4. Presentation (Gradio interface) ---

title = "Perpustakaan BI Kantor Sumatera Selatan"
description = (
    "🤖 Chatbot ini menggunakan Model **IndoBERT-Large-P1** yang lebih kuat dengan arsitektur RoBERTa. "
    "Ambang batas (Threshold) diturunkan menjadi **0.60**."
)

# Build the Gradio interface around the retrieval function.
iface = gr.Interface(
    fn=library_chatbot,
    inputs=gr.Textbox(lines=2, placeholder="Ketik pertanyaan Anda di sini...", label="Pertanyaan Pengguna"),
    outputs=gr.Textbox(label="Jawaban Chatbot"),
    title=title,
    description=description,
    theme=gr.themes.Soft(),
    # NOTE(review): allow_flagging is deprecated in Gradio 4+ in favour of
    # flagging_mode — confirm against the installed Gradio version.
    allow_flagging='never'
)

# Launch the Gradio app; share=True requests a temporary public tunnel URL.
print("\n--- Menjalankan Gradio ---")
print("Tunggu sebentar hingga link publik muncul (Running on public URL: ...)")
iface.launch(share=True)