File size: 3,824 Bytes
3ae8926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import gradio as gr
import pickle
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
import numpy as np
import random

# Load model
# NOTE(review): pickle.loads on an untrusted file can execute arbitrary code.
# This assumes 'chatbot_caca.pkl' is a trusted, locally-built artifact — confirm.
print("πŸ€– Loading model...")
with open('chatbot_caca.pkl', 'rb') as f:
    data = pickle.load(f)

# Unpack the pre-built retrieval artifacts; these are read as module-level
# globals by the scoring functions below.
qa_pairs = data['qa_pairs']          # list of dicts, each with a 'question' key (see get_fuzzy_score)
bm25 = data['bm25']                  # fitted rank_bm25.BM25Okapi index
tfidf = data['tfidf']                # fitted sklearn TfidfVectorizer
tfidf_matrix = data['tfidf_matrix']  # TF-IDF matrix of all stored questions
answers = data['answers']            # answer strings, indexed in parallel with qa_pairs

print(f"βœ… Loaded {len(qa_pairs)} QA pairs")


def preprocess(text):
    """Normalize *text* for matching: lowercase, then trim surrounding whitespace."""
    lowered = text.lower()
    return lowered.strip()


def get_bm25_score(user_input, top_k=3):
    """Score every stored question against *user_input* with BM25.

    Returns a list of (index, score) pairs for the *top_k* best matches,
    ordered from highest to lowest score. Reads the module-level ``bm25`` index.
    """
    query_tokens = preprocess(user_input).split()
    doc_scores = bm25.get_scores(query_tokens)
    # argsort ascending, reverse to descending, keep the top_k best
    ranked = np.argsort(doc_scores)[::-1][:top_k]
    return [(i, doc_scores[i]) for i in ranked]


def get_tfidf_score(user_input, top_k=3):
    """Score every stored question against *user_input* via TF-IDF cosine similarity.

    Returns a list of (index, similarity) pairs for the *top_k* best matches,
    ordered from most to least similar. Reads the module-level ``tfidf``
    vectorizer and ``tfidf_matrix``.
    """
    query_vec = tfidf.transform([preprocess(user_input)])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]
    # argsort ascending, reverse to descending, keep the top_k best
    ranked = np.argsort(sims)[::-1][:top_k]
    return [(i, sims[i]) for i in ranked]


def get_fuzzy_score(user_input, candidate_idx):
    """Character-level similarity (0.0–1.0) between *user_input* and a stored question.

    Looks up the question at *candidate_idx* in the module-level ``qa_pairs``
    and compares both strings after normalization.
    """
    stored_question = qa_pairs[candidate_idx]['question']
    matcher = SequenceMatcher(None, preprocess(user_input), preprocess(stored_question))
    return matcher.ratio()


def fallback_response(confidence=0.0):
    """Pick a random canned reply for queries the retriever can't answer.

    Two pools exist: a "close but not sure" pool when *confidence* is above
    0.15, and a "no idea" pool otherwise.
    """
    near_miss_pool = [
        "hmm kayaknya aku tau sih maksudmu, tapi ga terlalu yakin... coba tanya dengan kata lain? πŸ€”",
        "aku nangkep sedikit sih, tapi ga confident buat jawab. bisa diperjelas ga?",
    ]
    no_clue_pool = [
        "waduh, pertanyaan ini di luar kemampuanku nih. Lyon-nya kurang ngajarin kayaknya πŸ™„",
        "jujur aja ya, aku ga ngerti maksudmu πŸ˜‚ coba tanya yang lain deh",
        "kayaknya pertanyaan ini terlalu advanced buat AI bernama Caca Kecil πŸ˜…",
        "hmm aku belum tau jawabannya nih. Lyon-nya lagi males update dataset kayaknya 😀",
        "maaf belum bisa jawab yang itu. tapi aku usahain belajar ya! *semangat meski nama ngaco*",
    ]
    pool = near_miss_pool if confidence > 0.15 else no_clue_pool
    return random.choice(pool)


def chat(message, history):
    """Gradio chat handler: retrieve the best stored answer or fall back.

    Fuses BM25 (weight 0.4, scores squashed into [0, 1] by dividing by 20)
    and TF-IDF cosine similarity (weight 0.5) over the top-3 candidates from
    each retriever, adds a small fuzzy-match bonus (weight 0.1) for the best
    candidate, and answers only when the combined score clears 0.25.

    *history* is required by the gr.ChatInterface contract but unused here.
    """
    fused = {}

    # BM25 contribution: squash raw scores into [0, 1] before weighting.
    for idx, raw in get_bm25_score(message, top_k=3):
        fused[idx] = fused.get(idx, 0) + min(raw / 20, 1.0) * 0.4

    # TF-IDF contribution: cosine similarity is already in [0, 1].
    for idx, sim in get_tfidf_score(message, top_k=3):
        fused[idx] = fused.get(idx, 0) + sim * 0.5

    # Defensive: both retrievers always return top_k entries, but guard anyway.
    if not fused:
        return fallback_response(0.0)

    top_idx = max(fused, key=fused.get)

    # Small character-level bonus to break near-ties in the best candidate's favor.
    confidence = fused[top_idx] + get_fuzzy_score(message, top_idx) * 0.1

    if confidence >= 0.25:
        return answers[top_idx]
    return fallback_response(confidence)


# Create Gradio interface
# Wires the chat() retrieval function into a ready-made chat UI; the examples
# below appear as clickable prompts.
demo = gr.ChatInterface(
    fn=chat,
    title="πŸ’¬ Chatbot Caca",
    description="""
    Chatbot berbasis retrieval (BM25 + TF-IDF) untuk QA Bahasa Indonesia.
    
    **Fun fact:** AI ini namanya Caca Kecil karena creator-nya (Lyon) punya selera penamaan yang... unik πŸ˜‚
    
    Model size: 2.83 MB | QA pairs: 3,500+ | No LLM needed!
    """,
    examples=[
        "siapa nama kamu?",
        "ceritakan tentang dirimu",
        "siapa itu Lyon?",
        "kenapa namamu Caca?",
        "kamu bisa apa?",
    ],
    theme="soft",
    chatbot=gr.Chatbot(height=400),
)

# Launch only when run as a script (not when imported, e.g. by a hosting platform).
if __name__ == "__main__":
    demo.launch()