import gradio as gr
import pickle
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
import numpy as np
import random
# Load the pre-built retrieval artifacts from disk.
# NOTE(review): pickle.load is only acceptable because chatbot_caca.pkl is a
# trusted local artifact produced by this project — never unpickle untrusted files.
print("π€ Loading model...")
with open('chatbot_caca.pkl', 'rb') as f:
    data = pickle.load(f)

qa_pairs = data['qa_pairs']          # list of QA dicts; each has a 'question' key
bm25 = data['bm25']                  # fitted rank_bm25 BM25Okapi index
tfidf = data['tfidf']                # fitted sklearn TfidfVectorizer
tfidf_matrix = data['tfidf_matrix']  # TF-IDF vectors for all stored questions
answers = data['answers']            # answer texts aligned by index with qa_pairs

# Fixed: this print statement was broken across two lines in the original.
print(f"β Loaded {len(qa_pairs)} QA pairs")
def preprocess(text):
    """Normalize raw user text: trim surrounding whitespace and lowercase it."""
    trimmed = text.strip()
    return trimmed.lower()
def get_bm25_score(user_input, top_k=3):
    """Return the top_k (index, BM25 score) pairs for the query, best first."""
    query_tokens = preprocess(user_input).split()
    all_scores = bm25.get_scores(query_tokens)
    # Descending sort of all indices, then keep the best top_k.
    ranked = np.argsort(all_scores)[::-1][:top_k]
    return [(i, all_scores[i]) for i in ranked]
def get_tfidf_score(user_input, top_k=3):
    """Return the top_k (index, cosine similarity) pairs against the TF-IDF matrix."""
    query_vec = tfidf.transform([preprocess(user_input)])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]
    # Descending sort of all indices, then keep the best top_k.
    best = np.argsort(sims)[::-1][:top_k]
    return [(i, sims[i]) for i in best]
def get_fuzzy_score(user_input, candidate_idx):
    """Character-level similarity ratio (0..1) between the query and a stored question."""
    stored_question = preprocess(qa_pairs[candidate_idx]['question'])
    return SequenceMatcher(None, preprocess(user_input), stored_question).ratio()
def fallback_response(confidence=0.0):
    """Pick a random fallback reply when no confident answer was found.

    Args:
        confidence: final retrieval score. Above 0.15 the bot "almost"
            understood, so a softer ask-to-rephrase message is used;
            otherwise a plain "I don't know" message is used.

    Returns:
        One randomly chosen fallback string (Indonesian).
    """
    if confidence > 0.15:
        # Borderline match: ask the user to rephrase.
        responses = [
            "hmm kayaknya aku tau sih maksudmu, tapi ga terlalu yakin... coba tanya dengan kata lain? π€",
            "aku nangkep sedikit sih, tapi ga confident buat jawab. bisa diperjelas ga?",
        ]
    else:
        # No usable match at all.
        responses = [
            "waduh, pertanyaan ini di luar kemampuanku nih. Lyon-nya kurang ngajarin kayaknya π",
            "jujur aja ya, aku ga ngerti maksudmu π coba tanya yang lain deh",
            # Fixed: this literal was broken across two lines in the original.
            "kayaknya pertanyaan ini terlalu advanced buat AI bernama Caca Kecil π",
            "hmm aku belum tau jawabannya nih. Lyon-nya lagi males update dataset kayaknya π€",
            "maaf belum bisa jawab yang itu. tapi aku usahain belajar ya! *semangat meski nama ngaco*",
        ]
    return random.choice(responses)
def chat(message, history):
    """Gradio chat handler: retrieve the best stored answer or fall back.

    Args:
        message: the user's latest message.
        history: prior conversation turns (supplied by Gradio, unused).

    Returns:
        The best-matching answer string, or a fallback reply when no
        candidate clears the confidence threshold.
    """
    # Rank candidates with both retrievers independently.
    bm25_hits = get_bm25_score(message, top_k=3)
    tfidf_hits = get_tfidf_score(message, top_k=3)

    # Blend scores: BM25 (normalized by /20, capped at 1, weight 0.4)
    # plus TF-IDF cosine similarity (weight 0.5).
    blended = {}
    for idx, raw in bm25_hits:
        blended[idx] = blended.get(idx, 0) + min(raw / 20, 1.0) * 0.4
    for idx, sim in tfidf_hits:
        blended[idx] = blended.get(idx, 0) + sim * 0.5

    if not blended:
        return fallback_response(0.0)

    best_idx, best_score = max(blended.items(), key=lambda kv: kv[1])

    # Small bonus (weight 0.1) for literal similarity to the stored question.
    final_score = best_score + get_fuzzy_score(message, best_idx) * 0.1

    # Answer only when confident enough; otherwise admit defeat gracefully.
    if final_score < 0.25:
        return fallback_response(final_score)
    return answers[best_idx]
# --- Gradio UI -------------------------------------------------------------
# Canned example questions shown below the chat box.
_EXAMPLE_QUESTIONS = [
    "siapa nama kamu?",
    "ceritakan tentang dirimu",
    "siapa itu Lyon?",
    "kenapa namamu Caca?",
    "kamu bisa apa?",
]

# Wire the retrieval `chat` handler into a ready-made chat interface.
demo = gr.ChatInterface(
    fn=chat,
    title="π¬ Chatbot Caca",
    description="""
Chatbot berbasis retrieval (BM25 + TF-IDF) untuk QA Bahasa Indonesia.
**Fun fact:** AI ini namanya Caca Kecil karena creator-nya (Lyon) punya selera penamaan yang... unik π
Model size: 2.83 MB | QA pairs: 3,500+ | No LLM needed!
""",
    examples=_EXAMPLE_QUESTIONS,
    theme="soft",
    chatbot=gr.Chatbot(height=400),
)
if __name__ == "__main__":
    # Start the Gradio web server only when run as a script (not on import).
    demo.launch()