|
|
import gradio as gr |
|
|
import pickle |
|
|
from rank_bm25 import BM25Okapi |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
from difflib import SequenceMatcher |
|
|
import numpy as np |
|
|
import random |
|
|
|
|
|
|
|
|
# --- Startup: load the pre-built retrieval artifacts once, at import time ---
# NOTE(review): "π€" looks like an emoji whose UTF-8 bytes were decoded with
# the wrong codec; the original character cannot be recovered unambiguously,
# so it is left untouched here.
print("π€ Loading model...")

# SECURITY NOTE: pickle.load() can execute arbitrary code embedded in the
# file. Only load a chatbot_caca.pkl that was produced by a trusted build.
with open('chatbot_caca.pkl', 'rb') as f:
    data = pickle.load(f)

# Unpack the bundle into module-level globals used by the scoring helpers.
qa_pairs = data['qa_pairs']          # records; each is read via its 'question' key
bm25 = data['bm25']                  # object exposing get_scores(tokens)
tfidf = data['tfidf']                # object exposing transform([text])
tfidf_matrix = data['tfidf_matrix']  # compared against query vectors with cosine_similarity
answers = data['answers']            # answer lookup, indexed by candidate index

# The check-mark emoji had been mis-decoded (UTF-8 bytes E2 9C 85), which
# split this literal across two lines; it is restored as a single string.
print(f"✅ Loaded {len(qa_pairs)} QA pairs")
|
|
|
|
|
|
|
|
def preprocess(text):
    """Normalise *text* for matching: lowercase it and trim outer whitespace."""
    lowered = text.lower()
    return lowered.strip()
|
|
|
|
|
|
|
|
def get_bm25_score(user_input, top_k=3):
    """Return the ``top_k`` best BM25 matches for *user_input*.

    Args:
        user_input: Raw user message; it is normalised with ``preprocess``
            and split on whitespace before scoring.
        top_k: Maximum number of candidates to return. Non-positive values
            yield an empty list.

    Returns:
        A list of ``(index, score)`` tuples ordered from highest to lowest
        score, where ``index`` is a position in the scores returned by
        ``bm25.get_scores``.
    """
    # Guard first: with top_k == 0 the slice below would be [-0:], which
    # selects *every* document instead of none, and a negative top_k would
    # produce an unrelated slice.
    if top_k <= 0:
        return []

    tokenized_query = preprocess(user_input).split()
    scores = bm25.get_scores(tokenized_query)

    # argsort is ascending: take the last top_k entries, then reverse them
    # so that the best match comes first.
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return [(idx, scores[idx]) for idx in top_indices]
|
|
|
|
|
|
|
|
def get_tfidf_score(user_input, top_k=3):
    """Return the ``top_k`` best TF-IDF cosine-similarity matches.

    Args:
        user_input: Raw user message; it is normalised with ``preprocess``
            and vectorised with the loaded ``tfidf`` object.
        top_k: Maximum number of candidates to return. Non-positive values
            yield an empty list.

    Returns:
        A list of ``(index, similarity)`` tuples ordered from highest to
        lowest similarity, where ``index`` is a row of ``tfidf_matrix``.
    """
    # Guard first: with top_k == 0 the slice below would be [-0:], which
    # selects *every* row instead of none, and a negative top_k would
    # produce an unrelated slice.
    if top_k <= 0:
        return []

    user_vector = tfidf.transform([preprocess(user_input)])
    # cosine_similarity returns one row per query; there is a single query.
    similarities = cosine_similarity(user_vector, tfidf_matrix)[0]

    # argsort is ascending: take the last top_k entries, then reverse them
    # so that the best match comes first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [(idx, similarities[idx]) for idx in top_indices]
|
|
|
|
|
|
|
|
def get_fuzzy_score(user_input, candidate_idx):
    """Character-level similarity between the user text and a stored question.

    Looks up the question of ``qa_pairs[candidate_idx]``, normalises both
    strings with ``preprocess`` and returns ``SequenceMatcher.ratio()``
    for the pair.
    """
    stored_question = qa_pairs[candidate_idx]['question']
    matcher = SequenceMatcher(
        None,
        preprocess(user_input),
        preprocess(stored_question),
    )
    return matcher.ratio()
|
|
|
|
|
|
|
|
def fallback_response(confidence=0.0):
    """Pick a random reply for when no stored answer is good enough.

    Args:
        confidence: Score of the rejected best candidate. Values above
            0.15 select the "I half understood, please rephrase" replies;
            anything else selects the "I have no idea" replies.

    Returns:
        One reply string chosen with ``random.choice``.
    """
    # NOTE(review): the remaining "π€" / "π" sequences look like emoji whose
    # UTF-8 bytes were decoded with the wrong codec. Their original
    # characters cannot be recovered unambiguously, so they are kept as-is;
    # restore them from the upstream copy of this file if one is available.
    if confidence > 0.15:
        responses = [
            "hmm kayaknya aku tau sih maksudmu, tapi ga terlalu yakin... coba tanya dengan kata lain? π€",
            "aku nangkep sedikit sih, tapi ga confident buat jawab. bisa diperjelas ga?",
        ]
    else:
        responses = [
            "waduh, pertanyaan ini di luar kemampuanku nih. Lyon-nya kurang ngajarin kayaknya π",
            "jujur aja ya, aku ga ngerti maksudmu π coba tanya yang lain deh",
            # This literal had been split across two lines by a mis-decoded
            # emoji (UTF-8 bytes F0 9F 98 85); restored as a single string.
            "kayaknya pertanyaan ini terlalu advanced buat AI bernama Caca Kecil 😅",
            "hmm aku belum tau jawabannya nih. Lyon-nya lagi males update dataset kayaknya π€",
            "maaf belum bisa jawab yang itu. tapi aku usahain belajar ya! *semangat meski nama ngaco*",
        ]
    return random.choice(responses)
|
|
|
|
|
|
|
|
def chat(message, history):
    """Answer one user message; this is the callback given to Gradio.

    The top three BM25 candidates and the top three TF-IDF candidates are
    merged into one weighted score per candidate index. The best candidate
    then gets a small fuzzy-match bonus, and its stored answer is returned
    only when the total reaches the acceptance threshold; otherwise a
    fallback reply is returned.

    Args:
        message: The user's text.
        history: Conversation history; accepted but not used here.

    Returns:
        The matched answer from ``answers`` or a fallback reply string.
    """
    lexical_hits = get_bm25_score(message, top_k=3)
    vector_hits = get_tfidf_score(message, top_k=3)

    weighted = {}

    # BM25 scores are unbounded: divide by 20 and cap at 1.0 before
    # applying the 0.4 weight.
    for doc_idx, raw_score in lexical_hits:
        capped = min(raw_score / 20, 1.0)
        weighted[doc_idx] = weighted.get(doc_idx, 0) + (capped * 0.4)

    # Cosine similarity is used as-is with a 0.5 weight.
    for doc_idx, similarity in vector_hits:
        weighted[doc_idx] = weighted.get(doc_idx, 0) + (similarity * 0.5)

    if len(weighted) == 0:
        return fallback_response(0.0)

    # Highest combined score wins; on ties the first-inserted candidate is kept.
    best_idx, best_score = max(weighted.items(), key=lambda item: item[1])

    # The fuzzy string similarity of the winner contributes the last 0.1.
    final_score = best_score + (get_fuzzy_score(message, best_idx) * 0.1)

    threshold = 0.25
    return (
        answers[best_idx]
        if final_score >= threshold
        else fallback_response(final_score)
    )
|
|
|
|
|
|
|
|
|
|
|
# Gradio chat UI: every submitted message is routed to `chat`.
# The title emoji had been mis-decoded (UTF-8 bytes F0 9F 92 AC shown as
# "π¬"); it is restored to the speech-balloon character.
# NOTE(review): the trailing "π" in the description is also a mis-decoded
# emoji, but its original character cannot be determined, so it is kept.
# The description lines are intentionally flush-left inside the literal so
# that no leading whitespace is added to the rendered text.
demo = gr.ChatInterface(
    fn=chat,
    title="💬 Chatbot Caca",
    description="""
Chatbot berbasis retrieval (BM25 + TF-IDF) untuk QA Bahasa Indonesia.

**Fun fact:** AI ini namanya Caca Kecil karena creator-nya (Lyon) punya selera penamaan yang... unik π

Model size: 2.83 MB | QA pairs: 3,500+ | No LLM needed!
""",
    examples=[
        "siapa nama kamu?",
        "ceritakan tentang dirimu",
        "siapa itu Lyon?",
        "kenapa namamu Caca?",
        "kamu bisa apa?",
    ],
    theme="soft",
    chatbot=gr.Chatbot(height=400),
)


# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()