"""Retrieval-based Indonesian QA chatbot (BM25 + TF-IDF fusion) served via Gradio.

Loads a pre-built model bundle from ``chatbot_caca.pkl`` at import time and
exposes a single ``chat`` function wired into a ``gr.ChatInterface``.
"""

import gradio as gr
import pickle
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
import numpy as np
import random

# Load model bundle at import time.
# NOTE(review): pickle.load is only safe because this file is produced locally;
# never point this at an untrusted .pkl.
print("🤖 Loading model...")
with open('chatbot_caca.pkl', 'rb') as f:
    data = pickle.load(f)

qa_pairs = data['qa_pairs']        # list of {'question': ..., ...} dicts
bm25 = data['bm25']                # fitted BM25Okapi index over questions
tfidf = data['tfidf']              # fitted TfidfVectorizer
tfidf_matrix = data['tfidf_matrix']  # TF-IDF matrix of all questions
answers = data['answers']          # answer text, aligned by index with qa_pairs

print(f"✅ Loaded {len(qa_pairs)} QA pairs")


def preprocess(text):
    """Normalize user text: lowercase and strip surrounding whitespace."""
    return text.lower().strip()


def get_bm25_score(user_input, top_k=3):
    """Return the top-k BM25 candidates as ``[(index, score), ...]``, best first."""
    tokenized_query = preprocess(user_input).split()
    scores = bm25.get_scores(tokenized_query)
    # argsort ascending, take the last k, reverse -> descending by score
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return [(idx, scores[idx]) for idx in top_indices]


def get_tfidf_score(user_input, top_k=3):
    """Return the top-k TF-IDF cosine-similarity candidates as ``[(index, score), ...]``."""
    user_vector = tfidf.transform([preprocess(user_input)])
    similarities = cosine_similarity(user_vector, tfidf_matrix)[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [(idx, similarities[idx]) for idx in top_indices]


def get_fuzzy_score(user_input, candidate_idx):
    """Character-level similarity (0..1) between the input and a candidate question."""
    question = qa_pairs[candidate_idx]['question']
    return SequenceMatcher(None, preprocess(user_input), preprocess(question)).ratio()


def fallback_response(confidence=0.0):
    """Pick a random canned reply when no answer clears the threshold.

    A mildly-confident miss (confidence > 0.15) gets a "rephrase?" style reply;
    anything lower gets a plain "I don't know" style reply.
    """
    if confidence > 0.15:
        responses = [
            "hmm kayaknya aku tau sih maksudmu, tapi ga terlalu yakin... coba tanya dengan kata lain? 🤔",
            "aku nangkep sedikit sih, tapi ga confident buat jawab. bisa diperjelas ga?",
        ]
    else:
        responses = [
            "waduh, pertanyaan ini di luar kemampuanku nih. Lyon-nya kurang ngajarin kayaknya 🙄",
            "jujur aja ya, aku ga ngerti maksudmu 😂 coba tanya yang lain deh",
            "kayaknya pertanyaan ini terlalu advanced buat AI bernama Caca Kecil 😅",
            "hmm aku belum tau jawabannya nih. Lyon-nya lagi males update dataset kayaknya 😤",
            # FIX: the original literal contained a raw line break (syntax error);
            # reconstructed as an explicit \n — confirm against the original dataset.
            "maaf belum bisa jawab yang itu. tapi aku usahain belajar ya!\n*semangat meski nama ngaco*",
        ]
    return random.choice(responses)


def chat(message, history):
    """Chat function untuk Gradio"""
    # Get scores from both retrievers
    bm25_results = get_bm25_score(message, top_k=3)
    tfidf_results = get_tfidf_score(message, top_k=3)

    # Combine scores: BM25 (normalized to [0,1] by dividing by 20) weighted 0.4,
    # TF-IDF cosine weighted 0.5; a candidate hit by both accumulates both terms.
    combined_scores = {}
    for idx, score in bm25_results:
        normalized_score = min(score / 20, 1.0)
        combined_scores[idx] = combined_scores.get(idx, 0) + (normalized_score * 0.4)
    for idx, score in tfidf_results:
        combined_scores[idx] = combined_scores.get(idx, 0) + (score * 0.5)

    if not combined_scores:
        return fallback_response(0.0)

    best_idx = max(combined_scores, key=combined_scores.get)
    best_score = combined_scores[best_idx]

    # Small fuzzy-match bonus on the single best candidate (up to +0.1)
    fuzzy_score = get_fuzzy_score(message, best_idx)
    final_score = best_score + (fuzzy_score * 0.1)

    threshold = 0.25
    if final_score >= threshold:
        return answers[best_idx]
    else:
        return fallback_response(final_score)


# Create Gradio interface
demo = gr.ChatInterface(
    fn=chat,
    title="💬 Chatbot Caca",
    description="""
    Chatbot berbasis retrieval (BM25 + TF-IDF) untuk QA Bahasa Indonesia.

    **Fun fact:** AI ini namanya Caca Kecil karena creator-nya (Lyon) punya selera penamaan yang... unik 😂

    Model size: 2.83 MB | QA pairs: 3,500+ | No LLM needed!
    """,
    examples=[
        "siapa nama kamu?",
        "ceritakan tentang dirimu",
        "siapa itu Lyon?",
        "kenapa namamu Caca?",
        "kamu bisa apa?",
    ],
    theme="soft",
    chatbot=gr.Chatbot(height=400),
)

if __name__ == "__main__":
    demo.launch()