Lyon28
/

Caca-Chatbot

+import gradio as gr
+import pickle
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from difflib import SequenceMatcher
+import numpy as np
+import random
# Load the pre-built retrieval artifacts bundled in the pickle file.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so
# chatbot_caca.pkl must come from a trusted source.
print("🤖 Loading model...")
with open('chatbot_caca.pkl', 'rb') as f:
    data = pickle.load(f)

# Unpack the bundle into the module-level names read by the scoring helpers.
qa_pairs, bm25, tfidf, tfidf_matrix, answers = (
    data[key]
    for key in ('qa_pairs', 'bm25', 'tfidf', 'tfidf_matrix', 'answers')
)
print(f"✅ Loaded {len(qa_pairs)} QA pairs")
def preprocess(text):
    """Normalize text for matching: lowercase it, then trim outer whitespace."""
    lowered = text.lower()
    return lowered.strip()
def get_bm25_score(user_input, top_k=3):
    """Return the ``top_k`` highest-scoring BM25 candidates for ``user_input``.

    Args:
        user_input: Raw user message. It is normalized with ``preprocess`` and
            split on whitespace before being scored.
        top_k: Maximum number of candidates to return.

    Returns:
        A list of ``(index, score)`` tuples ordered from highest to lowest
        score, where ``index`` is a position in the array returned by
        ``bm25.get_scores``. The list is empty when ``top_k`` is not positive.
    """
    # Guard: ``[-0:]`` would select the whole array and a negative ``top_k``
    # would turn the slice into ``[abs(top_k):]``; neither means "no results".
    if top_k <= 0:
        return []
    tokenized_query = preprocess(user_input).split()
    scores = bm25.get_scores(tokenized_query)
    # argsort is ascending, so take the last ``top_k`` positions and reverse.
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return [(idx, scores[idx]) for idx in top_indices]
def get_tfidf_score(user_input, top_k=3):
    """Return the ``top_k`` candidates most similar to ``user_input`` by TF-IDF.

    Args:
        user_input: Raw user message. It is normalized with ``preprocess`` and
            vectorized with the loaded ``tfidf`` object.
        top_k: Maximum number of candidates to return.

    Returns:
        A list of ``(index, similarity)`` tuples ordered from highest to lowest
        cosine similarity, where ``index`` is a row of ``tfidf_matrix``. The
        list is empty when ``top_k`` is not positive.
    """
    # Guard: ``[-0:]`` would select the whole array and a negative ``top_k``
    # would turn the slice into ``[abs(top_k):]``; neither means "no results".
    if top_k <= 0:
        return []
    user_vector = tfidf.transform([preprocess(user_input)])
    # Row 0 holds the similarities of the single query against every row.
    similarities = cosine_similarity(user_vector, tfidf_matrix)[0]
    # argsort is ascending, so take the last ``top_k`` positions and reverse.
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [(idx, similarities[idx]) for idx in top_indices]
def get_fuzzy_score(user_input, candidate_idx):
    """Return the SequenceMatcher ratio between the input and a stored question.

    Both the user input and the ``'question'`` field of
    ``qa_pairs[candidate_idx]`` are normalized with ``preprocess`` before
    being compared.
    """
    entry = qa_pairs[candidate_idx]
    matcher = SequenceMatcher(
        None,
        preprocess(user_input),
        preprocess(entry['question']),
    )
    return matcher.ratio()
def fallback_response(confidence=0.0):
    """Pick a random canned reply for when no stored answer is good enough.

    A ``confidence`` strictly above 0.15 selects from the "partially
    understood" replies; anything else selects from the "no idea" replies.
    """
    if confidence > 0.15:
        return random.choice([
            "hmm kayaknya aku tau sih maksudmu, tapi ga terlalu yakin... coba tanya dengan kata lain? 🤔",
            "aku nangkep sedikit sih, tapi ga confident buat jawab. bisa diperjelas ga?",
        ])
    return random.choice([
        "waduh, pertanyaan ini di luar kemampuanku nih. Lyon-nya kurang ngajarin kayaknya 🙄",
        "jujur aja ya, aku ga ngerti maksudmu 😂 coba tanya yang lain deh",
        "kayaknya pertanyaan ini terlalu advanced buat AI bernama Caca Kecil 😅",
        "hmm aku belum tau jawabannya nih. Lyon-nya lagi males update dataset kayaknya 😤",
        "maaf belum bisa jawab yang itu. tapi aku usahain belajar ya! *semangat meski nama ngaco*",
    ])
def chat(message, history):
    """Chat callback for Gradio: return the best stored answer or a fallback.

    ``history`` is accepted because the chat interface passes it, but it is
    not used when choosing a reply.
    """
    # Retrieve the top candidates from both scorers.
    bm25_hits = get_bm25_score(message, top_k=3)
    tfidf_hits = get_tfidf_score(message, top_k=3)

    # Blend the two rankings: BM25 is capped at 1.0 after dividing by 20 and
    # weighted 0.4; the TF-IDF similarity is weighted 0.5.
    blended = {}
    for candidate, raw in bm25_hits:
        capped = min(raw / 20, 1.0)
        blended[candidate] = blended.get(candidate, 0) + (capped * 0.4)
    for candidate, similarity in tfidf_hits:
        blended[candidate] = blended.get(candidate, 0) + (similarity * 0.5)

    if not blended:
        return fallback_response(0.0)

    # Highest blended score wins; ties go to the earliest-inserted candidate.
    best_idx, best_score = max(blended.items(), key=lambda pair: pair[1])

    # The fuzzy string ratio of the winner adds a bonus weighted 0.1.
    final_score = best_score + (get_fuzzy_score(message, best_idx) * 0.1)

    threshold = 0.25
    if final_score >= threshold:
        return answers[best_idx]
    return fallback_response(final_score)
# Text shown beneath the title of the chat UI.
_DESCRIPTION = """
    Chatbot berbasis retrieval (BM25 + TF-IDF) untuk QA Bahasa Indonesia.
    **Fun fact:** AI ini namanya Caca Kecil karena creator-nya (Lyon) punya selera penamaan yang... unik 😂
    Model size: 2.83 MB | QA pairs: 3,500+ | No LLM needed!
    """

# Sample prompts offered to the user.
_EXAMPLES = [
    "siapa nama kamu?",
    "ceritakan tentang dirimu",
    "siapa itu Lyon?",
    "kenapa namamu Caca?",
    "kamu bisa apa?",
]

# Build the Gradio chat interface around the ``chat`` callback.
demo = gr.ChatInterface(
    fn=chat,
    title="💬 Chatbot Caca",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    theme="soft",
    chatbot=gr.Chatbot(height=400),
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()