import json
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random
from sentence_transformers import SentenceTransformer
import gradio as gr
import time

# Load datasets.
# phone_number is read as str so leading zeros survive; the whole frame is
# cast to str, which means a blank CSV cell reaches the code as the string
# "nan" rather than a float NaN.
lecturer_data = pd.read_csv('lecturers.csv', dtype={"phone_number": str}).astype(str)
doc_link_data = pd.read_csv('docs_link.csv')
with open('anjibot_data.json', 'r', encoding='utf-8') as file:
    anjibot_data = json.load(file)


def load_default_responses(filename):
    """Read one canned fallback response per line from *filename*, stripped."""
    with open(filename, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]


# Canned replies used when a question only loosely matches the knowledge base.
default_responses = load_default_responses('default_responses.txt')

# Sentence-embedding model used for semantic question matching.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def encode_text(text):
    """Return the sentence-embedding vector for a single string."""
    return model.encode([text])[0]


# Precompute the knowledge-base embeddings ONCE at startup.  The original
# implementation re-encoded every stored question inside answer_general_query,
# i.e. one model forward pass per KB entry on every single user message.
_kb_embeddings = np.array([encode_text(item['question']) for item in anjibot_data])


def answer_general_query(user_question):
    """Answer *user_question* from the JSON knowledge base.

    Cosine similarity against precomputed question embeddings picks the
    closest stored question.  Strong matches (> 0.5) return the stored
    answer; weak matches (> 0.3) return a random canned response; anything
    else returns an apology.
    """
    user_embedding = encode_text(user_question)
    similarities = cosine_similarity([user_embedding], _kb_embeddings)
    best_index = int(np.argmax(similarities))
    best_score = similarities[0][best_index]
    if best_score > 0.5:
        return anjibot_data[best_index]['answer']
    elif best_score > 0.3:
        return random.choice(default_responses)
    else:
        return "I'm sorry, I couldn't find the answer to your question. Please meet Anji or any of the class excos."
def normalize_text(text):
    """Lowercase *text*, drop possessive "'s", strip punctuation; return word set."""
    normalized_words = set()
    for word in text.lower().split():
        # Remove the possessive suffix while the apostrophe is still visible.
        # The original used word.rstrip("'s"), which strips *characters* from
        # a set and mangled ordinary words: "class" -> "cla", "boss" -> "bo".
        if word.endswith("'s"):
            word = word[:-2]
        cleaned = ''.join(char for char in word if char.isalnum())
        if cleaned:
            normalized_words.add(cleaned)
    return normalized_words


# Stop words ignored when counting word overlap between a query and a record.
# Deduplicated (the old list repeated 'in', 'and', 'of', 'the') and 'mrs.'
# corrected to 'mrs' — normalized words can never contain a period.
exceptions = {"mr", "mrs", "dr", "the", "i", "to", "ayo", "in", "of", "and",
              "a", "for", "with", "by", "at"}


def word_lookup(text, query, exceptions=exceptions):
    """Count the distinct meaningful words shared by *text* and *query*."""
    text_words = normalize_text(text)
    query_words = normalize_text(query)
    # Intersection minus stop words == the original loop's matching_sequences.
    return len((text_words & query_words) - exceptions)


def _is_missing(value):
    """True when a CSV field is effectively empty ('' or NaN cast to 'nan')."""
    # lecturer_data is .astype(str), so a blank cell arrives as the truthy
    # string "nan" — a plain truthiness test never detects it.
    return not value or str(value).lower() == "nan"


def get_phone_number_response(best_match):
    """Build the phone-number reply for a matched lecturer row."""
    if _is_missing(best_match['phone_number']):
        return "Sorry, the phone number is not available."
    return f"Sure! {best_match['name']} the {best_match['course']} ({best_match['course_code']}) lecturer's phone number is {best_match['phone_number']}."


def get_office_response(best_match):
    """Build the office-location reply for a matched lecturer row."""
    if best_match['office'] == "No longer in Babcock":
        return f"Oops! {best_match['name']} the {best_match['course']} ({best_match['course_code']}) lecturer is {best_match['office']}."
    if _is_missing(best_match['office']):
        return "Sorry, the office location is not available."
    return f"Sure thing! {best_match['name']} the {best_match['course']} ({best_match['course_code']}) lecturer's office is at {best_match['office']}."


def get_basic_info_response(query, best_match):
    """Reply with the course code or the lecturer's name, depending on *query*."""
    if "code" in query:
        return f"The course code for {best_match['course']} is {best_match['course_code']}"
    return f"{best_match['name']} is the {best_match['course']} ({best_match['course_code']}) lecturer."
def get_default_response(best_match): return f"{best_match['course']} has the course code: {best_match['course_code']}" def process_query(query, best_match): if "phone number" in query or "number" in query: return get_phone_number_response(best_match) elif "office" in query: return get_office_response(best_match) elif any(word in query for word in ["lecturer", "who", "code"]): return get_basic_info_response(query, best_match) else: return get_default_response(best_match) def answer_lecturer_query(query): query = query.lower() max_score = 0 best_match = None for index, row in lecturer_data.iterrows(): text = f"{row['course']} {row['course_code']} {row['name']}".lower() score = word_lookup(query, text) # Find the highest score if score > max_score: max_score = score best_match = row if max_score >= 1: if any(word in query for word in ["cosc", "geds", "ged"]): for i, word in enumerate(query.split()): if word.isdigit(): # Retrieve the prefix from the previous word query_course_code = f"{query.split()[i - 1]} {word}" if query_course_code.upper() == best_match['course_code']: return process_query(query, best_match) else: return "Sorry, I couldn't find info about the course you've mentioned." else: return process_query(query, best_match) else: return answer_general_query(query) def get_links_response(query, best_match): school_files = ["past questions", "pst questions", "pq", "pstq", "slides for"] study_smarter = ["flashcards", "study set", "study", "study app", "study link", "slides", "today", "class", "lecturer"] if any(keyword in query for keyword in school_files): if best_match['School files Link'] != "Unavailable": return f"Looking for slides and/or past questions for {best_match['course']} ({best_match['course_code']})? This link should help you: {best_match['School files Link']}" else: return f"Oops! Sorry, I can't find slides or past questions for that course." 
elif any(keyword in query for keyword in study_smarter): if best_match['Study Smarter Link'] != "Unavailable": return f"The Study Smarter study set for {best_match['course']} ({best_match['course_code']}) contains the recent slides sent by the lecturer (and possibly flashcards, notes, and more learning resources). The link to the study set: {best_match['Study Smarter Link']}" else: return f"I'm sorry, I can't find any study smarter study set for that course." def answer_doc_link_query(query): query = query.lower() max_score = 0 best_match = None for index, row in doc_link_data.iterrows(): text = f"{row['course']} {row['course_code']}".lower() score = word_lookup(query, text) # Find the highest score if score > max_score: max_score = score best_match = row if max_score >= 1: if any(word in query for word in ["cosc", "geds", "ged"]): for i, word in enumerate(query.split()): if word.isdigit(): # Retrieve the prefix from the previous word query_course_code = f"{query.split()[i - 1]} {word}" if query_course_code.upper() == best_match['course_code']: return get_links_response(query, best_match) else: return "Sorry, I couldn't find info about the course you've mentioned." else: return get_links_response(query, best_match) else: return "Sure! To assist you better, please provide the name or code of the course you are referring to, along with the entire query." 
def get_intent(query):
    """Classify *query* as "unknown", "lecturer", "doc_link", or "general".

    Keyword checks run in priority order; "unknown" wins first so that
    out-of-scope requests are deflected before other keywords can match.
    """
    # NOTE: the original list had a missing comma ("lecturer's" "phone number"),
    # which implicit string concatenation fused into the single keyword
    # "lecturer'sphone number" — neither phrase could ever match on its own.
    lecturer_keywords = ["lecturer", "lecturer's", "phone number", "number",
                         "office", "who", "code", "course", "name"]
    doc_link_keywords = ["past questions", "pstq", "pq", "pst",
                         "study materials", "flashcards", "studysmarter",
                         "study smarter", "slides", "slide", "pdf"]
    unknown_keywords = ["email", "missed", "write"]

    query_lower = query.lower()
    if any(keyword in query_lower for keyword in unknown_keywords):
        return "unknown"
    elif any(keyword in query_lower for keyword in lecturer_keywords):
        return "lecturer"
    elif any(keyword in query_lower for keyword in doc_link_keywords):
        return "doc_link"
    else:
        return "general"


def get_response(query):
    """Dispatch *query* to the handler chosen by get_intent."""
    if query == "":
        # Stickers/attachments arrive as empty text.
        return "Yo! Don't send me stickers, I don't understand them anyway 😕"
    intent = get_intent(query)
    if intent == "unknown":
        return "Ugh, your query is quite beyond me. Please meet Anji directly :)"
    if intent == "lecturer":
        return answer_lecturer_query(query)
    if intent == 'doc_link':
        return answer_doc_link_query(query)
    return answer_general_query(query)


# --- Gradio chat UI -------------------------------------------------------
with gr.Blocks() as iface:
    gr.Markdown(
        """
        # Anjibot
        Hi friend! I'm Anjibot, CS Group A AI Course Rep. How can I assist you today?
        """)
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type your question here", label="User")
    submit = gr.Button("Submit")
    clear = gr.ClearButton([msg, chatbot])
    with gr.Accordion("Read this, pleaseeee"):
        gr.Markdown(
            """
            #### As you interact with me, please note:
            - Our chats are not private.
            - I'm still undergoing training (I'm not perfect).
            - I'm not ChatGPT (My knowledge base is limited to class-related issues).
            - I'm British ;)
            """)

    def respond(message, chat_history):
        """Gradio callback: append the (user, bot) turn and clear the textbox."""
        bot_message = get_response(message)
        chat_history.append(
            (f"**You:** {message}", f"**Anjibot:** {bot_message}"))
        # Small artificial delay so replies don't feel jarringly instantaneous.
        time.sleep(2)
        return "", chat_history

    submit.click(respond, [msg, chatbot], [msg, chatbot])
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    iface.launch()