import os import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from sklearn.model_selection import train_test_split class DataProcessor: def __init__(self, file_path=None): if not file_path: # Resolve path relative to this file file_path = os.path.join(os.path.dirname(__file__), "cleaned_medquad.csv") print(f"๐Ÿ“‚ Loading data from: {file_path}") self.df = pd.read_csv(file_path) self.clean_data() self.vectorizer = TfidfVectorizer() self.tfidf_matrix = self.vectorizer.fit_transform(self.df['question']) def clean_data(self): print("๐Ÿงน Cleaning data...") print("๐Ÿ” Missing values:\n", self.df.isnull().sum()) # Remove rows with 'key points' in the answer self.df = self.df[~self.df['answer'].str.contains(r'key points', case=False, na=False)] # Preprocess question and answer columns self.df['question'] = self.df['question'].str.lower().str.strip() self.df['answer'] = self.df['answer'].str.lower().str.strip() # Group by question to merge duplicate entries self.df = self.df.groupby("question", as_index=False).agg({ "answer": lambda x: " || ".join(x.astype(str)), "source": lambda x: ", ".join(x.astype(str).unique()) }) # Split into train/test for potential future use self.df, self.test_df = train_test_split(self.df, test_size=0.2, random_state=42) print("โœ… Data cleaned and split.") class Chatbot: def __init__(self, data_processor, similarity_threshold=0.5): self.data_processor = data_processor self.similarity_threshold = similarity_threshold def get_response(self, user_query): user_query = user_query.lower().strip() user_tfidf = self.data_processor.vectorizer.transform([user_query]) similarities = cosine_similarity(user_tfidf, self.data_processor.tfidf_matrix) best_match_index = similarities.argmax() best_score = similarities[0, best_match_index] if best_score < self.similarity_threshold: return "UNKNOWN", "I'm sorry, I don't have enough information to answer that question.", "N/A" best_question = self.data_processor.df.iloc[best_match_index]['question'] best_answer = self.data_processor.df.iloc[best_match_index]['answer'].replace(" || ", "\n- ") source = self.data_processor.df.iloc[best_match_index]['source'] return best_question, best_answer, source def chat(self): print("๐Ÿ’ฌ Medical Chatbot is ready! Type 'exit' to stop chatting.") while True: user_input = input("\nYou: ") if user_input.lower() == "exit": print("๐Ÿ‘‹ Chatbot: Goodbye! Take care!") break best_question, response, source = self.get_response(user_input) if best_question == "UNKNOWN": print(f"\n๐Ÿค– Chatbot: {response}") else: print(f"\n๐Ÿค– Chatbot (matched question): {best_question}") print(f"๐Ÿ’ก Answer: {response}") print(f"๐Ÿ“š Source: {source}") # โœ… Print to confirm successful module load print("โœ… chatbot_function.py loaded successfully")