Spaces:
Sleeping
Sleeping
File size: 3,358 Bytes
c361372 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
class DataProcessor:
    """Load a question/answer CSV, clean it, and index its questions with TF-IDF.

    Attributes set by ``__init__``:
        df: Training split of the cleaned data, one row per unique question.
        test_df: Held-out split of the cleaned data.
        vectorizer: ``TfidfVectorizer`` fitted on ``df['question']``.
        tfidf_matrix: TF-IDF matrix whose i-th row corresponds to ``df.iloc[i]``.
    """

    def __init__(self, file_path=None):
        """Read the CSV, clean it, and fit the TF-IDF vectorizer.

        Args:
            file_path: Path to a CSV with ``question``, ``answer`` and
                ``source`` columns. When falsy, ``cleaned_medquad.csv``
                located next to this module is used.
        """
        if not file_path:
            # Resolve path relative to this file
            file_path = os.path.join(os.path.dirname(__file__), "cleaned_medquad.csv")
        print(f"Loading data from: {file_path}")
        self.df = pd.read_csv(file_path)
        self.clean_data()
        self.vectorizer = TfidfVectorizer()
        # Fit only after clean_data() so that matrix rows line up positionally
        # with the rows of self.df (lookups elsewhere use .iloc).
        self.tfidf_matrix = self.vectorizer.fit_transform(self.df['question'])

    def clean_data(self):
        """Normalize ``self.df`` in place and split it into train/test parts.

        Steps: report missing values, drop rows lacking a question or an
        answer, drop rows whose answer mentions "key points", lower-case and
        strip both text columns, merge rows sharing a question (answers joined
        with " || ", unique sources joined with ", "), then split 80/20.

        After the call ``self.df`` holds the training split and
        ``self.test_df`` the held-out split.
        """
        print("🧹 Cleaning data...")
        print("Missing values:\n", self.df.isnull().sum())

        # A row without a question cannot be matched and a row without an
        # answer would otherwise be merged in as the literal text "nan".
        cleaned = self.df.dropna(subset=['question', 'answer'])

        # Remove rows with 'key points' in the answer. The explicit copy makes
        # the column assignments below operate on an independent frame instead
        # of a filtered view (avoids pandas' SettingWithCopyWarning).
        has_key_points = cleaned['answer'].str.contains(r'key points', case=False, na=False)
        cleaned = cleaned[~has_key_points].copy()

        # Preprocess question and answer columns
        cleaned['question'] = cleaned['question'].str.lower().str.strip()
        cleaned['answer'] = cleaned['answer'].str.lower().str.strip()

        # Group by question to merge duplicate entries
        cleaned = cleaned.groupby("question", as_index=False).agg({
            "answer": lambda x: " || ".join(x.astype(str)),
            "source": lambda x: ", ".join(x.astype(str).unique()),
        })

        # Split into train/test for potential future use
        self.df, self.test_df = train_test_split(cleaned, test_size=0.2, random_state=42)
        print("✅ Data cleaned and split.")
class Chatbot:
    """Answer questions by returning the stored entry most similar to the query.

    The bot relies on a data processor object exposing ``vectorizer``,
    ``tfidf_matrix`` and ``df`` (with ``question``, ``answer`` and ``source``
    columns), where matrix row i corresponds to ``df.iloc[i]``.
    """

    def __init__(self, data_processor, similarity_threshold=0.5):
        """Store the data source and the minimum score for accepting a match.

        Args:
            data_processor: Object providing ``vectorizer``, ``tfidf_matrix``
                and ``df``.
            similarity_threshold: Cosine-similarity score below which the best
                match is rejected and a fallback reply is returned.
        """
        self.data_processor = data_processor
        self.similarity_threshold = similarity_threshold

    def get_response(self, user_query):
        """Return the best matching entry for ``user_query``.

        Args:
            user_query: Free-text question; it is lower-cased and stripped
                before vectorization.

        Returns:
            A ``(question, answer, source)`` tuple. When the best score is
            below ``similarity_threshold`` the tuple is
            ``("UNKNOWN", <apology text>, "N/A")``. In a matched answer the
            " || " separators between merged answers are turned into
            "\\n- " so each merged answer starts on its own line.
        """
        user_query = user_query.lower().strip()
        user_tfidf = self.data_processor.vectorizer.transform([user_query])
        similarities = cosine_similarity(user_tfidf, self.data_processor.tfidf_matrix)

        # similarities has a single row, so the flat argmax is the column index
        best_match_index = similarities.argmax()
        best_score = similarities[0, best_match_index]
        if best_score < self.similarity_threshold:
            return "UNKNOWN", "I'm sorry, I don't have enough information to answer that question.", "N/A"

        # One positional lookup instead of three separate ones
        best_row = self.data_processor.df.iloc[best_match_index]
        best_answer = best_row['answer'].replace(" || ", "\n- ")
        return best_row['question'], best_answer, best_row['source']

    @staticmethod
    def _is_exit_command(text):
        """Return True when ``text`` is the word "exit", ignoring case and surrounding whitespace."""
        return text.strip().lower() == "exit"

    def chat(self):
        """Run an interactive console loop until the user types 'exit' or stdin closes."""
        print("💬 Medical Chatbot is ready! Type 'exit' to stop chatting.")
        while True:
            try:
                user_input = input("\nYou: ")
            except EOFError:
                # stdin was closed (piped input ran out, Ctrl-D): end the
                # session the same way an explicit 'exit' does.
                print("\nChatbot: Goodbye! Take care!")
                break

            # Strip before comparing so "exit " or " Exit" also quit, matching
            # the normalization get_response applies to queries.
            if self._is_exit_command(user_input):
                print("Chatbot: Goodbye! Take care!")
                break

            best_question, response, source = self.get_response(user_input)
            if best_question == "UNKNOWN":
                print(f"\n🤖 Chatbot: {response}")
            else:
                print(f"\n🤖 Chatbot (matched question): {best_question}")
                print(f"💡 Answer: {response}")
                print(f"Source: {source}")
# ✅ Print to confirm successful module load
print("✅ chatbot_function.py loaded successfully")