File size: 3,358 Bytes
c361372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


class DataProcessor:
    """Load the MedQuAD CSV, clean it, and build a TF-IDF index over questions."""

    def __init__(self, file_path=None):
        """Load the dataset, clean it, and fit the TF-IDF vectorizer.

        Args:
            file_path: Optional path to the CSV. Defaults to
                ``cleaned_medquad.csv`` located next to this module.
        """
        if not file_path:
            # Resolve path relative to this file so imports work from any CWD
            file_path = os.path.join(os.path.dirname(__file__), "cleaned_medquad.csv")
        print(f"πŸ“‚ Loading data from: {file_path}")

        self.df = pd.read_csv(file_path)
        self.clean_data()
        # Fit on the cleaned training questions; tfidf_matrix row i corresponds
        # to self.df.iloc[i], which the chatbot relies on for lookups.
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.df['question'])

    def clean_data(self):
        """Normalize text, merge duplicate questions, and split train/test.

        Side effects: rebinds ``self.df`` to the 80% training split and
        creates ``self.test_df`` holding the remaining 20%.
        """
        print("🧹 Cleaning data...")
        print("πŸ” Missing values:\n", self.df.isnull().sum())

        # Drop rows missing a question or answer up front. Previously NaN
        # answers slipped through (the 'key points' filter used na=False) and
        # were later stringified to the literal "nan" inside merged answers.
        self.df = self.df.dropna(subset=['question', 'answer'])

        # Remove boilerplate rows whose answer mentions 'key points'
        self.df = self.df[~self.df['answer'].str.contains(r'key points', case=False, na=False)]

        # Normalize case/whitespace so duplicate questions group together
        self.df['question'] = self.df['question'].str.lower().str.strip()
        self.df['answer'] = self.df['answer'].str.lower().str.strip()

        # Merge duplicate questions: concatenate answers, deduplicate sources
        self.df = self.df.groupby("question", as_index=False).agg({
            "answer": lambda x: " || ".join(x.astype(str)),
            "source": lambda x: ", ".join(x.astype(str).unique())
        })

        # Fixed seed keeps the train/test split reproducible across runs
        self.df, self.test_df = train_test_split(self.df, test_size=0.2, random_state=42)
        print("βœ… Data cleaned and split.")


class Chatbot:
    """Retrieval-style chatbot that answers via TF-IDF cosine similarity."""

    def __init__(self, data_processor, similarity_threshold=0.5):
        """Keep a reference to the prepared data and the minimum match score."""
        self.data_processor = data_processor
        self.similarity_threshold = similarity_threshold

    def get_response(self, user_query):
        """Return ``(matched_question, answer, source)`` for the best match.

        Falls back to ``("UNKNOWN", <apology>, "N/A")`` when the highest
        cosine similarity is below the configured threshold.
        """
        dp = self.data_processor
        normalized = user_query.lower().strip()
        query_vec = dp.vectorizer.transform([normalized])
        scores = cosine_similarity(query_vec, dp.tfidf_matrix)

        top_idx = scores.argmax()
        if scores[0, top_idx] < self.similarity_threshold:
            return "UNKNOWN", "I'm sorry, I don't have enough information to answer that question.", "N/A"

        # Single positional row lookup; matrix row order matches dp.df order.
        row = dp.df.iloc[top_idx]
        answer_text = row['answer'].replace(" || ", "\n- ")
        return row['question'], answer_text, row['source']

    def chat(self):
        """Run an interactive console loop; type 'exit' to quit."""
        print("πŸ’¬ Medical Chatbot is ready! Type 'exit' to stop chatting.")
        while True:
            user_input = input("\nYou: ")
            if user_input.lower() == "exit":
                print("πŸ‘‹ Chatbot: Goodbye! Take care!")
                break
            matched, reply, src = self.get_response(user_input)
            if matched == "UNKNOWN":
                print(f"\nπŸ€– Chatbot: {reply}")
            else:
                print(f"\nπŸ€– Chatbot (matched question): {matched}")
                print(f"πŸ’‘ Answer: {reply}")
                print(f"πŸ“š Source: {src}")

# βœ… Import-time side effect: prints a banner so callers can confirm this
# module loaded without errors.
print("βœ… chatbot_function.py loaded successfully")