QueenS5Ella commited on
Commit
f5b6b34
·
verified ·
1 Parent(s): e6a0ae8

Upload medic_bot.py

Browse files
Files changed (1) hide show
  1. medic_bot.py +359 -0
medic_bot.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Medic_bot.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/#fileId=https%3A//huggingface.co/spaces/QueenS5Ella/Royalty/blob/main/Medic_bot.ipynb
8
+ """
9
+
10
+ # IMPORT THE NECESSARY LIBRARIES 1
11
+ #Import Python libraries: Numpy and Pandas
12
+ import pandas as pd
13
+ from sklearn.feature_extraction.text import TfidfVectorizer
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+ from openai import OpenAI
16
+ import faiss
17
+ import numpy as np
18
+
19
+ #import libraries &modules for data visualization
20
+ from pandas.plotting import scatter_matrix
21
+ from matplotlib import pyplot
22
+
23
+ #import scikit-learn module for algorithm/model: K-Nearest Neighbors regressor
24
+ from sklearn.neighbors import KNeighborsRegressor
25
+
26
+ #import scikit learn module to split the dataset into train/test sub-datasets
27
+ from sklearn.model_selection import train_test_split
28
+
29
+ #Import scikit-learn module for K-fold cross validation - algorithm/model evaluation & validation
30
+ from sklearn.model_selection import KFold
31
+ from sklearn.model_selection import cross_val_score
32
+
33
+ #Import scikit-learn module for classification report
34
+ from sklearn.metrics import classification_report
35
+
36
+ from sklearn.preprocessing import LabelEncoder
37
+ from sklearn.preprocessing import OrdinalEncoder
38
+
39
+ # IMPORTATION OF NECESSARY LIBRARIES 2
40
+ import os # for handling data
41
+ import re # for text preprocessing
42
+
43
+ # For Natural Language Processing tasks
44
+ import nltk
45
+ from sklearn.model_selection import train_test_split
46
+
47
+ nltk.download("punkt")
48
+ nltk.download("stopwords")
49
+
50
+ # Optional: for vectorization and building of the models
51
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
52
+
53
+ #IMPORTATION OF THE DIFFERENT MODELS FOR THE CHATBOT
54
+ from sklearn.linear_model import LogisticRegression
55
+ from sklearn.ensemble import RandomForestRegressor
56
+ import xgboost as xgb
57
+ from sklearn.linear_model import Ridge
58
+ from sklearn.neural_network import MLPRegressor
59
+
60
+ import scipy
61
+ print(scipy.__version__)
62
+
63
+ import gradio as gr
64
+
65
# 🔑 The OpenAI API key is taken from the OPENAI_API_KEY environment variable
# so that no secret has to be written into the source. The "sk-..."
# placeholder is only a fallback for when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "sk-..."))

# 📄 Load the dataset and drop rows missing either text column used below.
d1 = pd.read_csv("ai-medical-chatbot.csv")
d1.dropna(subset=["Description", "Doctor"], inplace=True)

# Fit TF-IDF on the descriptions. The result is kept as a sparse matrix
# (no .toarray()); row i of qvs corresponds to d1.iloc[i].
vector1 = TfidfVectorizer()
qvs = vector1.fit_transform(d1["Description"])

d1.head()  # notebook preview; displays nothing when run as a plain script
77
+
78
def find_best_match(user_input):
    """Look up the dataset row whose description best matches ``user_input``.

    The query is transformed with the fitted ``vector1`` and compared with
    every row of ``qvs`` by cosine similarity.

    Returns:
        A ``(description, doctor_reply, score)`` tuple for the highest-scoring
        row of ``d1``; ``score`` is a plain ``float``.
    """
    query_vec = vector1.transform([user_input])  # remains a sparse matrix
    scores = cosine_similarity(query_vec, qvs)[0]
    top = np.argmax(scores)
    row = d1.iloc[top]
    return row["Description"], row["Doctor"], float(scores[top])
84
+
85
+ # πŸ” Vectorize questions
86
+ #vectorizer = TfidfVectorizer()
87
+ #question_vectors = vectorizer.fit_transform(df["Question"]).toarray()
88
+
89
+ # 🔎 Find the most similar FAQ match
90
+ #def find_best_match(user_input):
91
+ #user_vec = vectorizer.transform([user_input]).toarray()
92
+ #similarities = cosine_similarity(user_vec, question_vectors)
93
+ #best_idx = np.argmax(similarities[0])
94
+ # best_score = float(similarities[0][best_idx])
95
+ # return df.iloc[best_idx]["Question"], df.iloc[best_idx]["Answer"], best_score
96
+
97
+ # 🤖 Query OpenAI if no good FAQ match
98
def query_gpt(user_input):
    """Ask the OpenAI chat model to answer ``user_input``.

    Args:
        user_input: The user's question, sent as the single user message.

    Returns:
        The text of the model's reply. If the request raises, a string
        beginning with "⚠️ GPT Error:" that carries the exception message is
        returned instead, so the caller always gets something displayable.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4",  # or use "gpt-3.5-turbo"
            messages=[
                {"role": "system", "content": "You are a pediatric pulmonology expert."},
                # The user's question must be the last turn; a trailing
                # assistant "Hello" message used to follow it and has been
                # removed so the model answers the question itself.
                {"role": "user", "content": user_input},
            ],
        )
        # `client.chat.completions.create` returns message objects, not
        # dicts: the text is the `.content` attribute. Subscripting with
        # ["content"] raised a TypeError on every call.
        return response.choices[0].message.content
    except Exception as e:
        return f"⚠️ GPT Error: {str(e)}"
111
+
112
+ # 💬 Chatbot response logic
113
def chatbot_response(user_input):
    """Answer a question from the dataset when possible, otherwise via GPT.

    Args:
        user_input: Text typed by the user. ``None``, empty and
            whitespace-only values are all treated as "no question".

    Returns:
        A prompt to enter a question for blank input; the matched dataset
        entry when its similarity score exceeds 0.75; otherwise the reply
        from ``query_gpt``. Any exception raised while answering is turned
        into an error message string rather than propagated.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise before the try block below could catch it.
    if not user_input or not user_input.strip():
        return "Please enter a question."

    try:
        matched_q, matched_a, score = find_best_match(user_input)
        if score > 0.75:
            return f"📚 **Answer from FAQ**:\n\n**Q:** {matched_q}\n**A:** {matched_a}"
        gpt_answer = query_gpt(user_input)
        return f"🤖 **Answer from GPT-4**:\n\n{gpt_answer}"
    except Exception as e:
        return f"❌ Error processing your question: {str(e)}"
126
+
127
+ # 🌐 Launch Gradio interface
128
# Wire chatbot_response to a one-textbox-in, one-textbox-out Gradio page and
# serve it with a public share link.
gr.Interface(
    chatbot_response,
    gr.Textbox(label="Ask any pediatric pulmonology related questions"),
    gr.Textbox(label="Response", lines=10),
    title="Pediatric Pulmonology Medicbot",
    description="Answers common non-critical questions about pediatric pulmonology using a mix of FAQ and GPT-4.",
).launch(share=True)
135
+
136
+ # Set your OpenAI key
137
+ #openai.api_key = "sk-..." # <- Replace this with your actual API key
138
+
139
+ # Load CSV
140
chat = pd.read_csv("PedMedQA_final.csv")

# Exploratory checks carried over from the notebook. As bare expressions
# they display nothing when this file runs as a plain script.
chat.head()
chat.describe()
chat.isnull().sum()
chat.shape
chat.info()

chat["answer"].unique()
chat["answer"].value_counts()

# Missing answers are filled with the label "Reassurance".
chat["answer"] = chat["answer"].fillna("Reassurance")
print(chat["answer"])

# Fixed: these two were bare attribute references (`.unique`,
# `.value_counts`) that never actually called the methods.
chat["age_years"].unique()
chat["age_years"].value_counts()

chat.head()
chat.dtypes

# Keep only rows with both a question and an answer, one row per question.
chat.dropna(subset=["question", "answer"], inplace=True)
chat.drop_duplicates(subset=["question"], inplace=True)

chat.isnull().sum()

# Peek at the raw columns. An OrdinalEncoder pass over each of them was
# tried in the notebook and left disabled, so the columns stay as loaded.
chat["index"].head(3)
chat["meta_info"].head(3)
chat["question"].head(3)
chat["answer_idx"].head(3)
chat["answer"].head(3)
chat["options"].head(3)

chat.shape
chat.columns
195
+
196
+ from sklearn.linear_model import LassoCV
197
+ from sklearn.feature_selection import SelectFromModel
198
+
199
+ #clf = LassoCV.fit(X_train, Y_trarin)
200
+ #importance = np.abs(clf.coef)
201
+ #print(importance)
202
+
203
# Console loop: answer questions until the user types 'exit'.
# user_input is pre-set so it exists for the code after the loop even when
# no line could be read at all.
user_input = ""
while True:
    try:
        user_input = input("You can ask me any pediatric pulmonology related question (or type 'exit'): ")
    except EOFError:
        # No interactive stdin (for example a hosted, non-terminal run):
        # leave the loop instead of crashing the whole script.
        break

    # Ignore surrounding whitespace so " exit " also ends the session.
    if user_input.strip().lower() == "exit":
        break

    response = chatbot_response(user_input)
    print(response)
211
+
212
+ #response = chatbot_response(ui)
213
+ #print(response)
214
# Make sure no incomplete rows reach the vectorizer.
chat.dropna(subset=["question", "answer"], inplace=True)

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the questions using TF-IDF
# ✅ 1. Fit and transform the dataset questions; row i of qvs corresponds
#       to chat.iloc[i].
vector1 = TfidfVectorizer()
qvs = vector1.fit_transform(chat["question"]).toarray()

# ✅ 2. Later, transform user input using the same vectorizer
user_vec = vector1.transform([user_input]).toarray()

# 🔌 The OpenAI `client` is created earlier in the file; no key is set here.

# 📄 Step 1: the dataset is `chat`, already loaded and cleaned above.
# A `df.dropna(subset=["Question", "Answer"], ...)` call used to sit here,
# but `df` is never defined in this file (NameError) and `chat` has no
# capitalised "Question"/"Answer" columns, so the call has been removed.
231
+
232
+ # 🧠 Step 2: Vectorize dataset questions
233
+ #vectorizer = TfidfVectorizer()
234
+ #question_vectors = vectorizer.fit_transform(df["Question"]).toarray()
235
+
236
+ # 🔍 Step 3: Find most similar question
237
def find_best_match(user_input):
    """Find the question in ``chat`` most similar to ``user_input``.

    The query is vectorized with the fitted ``vector1`` and scored against
    ``qvs`` by cosine similarity.

    Returns:
        A ``(question, answer, score)`` tuple for the best-scoring row of
        ``chat``; ``score`` is a plain ``float``.
    """
    user_vec = vector1.transform([user_input]).toarray()
    similarities = cosine_similarity(user_vec, qvs)
    best_idx = np.argmax(similarities[0])
    # Fixed: the score was read with an undefined `answer_idx`, and the
    # question was read from an undefined `df`; both now use best_idx/chat.
    best_score = float(similarities[0][best_idx])
    row = chat.iloc[best_idx]
    return row["question"], row["answer"], best_score
243
+
244
+ # 🤖 Step 4: Fallback to GPT-4 if no good match
245
def query_gpt(user_input):
    """Ask GPT-4 to answer ``user_input``.

    Args:
        user_input: The user's question, sent as the user message.

    Returns:
        The text of the model's reply, or a string beginning with
        "⚠️ GPT Error:" that carries the exception message if the request
        raises.
    """
    # The request itself is the risky call, so it lives inside the try.
    # The function used to discard the response, call an undefined
    # placeholder `risky_function()`, and implicitly return None.
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a pediatric pulmonology expert."},
                {"role": "user", "content": user_input},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"⚠️ GPT Error: {str(e)}"
258
+
259
+ # 💬 Step 5: Define chatbot logic
260
def chatbot_response(user_input):
    """Answer from the dataset when the match is strong, otherwise via GPT-4.

    Args:
        user_input: Text typed by the user. ``None``, empty and
            whitespace-only values are treated as "no question".

    Returns:
        A prompt to enter a question for blank input; the matched
        question/answer pair when its similarity score exceeds 0.75;
        otherwise the text produced by ``query_gpt``.
    """
    # Blank input would only produce a meaningless best match, so it is
    # rejected up front.
    if not user_input or not user_input.strip():
        return "Please enter a question."

    matched_q, matched_a, score = find_best_match(user_input)
    if score > 0.75:
        return f"📚 Answer from FAQ:\nQ: {matched_q}\nA: {matched_a}"
    return f"🤖 Answer from GPT-4:\n{query_gpt(user_input)}"
266
+
267
+ # 🌐 Step 6: Launch Gradio interface
268
# Serve chatbot_response through a Gradio page with a public share link.
gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(label="Ask any pediatric pulmonology related question"),
    outputs=gr.Textbox(label="Response"),
    title="Royalty Medic_bot",
    # Spelling fixed: the user-facing description previously read "non-crtical".
    description="Get non-critical answers to common pediatric respiratory health questions.",
).launch(share=True)
275
+
276
+
277
def find_best_match(user_input):
    """Return the closest question in ``chat`` with its answer and score.

    Returns:
        A ``(question, answer, score)`` tuple for the row of ``chat`` whose
        TF-IDF vector has the highest cosine similarity to ``user_input``.
    """
    # Fixed: this version referenced `vectorizer` and `question_vectors`,
    # which exist only in commented-out code. The fitted objects in this
    # file are `vector1` and `qvs`, built from chat["question"].
    input_vec = vector1.transform([user_input]).toarray()
    sims = cosine_similarity(input_vec, qvs)
    idx = np.argmax(sims[0])
    # Fixed: the row was selected with an undefined `answer_idx` and with
    # capitalised column names that `chat` does not have.
    score = sims[0][idx]
    row = chat.iloc[idx]
    return row["question"], row["answer"], score
283
+
284
# Console loop: keep answering until the user types 'exit'.
while True:
    try:
        user_input = input("🧒 Ask a pediatric pulmonology question (or type 'exit'): ")
    except EOFError:
        # No interactive stdin available: stop instead of crashing.
        break
    # Ignore surrounding whitespace so " exit " also ends the session.
    if user_input.strip().lower() == "exit":
        print("👋 Goodbye!")
        break
    print(chatbot_response(user_input))
290
+
291
+
292
def chatbot_gradio_interface(user_input):
    """Gradio callback: pass the textbox value straight to ``chatbot_response``."""
    reply = chatbot_response(user_input)
    return reply
294
+
295
# Plain text-in / text-out Gradio page around chatbot_gradio_interface,
# served with a public share link.
gr.Interface(
    chatbot_gradio_interface,
    "text",
    "text",
    title="Pediatric Pulmonology Medicbot",
    description="Ask any question related to pediatric lung health.",
).launch(share=True)
300
+
301
+
302
+
303
+
304
+
305
+ # Build FAISS index for similarity search
306
# Fixed: the index was built from `question_vectors`, which exists only in
# commented-out code. The dense TF-IDF matrix fitted in this file is `qvs`.
# Faiss operates on float32 data, so the matrix is converted explicitly.
_faiss_vectors = np.ascontiguousarray(qvs, dtype="float32")
index = faiss.IndexFlatL2(_faiss_vectors.shape[1])
index.add(_faiss_vectors)
308
+
309
+ # Function to find the closest question
310
def find_most_similar_question(user_question, top_k=1):
    """Return the nearest stored question and its answer via the FAISS index.

    Args:
        user_question: Free-text question to look up.
        top_k: Number of neighbours requested from the index; only the
            nearest one is returned.

    Returns:
        A ``(question, answer)`` tuple taken from the closest row of ``chat``.
    """
    # Fixed: `vectorizer` and `df` are not defined in this file; the fitted
    # vectorizer is `vector1` and the dataset is `chat` (lower-case columns).
    # The query is cast to float32, the dtype Faiss operates on.
    user_vec = np.ascontiguousarray(
        vector1.transform([user_question]).toarray(), dtype="float32"
    )
    D, I = index.search(user_vec, top_k)
    row = chat.iloc[I[0][0]]
    return row["question"], row["answer"]
314
+
315
+ # Function to query a language model
316
def ask_openai(question, model="gpt-4"):
    """Send ``question`` to an OpenAI chat model and return the reply text.

    Args:
        question: The user's question, sent as the user message.
        model: Name of the chat model to query.

    Returns:
        The reply text, or ``None`` if the request raised (the error is
        printed together with the model name).
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a pediatric pulmonology expert."},
                {"role": "user", "content": question},
            ],
            temperature=0.3,
        )
        # Fixed: the reply is a message object, not a dict. Subscripting it
        # with ["content"] raised on every call, so this function always
        # returned None.
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error with {model}: {e}")
        return None
330
+
331
+ # Main chatbot function
332
def pediatric_pulmonology_chatbot(user_input):
    """Answer from the knowledge base when close enough, else ask OpenAI.

    The nearest stored question is fetched with
    ``find_most_similar_question``. If its TF-IDF cosine similarity to
    ``user_input`` exceeds 0.7 the stored answer is returned; otherwise GPT-4
    is tried first and GPT-3.5 second.

    Returns:
        The formatted answer string, or an apology when neither model
        produced a reply.
    """
    matched_question, matched_answer = find_most_similar_question(user_input)

    # Fixed: `vectorizer` is not defined in this file; the fitted TF-IDF
    # vectorizer is `vector1`.
    similarity = cosine_similarity(
        vector1.transform([user_input]), vector1.transform([matched_question])
    )[0][0]

    if similarity > 0.7:
        return f"(From Knowledge Base)\nQ: {matched_question}\nA: {matched_answer}"

    # Try the models in order of preference; the first non-empty reply wins.
    for model, label in (("gpt-4", "GPT-4"), ("gpt-3.5-turbo", "GPT-3.5")):
        reply = ask_openai(user_input, model=model)
        if reply:
            return f"(From {label})\n{reply}"
    return "Sorry, I couldn't find an answer to that."
353
+
354
+ # 🔁 Example interaction
355
# Console loop for the FAISS-backed chatbot; ends when the user types 'exit'.
while True:
    try:
        user_input = input("\n👶 Ask a pediatric pulmonology question (or type 'exit'): ")
    except EOFError:
        # No interactive stdin available: stop instead of crashing.
        break
    # Ignore surrounding whitespace so " exit " also ends the session.
    if user_input.strip().lower() == "exit":
        break
    print(pediatric_pulmonology_chatbot(user_input))