QueenS5Ella committed on
Commit
c26d1cd
·
verified ·
1 Parent(s): b891774

Upload Medic_bot.ipynb

Browse files
Files changed (1) hide show
  1. Medic_bot.ipynb +1533 -0
Medic_bot.ipynb ADDED
@@ -0,0 +1,1533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "6a1699ee-e3d0-4cd8-8a0f-b4b749a9ed95",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# IMPORT THE NECESSARY LIBRARIES 1\n",
11
+ "#Import Python libraries: Numpy and Pandas\n",
12
+ "import pandas as pd\n",
13
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
14
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
15
+ "import openai\n",
16
+ "import faiss\n",
17
+ "import numpy as np\n",
18
+ "\n",
19
+ "#import libraries & modules for data visualization\n",
20
+ "from pandas.plotting import scatter_matrix\n",
21
+ "from matplotlib import pyplot\n",
22
+ "\n",
23
+ "#import scikit-learn module for algorithm/model: K-Nearest Neighbors regression\n",
24
+ "from sklearn.neighbors import KNeighborsRegressor\n",
25
+ "\n",
26
+ "#import scikit learn module to split the dataset into train/test sub-datasets\n",
27
+ "from sklearn.model_selection import train_test_split\n",
28
+ "\n",
29
+ "#Import scikit-learn module for K-fold cross validation - algorithm/model evaluation & validation\n",
30
+ "from sklearn.model_selection import KFold\n",
31
+ "from sklearn.model_selection import cross_val_score\n",
32
+ "\n",
33
+ "#Import scikit-learn module for classification report\n",
34
+ "from sklearn.metrics import classification_report\n",
35
+ "\n",
36
+ "from sklearn.preprocessing import LabelEncoder\n",
37
+ "from sklearn.preprocessing import OrdinalEncoder"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 3,
43
+ "id": "43cd247a-6452-4686-b5e0-99d0c303a51e",
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stderr",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "[nltk_data] Downloading package punkt to C:\\Users\\Sharon-\n",
51
+ "[nltk_data] Rose\\AppData\\Roaming\\nltk_data...\n",
52
+ "[nltk_data] Package punkt is already up-to-date!\n",
53
+ "[nltk_data] Downloading package stopwords to C:\\Users\\Sharon-\n",
54
+ "[nltk_data] Rose\\AppData\\Roaming\\nltk_data...\n",
55
+ "[nltk_data] Package stopwords is already up-to-date!\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "# IMPORTATION OF NECESSARY LIBRARIES 2\n",
61
+ "import os # for handling data\n",
62
+ "import re # for text preprocessing\n",
63
+ "\n",
64
+ "# For Natural Language Processing tasks\n",
65
+ "import nltk\n",
66
+ "from sklearn.model_selection import train_test_split\n",
67
+ "\n",
68
+ "nltk.download(\"punkt\")\n",
69
+ "nltk.download(\"stopwords\")\n",
70
+ "\n",
71
+ "# Optional: for vectorization and building of the models\n",
72
+ "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
73
+ "\n",
74
+ "#IMPORTATION OF THE DIFFERENT MODELS FOR THE CHATBOT\n",
75
+ "from sklearn.linear_model import LogisticRegression\n",
76
+ "from sklearn.ensemble import RandomForestRegressor\n",
77
+ "import xgboost as xgb\n",
78
+ "from sklearn.linear_model import Ridge\n",
79
+ "from sklearn.neural_network import MLPRegressor"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 5,
85
+ "id": "15b532ac-c058-4676-814a-ac52d46ef3f2",
86
+ "metadata": {},
87
+ "outputs": [
88
+ {
89
+ "name": "stdout",
90
+ "output_type": "stream",
91
+ "text": [
92
+ "1.16.0\n"
93
+ ]
94
+ }
95
+ ],
96
+ "source": [
97
+ "import scipy\n",
98
+ "print(scipy.__version__)"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 11,
104
+ "id": "cec20cc7-22c4-4505-8779-5692d946eca2",
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "import pandas as pd\n",
109
+ "import numpy as np\n",
110
+ "import openai\n",
111
+ "import gradio as gr\n",
112
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
113
+ "from sklearn.metrics.pairwise import cosine_similarity"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 19,
119
+ "id": "121c1914-e27a-4220-a445-2e7f2e297845",
120
+ "metadata": {},
121
+ "outputs": [
122
+ {
123
+ "data": {
124
+ "text/html": [
125
+ "<div>\n",
126
+ "<style scoped>\n",
127
+ " .dataframe tbody tr th:only-of-type {\n",
128
+ " vertical-align: middle;\n",
129
+ " }\n",
130
+ "\n",
131
+ " .dataframe tbody tr th {\n",
132
+ " vertical-align: top;\n",
133
+ " }\n",
134
+ "\n",
135
+ " .dataframe thead th {\n",
136
+ " text-align: right;\n",
137
+ " }\n",
138
+ "</style>\n",
139
+ "<table border=\"1\" class=\"dataframe\">\n",
140
+ " <thead>\n",
141
+ " <tr style=\"text-align: right;\">\n",
142
+ " <th></th>\n",
143
+ " <th>Description</th>\n",
144
+ " <th>Patient</th>\n",
145
+ " <th>Doctor</th>\n",
146
+ " </tr>\n",
147
+ " </thead>\n",
148
+ " <tbody>\n",
149
+ " <tr>\n",
150
+ " <th>0</th>\n",
151
+ " <td>Q. What does abutment of the nerve root mean?</td>\n",
152
+ " <td>Hi doctor,I am just wondering what is abutting...</td>\n",
153
+ " <td>Hi. I have gone through your query with dilige...</td>\n",
154
+ " </tr>\n",
155
+ " <tr>\n",
156
+ " <th>1</th>\n",
157
+ " <td>Q. What should I do to reduce my weight gained...</td>\n",
158
+ " <td>Hi doctor, I am a 22-year-old female who was d...</td>\n",
159
+ " <td>Hi. You have really done well with the hypothy...</td>\n",
160
+ " </tr>\n",
161
+ " <tr>\n",
162
+ " <th>2</th>\n",
163
+ " <td>Q. I have started to get lots of acne on my fa...</td>\n",
164
+ " <td>Hi doctor! I used to have clear skin but since...</td>\n",
165
+ " <td>Hi there Acne has multifactorial etiology. Onl...</td>\n",
166
+ " </tr>\n",
167
+ " <tr>\n",
168
+ " <th>3</th>\n",
169
+ " <td>Q. Why do I have uncomfortable feeling between...</td>\n",
170
+ " <td>Hello doctor,I am having an uncomfortable feel...</td>\n",
171
+ " <td>Hello. The popping and discomfort what you fel...</td>\n",
172
+ " </tr>\n",
173
+ " <tr>\n",
174
+ " <th>4</th>\n",
175
+ " <td>Q. My symptoms after intercourse threatns me e...</td>\n",
176
+ " <td>Hello doctor,Before two years had sex with a c...</td>\n",
177
+ " <td>Hello. The HIV test uses a finger prick blood ...</td>\n",
178
+ " </tr>\n",
179
+ " </tbody>\n",
180
+ "</table>\n",
181
+ "</div>"
182
+ ],
183
+ "text/plain": [
184
+ " Description \\\n",
185
+ "0 Q. What does abutment of the nerve root mean? \n",
186
+ "1 Q. What should I do to reduce my weight gained... \n",
187
+ "2 Q. I have started to get lots of acne on my fa... \n",
188
+ "3 Q. Why do I have uncomfortable feeling between... \n",
189
+ "4 Q. My symptoms after intercourse threatns me e... \n",
190
+ "\n",
191
+ " Patient \\\n",
192
+ "0 Hi doctor,I am just wondering what is abutting... \n",
193
+ "1 Hi doctor, I am a 22-year-old female who was d... \n",
194
+ "2 Hi doctor! I used to have clear skin but since... \n",
195
+ "3 Hello doctor,I am having an uncomfortable feel... \n",
196
+ "4 Hello doctor,Before two years had sex with a c... \n",
197
+ "\n",
198
+ " Doctor \n",
199
+ "0 Hi. I have gone through your query with dilige... \n",
200
+ "1 Hi. You have really done well with the hypothy... \n",
201
+ "2 Hi there Acne has multifactorial etiology. Onl... \n",
202
+ "3 Hello. The popping and discomfort what you fel... \n",
203
+ "4 Hello. The HIV test uses a finger prick blood ... "
204
+ ]
205
+ },
206
+ "execution_count": 19,
207
+ "metadata": {},
208
+ "output_type": "execute_result"
209
+ }
210
+ ],
211
+ "source": [
212
+ "# 🔑 Replace with your real OpenAI API key\n",
213
+ "openai.api_key = \"sk-...\" # <- Replace this with your actual API key\n",
214
+ "\n",
215
+ "# 📄 Load dataset\n",
216
+ "d1 = pd.read_csv(\"ai-medical-chatbot.csv\")\n",
217
+ "d1.dropna(subset=[\"Description\", \"Doctor\"], inplace=True)\n",
218
+ "\n",
219
+ "vector1 = TfidfVectorizer()\n",
220
+ "# Keep the sparse matrix — don't convert to dense\n",
221
+ "qvs = vector1.fit_transform(d1[\"Description\"]) # No .toarray()\n",
222
+ "\n",
223
+ "d1.head()"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 21,
229
+ "id": "7c0d1a74-52bd-484f-bfc7-ceed36983140",
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "def find_best_match(user_input):\n",
234
+ " user_vec = vector1.transform([user_input]) # Still a sparse matrix\n",
235
+ " similarities = cosine_similarity(user_vec, qvs)\n",
236
+ " best_idx = np.argmax(similarities[0])\n",
237
+ " best_score = float(similarities[0][best_idx])\n",
238
+ " return d1.iloc[best_idx][\"Description\"], d1.iloc[best_idx][\"Doctor\"], best_score"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 77,
244
+ "id": "4898c3af-3e91-42d0-bede-532b65897993",
245
+ "metadata": {},
246
+ "outputs": [
247
+ {
248
+ "name": "stdout",
249
+ "output_type": "stream",
250
+ "text": [
251
+ "* Running on local URL: http://127.0.0.1:7862\n",
252
+ "\n",
253
+ "Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.\n"
254
+ ]
255
+ },
256
+ {
257
+ "data": {
258
+ "text/html": [
259
+ "<div><iframe src=\"http://127.0.0.1:7862/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
260
+ ],
261
+ "text/plain": [
262
+ "<IPython.core.display.HTML object>"
263
+ ]
264
+ },
265
+ "metadata": {},
266
+ "output_type": "display_data"
267
+ },
268
+ {
269
+ "data": {
270
+ "text/plain": []
271
+ },
272
+ "execution_count": 77,
273
+ "metadata": {},
274
+ "output_type": "execute_result"
275
+ }
276
+ ],
277
+ "source": [
278
+ "# 🔍 Vectorize questions\n",
279
+ "#vectorizer = TfidfVectorizer()\n",
280
+ "#question_vectors = vectorizer.fit_transform(df[\"Question\"]).toarray()\n",
281
+ "\n",
282
+ "# 🔎 Find the most similar FAQ match\n",
283
+ "#def find_best_match(user_input):\n",
284
+ " #user_vec = vectorizer.transform([user_input]).toarray()\n",
285
+ " #similarities = cosine_similarity(user_vec, question_vectors)\n",
286
+ " #best_idx = np.argmax(similarities[0])\n",
287
+ " # best_score = float(similarities[0][best_idx])\n",
288
+ " # return df.iloc[best_idx][\"Question\"], df.iloc[best_idx][\"Answer\"], best_score\n",
289
+ "\n",
290
+ "# 🤖 Query OpenAI if no good FAQ match\n",
291
+ "def query_gpt(user_input):\n",
292
+ " try:\n",
293
+ " response = openai.ChatCompletion.create(\n",
294
+ " model=\"gpt-4\", # or use \"gpt-3.5-turbo\"\n",
295
+ " messages=[\n",
296
+ " {\"role\": \"system\", \"content\": \"You are a pediatric pulmonology expert.\"},\n",
297
+ " {\"role\": \"user\", \"content\": user_input},\n",
298
+ " {\"role\": \"assistant\", \"content\": \"Hello\"}\n",
299
+ "\n",
300
+ " ]\n",
301
+ " )\n",
302
+ " return response.choices[0].message[\"content\"]\n",
303
+ " except Exception as e:\n",
304
+ " return f\"⚠️ GPT Error: {str(e)}\"\n",
305
+ "\n",
306
+ "# 💬 Chatbot response logic\n",
307
+ "def chatbot_response(user_input):\n",
308
+ " if not user_input.strip():\n",
309
+ " return \"Please enter a question.\"\n",
310
+ "\n",
311
+ " try:\n",
312
+ " matched_q, matched_a, score = find_best_match(user_input)\n",
313
+ " if score > 0.75:\n",
314
+ " return f\"📚 **Answer from FAQ**:\\n\\n**Q:** {matched_q}\\n**A:** {matched_a}\"\n",
315
+ " else:\n",
316
+ " gpt_answer = query_gpt(user_input)\n",
317
+ " return f\"🤖 **Answer from GPT-4**:\\n\\n{gpt_answer}\"\n",
318
+ " except Exception as e:\n",
319
+ " return f\"❌ Error processing your question: {str(e)}\"\n",
320
+ "\n",
321
+ "# 🌐 Launch Gradio interface\n",
322
+ "gr.Interface(\n",
323
+ " fn=chatbot_response,\n",
324
+ " inputs=gr.Textbox(label=\"Ask a pediatric pulmonology question\"),\n",
325
+ " outputs=gr.Textbox(label=\"Response\", lines=10),\n",
326
+ " title=\"Pediatric Pulmonology Chatbot\",\n",
327
+ " description=\"Answers common non-critical questions about pediatric pulmonology using a mix of FAQ and GPT-4.\"\n",
328
+ ").launch(share=True)"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": 27,
334
+ "id": "823966da-b528-48e2-a81f-927d72f386ed",
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "# Set your OpenAI key\n",
339
+ "openai.api_key = \"sk-...\" # <- Replace this with your actual API key\n",
340
+ "\n",
341
+ "# Load CSV\n",
342
+ "chat = pd.read_csv(\"PedMedQA_final.csv\")"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 29,
348
+ "id": "0e1055dc-28cc-499c-8303-6b922fcd7057",
349
+ "metadata": {},
350
+ "outputs": [
351
+ {
352
+ "data": {
353
+ "text/html": [
354
+ "<div>\n",
355
+ "<style scoped>\n",
356
+ " .dataframe tbody tr th:only-of-type {\n",
357
+ " vertical-align: middle;\n",
358
+ " }\n",
359
+ "\n",
360
+ " .dataframe tbody tr th {\n",
361
+ " vertical-align: top;\n",
362
+ " }\n",
363
+ "\n",
364
+ " .dataframe thead th {\n",
365
+ " text-align: right;\n",
366
+ " }\n",
367
+ "</style>\n",
368
+ "<table border=\"1\" class=\"dataframe\">\n",
369
+ " <thead>\n",
370
+ " <tr style=\"text-align: right;\">\n",
371
+ " <th></th>\n",
372
+ " <th>index</th>\n",
373
+ " <th>meta_info</th>\n",
374
+ " <th>question</th>\n",
375
+ " <th>answer_idx</th>\n",
376
+ " <th>answer</th>\n",
377
+ " <th>options</th>\n",
378
+ " <th>age_years</th>\n",
379
+ " </tr>\n",
380
+ " </thead>\n",
381
+ " <tbody>\n",
382
+ " <tr>\n",
383
+ " <th>0</th>\n",
384
+ " <td>1</td>\n",
385
+ " <td>step2&amp;3</td>\n",
386
+ " <td>A 3-month-old baby died suddenly at night whil...</td>\n",
387
+ " <td>A</td>\n",
388
+ " <td>Placing the infant in a supine position on a f...</td>\n",
389
+ " <td>[{'key': 'A', 'value': 'Placing the infant in ...</td>\n",
390
+ " <td>0.25</td>\n",
391
+ " </tr>\n",
392
+ " <tr>\n",
393
+ " <th>1</th>\n",
394
+ " <td>2</td>\n",
395
+ " <td>step1</td>\n",
396
+ " <td>A mother brings her 3-week-old infant to the p...</td>\n",
397
+ " <td>A</td>\n",
398
+ " <td>Abnormal migration of ventral pancreatic bud</td>\n",
399
+ " <td>[{'key': 'A', 'value': 'Abnormal migration of ...</td>\n",
400
+ " <td>0.06</td>\n",
401
+ " </tr>\n",
402
+ " <tr>\n",
403
+ " <th>2</th>\n",
404
+ " <td>7</td>\n",
405
+ " <td>step1</td>\n",
406
+ " <td>A 3900-g (8.6-lb) male infant is delivered at ...</td>\n",
407
+ " <td>A</td>\n",
408
+ " <td>Gastric fundus in the thorax</td>\n",
409
+ " <td>[{'key': 'A', 'value': 'Gastric fundus in the ...</td>\n",
410
+ " <td>NaN</td>\n",
411
+ " </tr>\n",
412
+ " <tr>\n",
413
+ " <th>3</th>\n",
414
+ " <td>11</td>\n",
415
+ " <td>step2&amp;3</td>\n",
416
+ " <td>A 1-year-old boy presents to the emergency dep...</td>\n",
417
+ " <td>D</td>\n",
418
+ " <td>Blockade of presynaptic acetylcholine release ...</td>\n",
419
+ " <td>[{'key': 'A', 'value': 'Antibodies against pos...</td>\n",
420
+ " <td>1.00</td>\n",
421
+ " </tr>\n",
422
+ " <tr>\n",
423
+ " <th>4</th>\n",
424
+ " <td>12</td>\n",
425
+ " <td>step1</td>\n",
426
+ " <td>A 9-month-old female is brought to the emergen...</td>\n",
427
+ " <td>D</td>\n",
428
+ " <td>Pleiotropy</td>\n",
429
+ " <td>[{'key': 'A', 'value': 'Anticipation'}\\n {'key...</td>\n",
430
+ " <td>0.75</td>\n",
431
+ " </tr>\n",
432
+ " </tbody>\n",
433
+ "</table>\n",
434
+ "</div>"
435
+ ],
436
+ "text/plain": [
437
+ " index meta_info question \\\n",
438
+ "0 1 step2&3 A 3-month-old baby died suddenly at night whil... \n",
439
+ "1 2 step1 A mother brings her 3-week-old infant to the p... \n",
440
+ "2 7 step1 A 3900-g (8.6-lb) male infant is delivered at ... \n",
441
+ "3 11 step2&3 A 1-year-old boy presents to the emergency dep... \n",
442
+ "4 12 step1 A 9-month-old female is brought to the emergen... \n",
443
+ "\n",
444
+ " answer_idx answer \\\n",
445
+ "0 A Placing the infant in a supine position on a f... \n",
446
+ "1 A Abnormal migration of ventral pancreatic bud \n",
447
+ "2 A Gastric fundus in the thorax \n",
448
+ "3 D Blockade of presynaptic acetylcholine release ... \n",
449
+ "4 D Pleiotropy \n",
450
+ "\n",
451
+ " options age_years \n",
452
+ "0 [{'key': 'A', 'value': 'Placing the infant in ... 0.25 \n",
453
+ "1 [{'key': 'A', 'value': 'Abnormal migration of ... 0.06 \n",
454
+ "2 [{'key': 'A', 'value': 'Gastric fundus in the ... NaN \n",
455
+ "3 [{'key': 'A', 'value': 'Antibodies against pos... 1.00 \n",
456
+ "4 [{'key': 'A', 'value': 'Anticipation'}\\n {'key... 0.75 "
457
+ ]
458
+ },
459
+ "execution_count": 29,
460
+ "metadata": {},
461
+ "output_type": "execute_result"
462
+ }
463
+ ],
464
+ "source": [
465
+ "chat.head()"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 31,
471
+ "id": "69bd354e-482c-42e2-ab78-55d8cda2acee",
472
+ "metadata": {},
473
+ "outputs": [
474
+ {
475
+ "data": {
476
+ "text/html": [
477
+ "<div>\n",
478
+ "<style scoped>\n",
479
+ " .dataframe tbody tr th:only-of-type {\n",
480
+ " vertical-align: middle;\n",
481
+ " }\n",
482
+ "\n",
483
+ " .dataframe tbody tr th {\n",
484
+ " vertical-align: top;\n",
485
+ " }\n",
486
+ "\n",
487
+ " .dataframe thead th {\n",
488
+ " text-align: right;\n",
489
+ " }\n",
490
+ "</style>\n",
491
+ "<table border=\"1\" class=\"dataframe\">\n",
492
+ " <thead>\n",
493
+ " <tr style=\"text-align: right;\">\n",
494
+ " <th></th>\n",
495
+ " <th>index</th>\n",
496
+ " <th>age_years</th>\n",
497
+ " </tr>\n",
498
+ " </thead>\n",
499
+ " <tbody>\n",
500
+ " <tr>\n",
501
+ " <th>count</th>\n",
502
+ " <td>2683.000000</td>\n",
503
+ " <td>2383.000000</td>\n",
504
+ " </tr>\n",
505
+ " <tr>\n",
506
+ " <th>mean</th>\n",
507
+ " <td>6266.011927</td>\n",
508
+ " <td>7.152585</td>\n",
509
+ " </tr>\n",
510
+ " <tr>\n",
511
+ " <th>std</th>\n",
512
+ " <td>3657.727022</td>\n",
513
+ " <td>5.722108</td>\n",
514
+ " </tr>\n",
515
+ " <tr>\n",
516
+ " <th>min</th>\n",
517
+ " <td>1.000000</td>\n",
518
+ " <td>0.000000</td>\n",
519
+ " </tr>\n",
520
+ " <tr>\n",
521
+ " <th>25%</th>\n",
522
+ " <td>3064.000000</td>\n",
523
+ " <td>2.000000</td>\n",
524
+ " </tr>\n",
525
+ " <tr>\n",
526
+ " <th>50%</th>\n",
527
+ " <td>6193.000000</td>\n",
528
+ " <td>6.000000</td>\n",
529
+ " </tr>\n",
530
+ " <tr>\n",
531
+ " <th>75%</th>\n",
532
+ " <td>9492.500000</td>\n",
533
+ " <td>12.000000</td>\n",
534
+ " </tr>\n",
535
+ " <tr>\n",
536
+ " <th>max</th>\n",
537
+ " <td>12709.000000</td>\n",
538
+ " <td>35.000000</td>\n",
539
+ " </tr>\n",
540
+ " </tbody>\n",
541
+ "</table>\n",
542
+ "</div>"
543
+ ],
544
+ "text/plain": [
545
+ " index age_years\n",
546
+ "count 2683.000000 2383.000000\n",
547
+ "mean 6266.011927 7.152585\n",
548
+ "std 3657.727022 5.722108\n",
549
+ "min 1.000000 0.000000\n",
550
+ "25% 3064.000000 2.000000\n",
551
+ "50% 6193.000000 6.000000\n",
552
+ "75% 9492.500000 12.000000\n",
553
+ "max 12709.000000 35.000000"
554
+ ]
555
+ },
556
+ "execution_count": 31,
557
+ "metadata": {},
558
+ "output_type": "execute_result"
559
+ }
560
+ ],
561
+ "source": [
562
+ "chat.describe()"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": 33,
568
+ "id": "4b6591e9-7501-43fa-a847-bd9cd922124e",
569
+ "metadata": {},
570
+ "outputs": [
571
+ {
572
+ "data": {
573
+ "text/plain": [
574
+ "index 0\n",
575
+ "meta_info 0\n",
576
+ "question 0\n",
577
+ "answer_idx 0\n",
578
+ "answer 1\n",
579
+ "options 0\n",
580
+ "age_years 300\n",
581
+ "dtype: int64"
582
+ ]
583
+ },
584
+ "execution_count": 33,
585
+ "metadata": {},
586
+ "output_type": "execute_result"
587
+ }
588
+ ],
589
+ "source": [
590
+ "chat.isnull().sum()"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "execution_count": 35,
596
+ "id": "f8e9bdce-80f8-4942-88f6-abaddc1c5d72",
597
+ "metadata": {},
598
+ "outputs": [
599
+ {
600
+ "data": {
601
+ "text/plain": [
602
+ "(2683, 7)"
603
+ ]
604
+ },
605
+ "execution_count": 35,
606
+ "metadata": {},
607
+ "output_type": "execute_result"
608
+ }
609
+ ],
610
+ "source": [
611
+ "chat.shape"
612
+ ]
613
+ },
614
+ {
615
+ "cell_type": "code",
616
+ "execution_count": 37,
617
+ "id": "5fd1e9dd-6748-4b06-a74f-f3827ae16ae5",
618
+ "metadata": {},
619
+ "outputs": [
620
+ {
621
+ "name": "stdout",
622
+ "output_type": "stream",
623
+ "text": [
624
+ "<class 'pandas.core.frame.DataFrame'>\n",
625
+ "RangeIndex: 2683 entries, 0 to 2682\n",
626
+ "Data columns (total 7 columns):\n",
627
+ " # Column Non-Null Count Dtype \n",
628
+ "--- ------ -------------- ----- \n",
629
+ " 0 index 2683 non-null int64 \n",
630
+ " 1 meta_info 2683 non-null object \n",
631
+ " 2 question 2683 non-null object \n",
632
+ " 3 answer_idx 2683 non-null object \n",
633
+ " 4 answer 2682 non-null object \n",
634
+ " 5 options 2683 non-null object \n",
635
+ " 6 age_years 2383 non-null float64\n",
636
+ "dtypes: float64(1), int64(1), object(5)\n",
637
+ "memory usage: 146.9+ KB\n"
638
+ ]
639
+ }
640
+ ],
641
+ "source": [
642
+ "chat.info()"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": 39,
648
+ "id": "82989bf3-abc6-486d-917e-2b78677bed49",
649
+ "metadata": {},
650
+ "outputs": [
651
+ {
652
+ "data": {
653
+ "text/plain": [
654
+ "array(['Placing the infant in a supine position on a firm mattress while sleeping',\n",
655
+ " 'Abnormal migration of ventral pancreatic bud',\n",
656
+ " 'Gastric fundus in the thorax', ..., 'Ixodes scapularis',\n",
657
+ " 'Scalded skin syndrome', 'Apply a simple shoulder sling'],\n",
658
+ " dtype=object)"
659
+ ]
660
+ },
661
+ "execution_count": 39,
662
+ "metadata": {},
663
+ "output_type": "execute_result"
664
+ }
665
+ ],
666
+ "source": [
667
+ "chat[\"answer\"]. unique()"
668
+ ]
669
+ },
670
+ {
671
+ "cell_type": "code",
672
+ "execution_count": 41,
673
+ "id": "21020a56-1b88-4541-9739-354362899149",
674
+ "metadata": {},
675
+ "outputs": [
676
+ {
677
+ "data": {
678
+ "text/plain": [
679
+ "answer\n",
680
+ "Reassurance 16\n",
681
+ "Ventricular septal defect 7\n",
682
+ "Autism spectrum disorder 7\n",
683
+ "Streptococcus pneumoniae 6\n",
684
+ "Patent ductus arteriosus 6\n",
685
+ " ..\n",
686
+ "Adrenal hemorrhage 1\n",
687
+ "C5 and C6 nerve roots 1\n",
688
+ "Viral upper respiratory tract infection 1\n",
689
+ "Failure of the vitelline duct to close 1\n",
690
+ "Apply a simple shoulder sling 1\n",
691
+ "Name: count, Length: 2284, dtype: int64"
692
+ ]
693
+ },
694
+ "execution_count": 41,
695
+ "metadata": {},
696
+ "output_type": "execute_result"
697
+ }
698
+ ],
699
+ "source": [
700
+ "chat[\"answer\"].value_counts()"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "code",
705
+ "execution_count": 43,
706
+ "id": "a3b442d1-33e1-4400-8e96-4d1ea5d9d699",
707
+ "metadata": {},
708
+ "outputs": [
709
+ {
710
+ "name": "stdout",
711
+ "output_type": "stream",
712
+ "text": [
713
+ "0 Placing the infant in a supine position on a f...\n",
714
+ "1 Abnormal migration of ventral pancreatic bud\n",
715
+ "2 Gastric fundus in the thorax\n",
716
+ "3 Blockade of presynaptic acetylcholine release ...\n",
717
+ "4 Pleiotropy\n",
718
+ " ... \n",
719
+ "2678 X-linked recessive\n",
720
+ "2679 Insulin production by the pancreas is insuffic...\n",
721
+ "2680 Ixodes scapularis\n",
722
+ "2681 Scalded skin syndrome\n",
723
+ "2682 Apply a simple shoulder sling\n",
724
+ "Name: answer, Length: 2683, dtype: object\n"
725
+ ]
726
+ }
727
+ ],
728
+ "source": [
729
+ "chat[\"answer\"] = chat[\"answer\"].fillna(\"Reassurance\")\n",
730
+ "print(chat[\"answer\"])"
731
+ ]
732
+ },
733
+ {
734
+ "cell_type": "code",
735
+ "execution_count": 45,
736
+ "id": "74fd9008-f566-4fd9-92b8-d3e5b69dfa33",
737
+ "metadata": {},
738
+ "outputs": [
739
+ {
740
+ "data": {
741
+ "text/plain": [
742
+ "<bound method Series.unique of 0 0.25\n",
743
+ "1 0.06\n",
744
+ "2 NaN\n",
745
+ "3 1.00\n",
746
+ "4 0.75\n",
747
+ " ... \n",
748
+ "2678 3.00\n",
749
+ "2679 16.00\n",
750
+ "2680 14.00\n",
751
+ "2681 0.02\n",
752
+ "2682 15.00\n",
753
+ "Name: age_years, Length: 2683, dtype: float64>"
754
+ ]
755
+ },
756
+ "execution_count": 45,
757
+ "metadata": {},
758
+ "output_type": "execute_result"
759
+ }
760
+ ],
761
+ "source": [
762
+ "chat[\"age_years\"].unique"
763
+ ]
764
+ },
765
+ {
766
+ "cell_type": "code",
767
+ "execution_count": 47,
768
+ "id": "ced90873-67bd-4f2a-aaaa-c901b163ac6b",
769
+ "metadata": {},
770
+ "outputs": [
771
+ {
772
+ "data": {
773
+ "text/plain": [
774
+ "<bound method IndexOpsMixin.value_counts of 0 0.25\n",
775
+ "1 0.06\n",
776
+ "2 NaN\n",
777
+ "3 1.00\n",
778
+ "4 0.75\n",
779
+ " ... \n",
780
+ "2678 3.00\n",
781
+ "2679 16.00\n",
782
+ "2680 14.00\n",
783
+ "2681 0.02\n",
784
+ "2682 15.00\n",
785
+ "Name: age_years, Length: 2683, dtype: float64>"
786
+ ]
787
+ },
788
+ "execution_count": 47,
789
+ "metadata": {},
790
+ "output_type": "execute_result"
791
+ }
792
+ ],
793
+ "source": [
794
+ "chat[\"age_years\"].value_counts"
795
+ ]
796
+ },
797
+ {
798
+ "cell_type": "code",
799
+ "execution_count": 49,
800
+ "id": "3ca55f08-fb7b-4b0b-bd9c-f8d1ccc15618",
801
+ "metadata": {},
802
+ "outputs": [
803
+ {
804
+ "data": {
805
+ "text/html": [
806
+ "<div>\n",
807
+ "<style scoped>\n",
808
+ " .dataframe tbody tr th:only-of-type {\n",
809
+ " vertical-align: middle;\n",
810
+ " }\n",
811
+ "\n",
812
+ " .dataframe tbody tr th {\n",
813
+ " vertical-align: top;\n",
814
+ " }\n",
815
+ "\n",
816
+ " .dataframe thead th {\n",
817
+ " text-align: right;\n",
818
+ " }\n",
819
+ "</style>\n",
820
+ "<table border=\"1\" class=\"dataframe\">\n",
821
+ " <thead>\n",
822
+ " <tr style=\"text-align: right;\">\n",
823
+ " <th></th>\n",
824
+ " <th>index</th>\n",
825
+ " <th>meta_info</th>\n",
826
+ " <th>question</th>\n",
827
+ " <th>answer_idx</th>\n",
828
+ " <th>answer</th>\n",
829
+ " <th>options</th>\n",
830
+ " <th>age_years</th>\n",
831
+ " </tr>\n",
832
+ " </thead>\n",
833
+ " <tbody>\n",
834
+ " <tr>\n",
835
+ " <th>0</th>\n",
836
+ " <td>1</td>\n",
837
+ " <td>step2&amp;3</td>\n",
838
+ " <td>A 3-month-old baby died suddenly at night whil...</td>\n",
839
+ " <td>A</td>\n",
840
+ " <td>Placing the infant in a supine position on a f...</td>\n",
841
+ " <td>[{'key': 'A', 'value': 'Placing the infant in ...</td>\n",
842
+ " <td>0.25</td>\n",
843
+ " </tr>\n",
844
+ " <tr>\n",
845
+ " <th>1</th>\n",
846
+ " <td>2</td>\n",
847
+ " <td>step1</td>\n",
848
+ " <td>A mother brings her 3-week-old infant to the p...</td>\n",
849
+ " <td>A</td>\n",
850
+ " <td>Abnormal migration of ventral pancreatic bud</td>\n",
851
+ " <td>[{'key': 'A', 'value': 'Abnormal migration of ...</td>\n",
852
+ " <td>0.06</td>\n",
853
+ " </tr>\n",
854
+ " <tr>\n",
855
+ " <th>2</th>\n",
856
+ " <td>7</td>\n",
857
+ " <td>step1</td>\n",
858
+ " <td>A 3900-g (8.6-lb) male infant is delivered at ...</td>\n",
859
+ " <td>A</td>\n",
860
+ " <td>Gastric fundus in the thorax</td>\n",
861
+ " <td>[{'key': 'A', 'value': 'Gastric fundus in the ...</td>\n",
862
+ " <td>NaN</td>\n",
863
+ " </tr>\n",
864
+ " <tr>\n",
865
+ " <th>3</th>\n",
866
+ " <td>11</td>\n",
867
+ " <td>step2&amp;3</td>\n",
868
+ " <td>A 1-year-old boy presents to the emergency dep...</td>\n",
869
+ " <td>D</td>\n",
870
+ " <td>Blockade of presynaptic acetylcholine release ...</td>\n",
871
+ " <td>[{'key': 'A', 'value': 'Antibodies against pos...</td>\n",
872
+ " <td>1.00</td>\n",
873
+ " </tr>\n",
874
+ " <tr>\n",
875
+ " <th>4</th>\n",
876
+ " <td>12</td>\n",
877
+ " <td>step1</td>\n",
878
+ " <td>A 9-month-old female is brought to the emergen...</td>\n",
879
+ " <td>D</td>\n",
880
+ " <td>Pleiotropy</td>\n",
881
+ " <td>[{'key': 'A', 'value': 'Anticipation'}\\n {'key...</td>\n",
882
+ " <td>0.75</td>\n",
883
+ " </tr>\n",
884
+ " </tbody>\n",
885
+ "</table>\n",
886
+ "</div>"
887
+ ],
888
+ "text/plain": [
889
+ " index meta_info question \\\n",
890
+ "0 1 step2&3 A 3-month-old baby died suddenly at night whil... \n",
891
+ "1 2 step1 A mother brings her 3-week-old infant to the p... \n",
892
+ "2 7 step1 A 3900-g (8.6-lb) male infant is delivered at ... \n",
893
+ "3 11 step2&3 A 1-year-old boy presents to the emergency dep... \n",
894
+ "4 12 step1 A 9-month-old female is brought to the emergen... \n",
895
+ "\n",
896
+ " answer_idx answer \\\n",
897
+ "0 A Placing the infant in a supine position on a f... \n",
898
+ "1 A Abnormal migration of ventral pancreatic bud \n",
899
+ "2 A Gastric fundus in the thorax \n",
900
+ "3 D Blockade of presynaptic acetylcholine release ... \n",
901
+ "4 D Pleiotropy \n",
902
+ "\n",
903
+ " options age_years \n",
904
+ "0 [{'key': 'A', 'value': 'Placing the infant in ... 0.25 \n",
905
+ "1 [{'key': 'A', 'value': 'Abnormal migration of ... 0.06 \n",
906
+ "2 [{'key': 'A', 'value': 'Gastric fundus in the ... NaN \n",
907
+ "3 [{'key': 'A', 'value': 'Antibodies against pos... 1.00 \n",
908
+ "4 [{'key': 'A', 'value': 'Anticipation'}\\n {'key... 0.75 "
909
+ ]
910
+ },
911
+ "execution_count": 49,
912
+ "metadata": {},
913
+ "output_type": "execute_result"
914
+ }
915
+ ],
916
+ "source": [
917
+ "chat.head()"
918
+ ]
919
+ },
920
+ {
921
+ "cell_type": "code",
922
+ "execution_count": 51,
923
+ "id": "47fa0f95-72e2-4b85-919b-aec5fe5aa5af",
924
+ "metadata": {},
925
+ "outputs": [
926
+ {
927
+ "data": {
928
+ "text/plain": [
929
+ "index int64\n",
930
+ "meta_info object\n",
931
+ "question object\n",
932
+ "answer_idx object\n",
933
+ "answer object\n",
934
+ "options object\n",
935
+ "age_years float64\n",
936
+ "dtype: object"
937
+ ]
938
+ },
939
+ "execution_count": 51,
940
+ "metadata": {},
941
+ "output_type": "execute_result"
942
+ }
943
+ ],
944
+ "source": [
945
+ "chat.dtypes"
946
+ ]
947
+ },
948
+ {
949
+ "cell_type": "code",
950
+ "execution_count": 53,
951
+ "id": "133b6b98-4408-47dc-b6c8-24b6e19ac2f9",
952
+ "metadata": {},
953
+ "outputs": [],
954
+ "source": [
955
+ "chat.dropna(subset=[\"question\", \"answer\"], inplace=True)\n",
956
+ "chat.drop_duplicates(subset=[\"question\"], inplace=True)"
957
+ ]
958
+ },
959
+ {
960
+ "cell_type": "code",
961
+ "execution_count": 55,
962
+ "id": "934ff55c-2ff4-4761-b401-d19749402d98",
963
+ "metadata": {},
964
+ "outputs": [
965
+ {
966
+ "data": {
967
+ "text/plain": [
968
+ "index 0\n",
969
+ "meta_info 0\n",
970
+ "question 0\n",
971
+ "answer_idx 0\n",
972
+ "answer 0\n",
973
+ "options 0\n",
974
+ "age_years 300\n",
975
+ "dtype: int64"
976
+ ]
977
+ },
978
+ "execution_count": 55,
979
+ "metadata": {},
980
+ "output_type": "execute_result"
981
+ }
982
+ ],
983
+ "source": [
984
+ "chat.isnull().sum()"
985
+ ]
986
+ },
987
+ {
988
+ "cell_type": "code",
989
+ "execution_count": 57,
990
+ "id": "4cf16cdd-6457-4edb-87d5-c307f850450a",
991
+ "metadata": {},
992
+ "outputs": [],
993
+ "source": [
994
+ "#oe = OrdinalEncoder()"
995
+ ]
996
+ },
997
+ {
998
+ "cell_type": "code",
999
+ "execution_count": 59,
1000
+ "id": "c4328f7d-a148-40c8-8207-66fa6b67d8b3",
1001
+ "metadata": {},
1002
+ "outputs": [
1003
+ {
1004
+ "data": {
1005
+ "text/plain": [
1006
+ "0 1\n",
1007
+ "1 2\n",
1008
+ "2 7\n",
1009
+ "Name: index, dtype: int64"
1010
+ ]
1011
+ },
1012
+ "execution_count": 59,
1013
+ "metadata": {},
1014
+ "output_type": "execute_result"
1015
+ }
1016
+ ],
1017
+ "source": [
1018
+ "#chat[\"index\"] = oe.fit_transform(chat[[\"index\"]])\n",
1019
+ "chat[\"index\"].head(3)"
1020
+ ]
1021
+ },
1022
+ {
1023
+ "cell_type": "code",
1024
+ "execution_count": 61,
1025
+ "id": "222839fa-3070-41fa-842f-18c6998704cb",
1026
+ "metadata": {},
1027
+ "outputs": [
1028
+ {
1029
+ "data": {
1030
+ "text/plain": [
1031
+ "0 step2&3\n",
1032
+ "1 step1\n",
1033
+ "2 step1\n",
1034
+ "Name: meta_info, dtype: object"
1035
+ ]
1036
+ },
1037
+ "execution_count": 61,
1038
+ "metadata": {},
1039
+ "output_type": "execute_result"
1040
+ }
1041
+ ],
1042
+ "source": [
1043
+ "#chat[\"meta_info\"] = oe.fit_transform(chat[[\"meta_info\"]])\n",
1044
+ "chat[\"meta_info\"].head(3)"
1045
+ ]
1046
+ },
1047
+ {
1048
+ "cell_type": "code",
1049
+ "execution_count": 63,
1050
+ "id": "acfbeea1-92a5-4558-b82b-6511c0b8de47",
1051
+ "metadata": {},
1052
+ "outputs": [
1053
+ {
1054
+ "data": {
1055
+ "text/plain": [
1056
+ "0 A 3-month-old baby died suddenly at night whil...\n",
1057
+ "1 A mother brings her 3-week-old infant to the p...\n",
1058
+ "2 A 3900-g (8.6-lb) male infant is delivered at ...\n",
1059
+ "Name: question, dtype: object"
1060
+ ]
1061
+ },
1062
+ "execution_count": 63,
1063
+ "metadata": {},
1064
+ "output_type": "execute_result"
1065
+ }
1066
+ ],
1067
+ "source": [
1068
+ "#chat[\"question\"] = oe.fit_transform(chat[[\"question\"]])\n",
1069
+ "chat[\"question\"].head(3)"
1070
+ ]
1071
+ },
1072
+ {
1073
+ "cell_type": "code",
1074
+ "execution_count": 65,
1075
+ "id": "8346763f-045b-4ade-bcaf-fdfe35555a2f",
1076
+ "metadata": {},
1077
+ "outputs": [
1078
+ {
1079
+ "data": {
1080
+ "text/plain": [
1081
+ "0 A\n",
1082
+ "1 A\n",
1083
+ "2 A\n",
1084
+ "Name: answer_idx, dtype: object"
1085
+ ]
1086
+ },
1087
+ "execution_count": 65,
1088
+ "metadata": {},
1089
+ "output_type": "execute_result"
1090
+ }
1091
+ ],
1092
+ "source": [
1093
+ "#chat[\"answer_idx\"] = oe.fit_transform(chat[[\"answer_idx\"]])\n",
1094
+ "chat[\"answer_idx\"].head(3)"
1095
+ ]
1096
+ },
1097
+ {
1098
+ "cell_type": "code",
1099
+ "execution_count": 67,
1100
+ "id": "8d054a13-3dfc-4710-bf6c-7d4e62ec5d5c",
1101
+ "metadata": {},
1102
+ "outputs": [
1103
+ {
1104
+ "data": {
1105
+ "text/plain": [
1106
+ "0 Placing the infant in a supine position on a f...\n",
1107
+ "1 Abnormal migration of ventral pancreatic bud\n",
1108
+ "2 Gastric fundus in the thorax\n",
1109
+ "Name: answer, dtype: object"
1110
+ ]
1111
+ },
1112
+ "execution_count": 67,
1113
+ "metadata": {},
1114
+ "output_type": "execute_result"
1115
+ }
1116
+ ],
1117
+ "source": [
1118
+ "#chat[\"answer\"] = oe.fit_transform(chat[[\"answer\"]])\n",
1119
+ "chat[\"answer\"].head(3)"
1120
+ ]
1121
+ },
1122
+ {
1123
+ "cell_type": "code",
1124
+ "execution_count": 69,
1125
+ "id": "10295266-f340-4a7f-81c2-a05ad219285a",
1126
+ "metadata": {},
1127
+ "outputs": [
1128
+ {
1129
+ "data": {
1130
+ "text/plain": [
1131
+ "0 [{'key': 'A', 'value': 'Placing the infant in ...\n",
1132
+ "1 [{'key': 'A', 'value': 'Abnormal migration of ...\n",
1133
+ "2 [{'key': 'A', 'value': 'Gastric fundus in the ...\n",
1134
+ "Name: options, dtype: object"
1135
+ ]
1136
+ },
1137
+ "execution_count": 69,
1138
+ "metadata": {},
1139
+ "output_type": "execute_result"
1140
+ }
1141
+ ],
1142
+ "source": [
1143
+ "#chat[\"options\"] = oe.fit_transform(chat[[\"options\"]])\n",
1144
+ "chat[\"options\"].head(3)"
1145
+ ]
1146
+ },
1147
+ {
1148
+ "cell_type": "code",
1149
+ "execution_count": 71,
1150
+ "id": "7c284442-9b72-432a-840e-5543e6c8adf4",
1151
+ "metadata": {},
1152
+ "outputs": [
1153
+ {
1154
+ "data": {
1155
+ "text/plain": [
1156
+ "(2683, 7)"
1157
+ ]
1158
+ },
1159
+ "execution_count": 71,
1160
+ "metadata": {},
1161
+ "output_type": "execute_result"
1162
+ }
1163
+ ],
1164
+ "source": [
1165
+ "chat.shape"
1166
+ ]
1167
+ },
1168
+ {
1169
+ "cell_type": "code",
1170
+ "execution_count": 73,
1171
+ "id": "46308806-7545-480a-bf57-7434babe4efc",
1172
+ "metadata": {},
1173
+ "outputs": [
1174
+ {
1175
+ "data": {
1176
+ "text/plain": [
1177
+ "Index(['index', 'meta_info', 'question', 'answer_idx', 'answer', 'options',\n",
1178
+ " 'age_years'],\n",
1179
+ " dtype='object')"
1180
+ ]
1181
+ },
1182
+ "execution_count": 73,
1183
+ "metadata": {},
1184
+ "output_type": "execute_result"
1185
+ }
1186
+ ],
1187
+ "source": [
1188
+ "chat.columns"
1189
+ ]
1190
+ },
1191
+ {
1192
+ "cell_type": "code",
1193
+ "execution_count": 131,
1194
+ "id": "7610d011-cdc9-4416-ade5-e93756b820ee",
1195
+ "metadata": {},
1196
+ "outputs": [],
1197
+ "source": [
1198
+ "from sklearn.linear_model import LassoCV\n",
1199
+ "from sklearn.feature_selection import SelectFromModel"
1200
+ ]
1201
+ },
1202
+ {
1203
+ "cell_type": "code",
1204
+ "execution_count": 133,
1205
+ "id": "5da7ae38-db35-4f5d-ab18-5e0f25e9fd02",
1206
+ "metadata": {},
1207
+ "outputs": [],
1208
+ "source": [
1209
+ "#clf = LassoCV.fit(X_train, Y_trarin)\n",
1210
+ "#importance = np.abs(clf.coef)\n",
1211
+ "#print(importance)"
1212
+ ]
1213
+ },
1214
+ {
1215
+ "cell_type": "code",
1216
+ "execution_count": 135,
1217
+ "id": "99d49912-ba82-4670-8420-e5188e6ead27",
1218
+ "metadata": {},
1219
+ "outputs": [
1220
+ {
1221
+ "name": "stdin",
1222
+ "output_type": "stream",
1223
+ "text": [
1224
+ "You can ask me any pediatric pulmonology related question (or type 'exit'): exit\n"
1225
+ ]
1226
+ }
1227
+ ],
1228
+ "source": [
1229
+ "while True:\n",
1230
+ " user_input = input(\"You can ask me any pediatric pulmonology related question (or type 'exit'): \")\n",
1231
+ "\n",
1232
+ " if user_input.lower() == \"exit\":\n",
1233
+ " break\n",
1234
+ "\n",
1235
+ " response = chatbot_response(user_input)\n",
1236
+ " print(response)"
1237
+ ]
1238
+ },
1239
+ {
1240
+ "cell_type": "code",
1241
+ "execution_count": 147,
1242
+ "id": "4057a702-de90-4697-b080-3cea436a290e",
1243
+ "metadata": {},
1244
+ "outputs": [],
1245
+ "source": [
1246
+ "#response = chatbot_response(ui)\n",
1247
+ "#print(response)\n",
1248
+ "chat.dropna(subset=[\"question\", \"answer\"], inplace=True)"
1249
+ ]
1250
+ },
1251
+ {
1252
+ "cell_type": "code",
1253
+ "execution_count": 149,
1254
+ "id": "8ea73391-d8ba-4e24-ba44-b0b93321ef2c",
1255
+ "metadata": {},
1256
+ "outputs": [],
1257
+ "source": [
1258
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
1259
+ "\n",
1260
+ "# Vectorize the questions using TF-IDF\n",
1261
+ "# ✅ 1. Fit and transform your dataset questions\n",
1262
+ "vector1 = TfidfVectorizer()\n",
1263
+ "qvs = vector1.fit_transform(chat[\"question\"]).toarray()\n",
1264
+ "\n",
1265
+ "# ✅ 2. Later, transform user input using the same vectorizer\n",
1266
+ "user_vec = vector1.transform([user_input]).toarray()"
1267
+ ]
1268
+ },
1269
+ {
1270
+ "cell_type": "code",
1271
+ "execution_count": 155,
1272
+ "id": "60701e12-49cc-4bef-8865-65dd5ebb3ae6",
1273
+ "metadata": {},
1274
+ "outputs": [
1275
+ {
1276
+ "ename": "SyntaxError",
1277
+ "evalue": "invalid syntax (1206184978.py, line 29)",
1278
+ "output_type": "error",
1279
+ "traceback": [
1280
+ "\u001b[1;36m Cell \u001b[1;32mIn[155], line 29\u001b[1;36m\u001b[0m\n\u001b[1;33m except Exception as e:\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
1281
+ ]
1282
+ }
1283
+ ],
1284
+ "source": [
1285
+ "# 🔌 Connect to OpenAI\n",
1286
+ "openai.api_key = \"your-openai-api-key\" # Replace with your real key\n",
1287
+ "\n",
1288
+ "# 📄 Step 1: Load your dataset\n",
1289
+ "df.dropna(subset=[\"Question\", \"Answer\"], inplace=True)\n",
1290
+ "\n",
1291
+ "# 🧠 Step 2: Vectorize dataset questions\n",
1292
+ "#vectorizer = TfidfVectorizer()\n",
1293
+ "#question_vectors = vectorizer.fit_transform(df[\"Question\"]).toarray()\n",
1294
+ "\n",
1295
+ "# 🔍 Step 3: Find most similar question\n",
1296
+ "def find_best_match(user_input):  # -> (question, answer, cosine score) of the closest row in chat\n",
1296
+ "    user_vec = vector1.transform([user_input]).toarray()\n",
1297
+ "    similarities = cosine_similarity(user_vec, qvs)\n",
1298
+ "    best_idx = np.argmax(similarities[0])\n",
1299
+ "    best_score = similarities[0][best_idx]  # was indexed with the undefined name answer_idx\n",
1300
+ "    return chat.iloc[best_idx][\"question\"], chat.iloc[best_idx][\"answer\"], best_score  # qvs rows align with chat, not df\n",
1302
+ "\n",
1303
+ "# 🤖 Step 4: Fallback to GPT-4 if no good match\n",
1304
+ "def query_gpt(user_input):  # returns GPT-4's reply text, or an error string if the API call fails\n",
1304
+ "    try:\n",
1305
+ "        response = openai.ChatCompletion.create(\n",
1306
+ "            model=\"gpt-4\",\n",
1307
+ "            messages=[\n",
1308
+ "                {\"role\": \"system\", \"content\": \"You are a pediatric pulmonology expert.\"},\n",
1309
+ "                {\"role\": \"user\", \"content\": user_input}\n",
1310
+ "            ])\n",
1311
+ "        return response.choices[0].message[\"content\"]\n",
1312
+ "    except Exception as e:\n",
1313
+ "        return f\"⚠️ GPT Error: {e}\"\n",
1315
+ "\n",
1316
+ "# 💬 Step 5: Define chatbot logic\n",
1317
+ "def chatbot_response(user_input):\n",
1318
+ " matched_q, matched_a, score = find_best_match(user_input)\n",
1319
+ " if score > 0.75:\n",
1320
+ " return f\"📚 Answer from FAQ:\\nQ: {matched_q}\\nA: {matched_a}\"\n",
1321
+ " else:\n",
1322
+ " return f\"🤖 Answer from GPT-4:\\n{query_gpt(user_input)}\"\n",
1323
+ "\n",
1324
+ "# 🌐 Step 6: Launch Gradio interface\n",
1325
+ "gr.Interface(\n",
1326
+ " fn=chatbot_response,\n",
1327
+ " inputs=gr.Textbox(label=\"Ask any pediatric pulmonology related question\"),\n",
1328
+ " outputs=gr.Textbox(label=\"Response\"),\n",
1329
+ " title=\"Royalty Medic_bot\",\n",
1330
+ " description=\"Get non-critical answers to common pediatric respiratory health questions.\"\n",
1331
+ ").launch()\n"
1332
+ ]
1333
+ },
1334
+ {
1335
+ "cell_type": "code",
1336
+ "execution_count": null,
1337
+ "id": "572732aa-1b8b-4202-97a3-0d4ffd272f82",
1338
+ "metadata": {},
1339
+ "outputs": [],
1340
+ "source": [
1341
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
1342
+ "import numpy as np\n",
1343
+ "\n",
1344
+ "def find_best_match(user_input):  # -> (question, answer, cosine score) of the closest row in chat\n",
1344
+ "    input_vec = vector1.transform([user_input]).toarray()\n",
1345
+ "    sims = cosine_similarity(input_vec, qvs)\n",
1346
+ "    idx = np.argmax(sims[0])\n",
1347
+ "    score = sims[0][idx]  # was indexed with the undefined name answer_idx\n",
1348
+ "    return chat.iloc[idx][\"question\"], chat.iloc[idx][\"answer\"], score"
1350
+ ]
1351
+ },
1352
+ {
1353
+ "cell_type": "code",
1354
+ "execution_count": 71,
1355
+ "id": "ed94c25c-7951-4cdb-bead-c169d3e0c1a4",
1356
+ "metadata": {},
1357
+ "outputs": [
1358
+ {
1359
+ "name": "stdin",
1360
+ "output_type": "stream",
1361
+ "text": [
1362
+ "🧒 Ask a pediatric pulmonology question (or type 'exit'): exit\n"
1363
+ ]
1364
+ },
1365
+ {
1366
+ "name": "stdout",
1367
+ "output_type": "stream",
1368
+ "text": [
1369
+ "👋 Goodbye!\n"
1370
+ ]
1371
+ }
1372
+ ],
1373
+ "source": [
1374
+ "while True:\n",
1375
+ " user_input = input(\"🧒 Ask a pediatric pulmonology question (or type 'exit'): \")\n",
1376
+ " if user_input.lower() == \"exit\":\n",
1377
+ " print(\"👋 Goodbye!\")\n",
1378
+ " break\n",
1379
+ " print(chatbot_response(user_input))"
1380
+ ]
1381
+ },
1382
+ {
1383
+ "cell_type": "code",
1384
+ "execution_count": 73,
1385
+ "id": "8f9aa311-3e70-47c5-b103-db71d1d65ac3",
1386
+ "metadata": {},
1387
+ "outputs": [
1388
+ {
1389
+ "name": "stdout",
1390
+ "output_type": "stream",
1391
+ "text": [
1392
+ "* Running on local URL: http://127.0.0.1:7860\n",
1393
+ "* To create a public link, set `share=True` in `launch()`.\n"
1394
+ ]
1395
+ },
1396
+ {
1397
+ "data": {
1398
+ "text/html": [
1399
+ "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
1400
+ ],
1401
+ "text/plain": [
1402
+ "<IPython.core.display.HTML object>"
1403
+ ]
1404
+ },
1405
+ "metadata": {},
1406
+ "output_type": "display_data"
1407
+ },
1408
+ {
1409
+ "data": {
1410
+ "text/plain": []
1411
+ },
1412
+ "execution_count": 73,
1413
+ "metadata": {},
1414
+ "output_type": "execute_result"
1415
+ }
1416
+ ],
1417
+ "source": [
1418
+ "import gradio as gr\n",
1419
+ "\n",
1420
+ "def chatbot_gradio_interface(user_input):\n",
1421
+ " return chatbot_response(user_input)\n",
1422
+ "\n",
1423
+ "gr.Interface(fn=chatbot_gradio_interface,\n",
1424
+ " inputs=\"text\",\n",
1425
+ " outputs=\"text\",\n",
1426
+ " title=\"Pediatric Pulmonology Chatbot\",\n",
1427
+ " description=\"Ask any question related to pediatric lung health.\").launch()"
1428
+ ]
1429
+ },
1430
+ {
1431
+ "cell_type": "code",
1432
+ "execution_count": null,
1433
+ "id": "b2436f4d-bc62-4f5b-8559-0ac2f1449912",
1434
+ "metadata": {},
1435
+ "outputs": [],
1436
+ "source": []
1437
+ },
1438
+ {
1439
+ "cell_type": "code",
1440
+ "execution_count": null,
1441
+ "id": "07c643f3-72ed-46c1-bfcb-d7da0b78337e",
1442
+ "metadata": {},
1443
+ "outputs": [],
1444
+ "source": []
1445
+ },
1446
+ {
1447
+ "cell_type": "code",
1448
+ "execution_count": null,
1449
+ "id": "b63ca794-008f-491f-a317-999755b9a964",
1450
+ "metadata": {},
1451
+ "outputs": [],
1452
+ "source": [
1453
+ "\n",
1454
+ "# Build FAISS index for similarity search\n",
1455
+ "index = faiss.IndexFlatL2(question_vectors.shape[1])\n",
1456
+ "index.add(np.array(question_vectors))\n",
1457
+ "\n",
1458
+ "# Function to find the closest question\n",
1459
+ "def find_most_similar_question(user_question, top_k=1):\n",
1460
+ " user_vec = vectorizer.transform([user_question]).toarray()\n",
1461
+ " D, I = index.search(user_vec, top_k)\n",
1462
+ " return df.iloc[I[0][0]][\"Question\"], df.iloc[I[0][0]][\"Answer\"]\n",
1463
+ "\n",
1464
+ "# Function to query a language model\n",
1465
+ "def ask_openai(question, model=\"gpt-4\"):\n",
1466
+ " try:\n",
1467
+ " response = openai.ChatCompletion.create(\n",
1468
+ " model=model,\n",
1469
+ " messages=[\n",
1470
+ " {\"role\": \"system\", \"content\": \"You are a pediatric pulmonology expert.\"},\n",
1471
+ " {\"role\": \"user\", \"content\": question},\n",
1472
+ " ],\n",
1473
+ " temperature=0.3,\n",
1474
+ " )\n",
1475
+ " return response.choices[0].message[\"content\"]\n",
1476
+ " except Exception as e:\n",
1477
+ " print(f\"Error with {model}: {e}\")\n",
1478
+ " return None\n",
1479
+ "\n",
1480
+ "# Main chatbot function\n",
1481
+ "def pediatric_pulmonology_chatbot(user_input):\n",
1482
+ " matched_question, matched_answer = find_most_similar_question(user_input)\n",
1483
+ "\n",
1484
+ " similarity = cosine_similarity(\n",
1485
+ " vectorizer.transform([user_input]), vectorizer.transform([matched_question])\n",
1486
+ " )[0][0]\n",
1487
+ "\n",
1488
+ " if similarity > 0.7:\n",
1489
+ " return f\"(From Knowledge Base)\\nQ: {matched_question}\\nA: {matched_answer}\"\n",
1490
+ " else:\n",
1491
+ " # Try GPT-4 first\n",
1492
+ " reply = ask_openai(user_input, model=\"gpt-4\")\n",
1493
+ " if reply:\n",
1494
+ " return f\"(From GPT-4)\\n{reply}\"\n",
1495
+ " else:\n",
1496
+ " # Fallback to GPT-3.5\n",
1497
+ " reply = ask_openai(user_input, model=\"gpt-3.5-turbo\")\n",
1498
+ " if reply:\n",
1499
+ " return f\"(From GPT-3.5)\\n{reply}\"\n",
1500
+ " else:\n",
1501
+ " return \"Sorry, I couldn't find an answer to that.\"\n",
1502
+ "\n",
1503
+ "# 🔁 Example interaction\n",
1504
+ "while True:\n",
1505
+ " user_input = input(\"\\n👶 Ask a pediatric pulmonology question (or type 'exit'): \")\n",
1506
+ " if user_input.lower() == \"exit\":\n",
1507
+ " break\n",
1508
+ " print(pediatric_pulmonology_chatbot(user_input))\n"
1509
+ ]
1510
+ }
1511
+ ],
1512
+ "metadata": {
1513
+ "kernelspec": {
1514
+ "display_name": "Python 3 (ipykernel)",
1515
+ "language": "python",
1516
+ "name": "python3"
1517
+ },
1518
+ "language_info": {
1519
+ "codemirror_mode": {
1520
+ "name": "ipython",
1521
+ "version": 3
1522
+ },
1523
+ "file_extension": ".py",
1524
+ "mimetype": "text/x-python",
1525
+ "name": "python",
1526
+ "nbconvert_exporter": "python",
1527
+ "pygments_lexer": "ipython3",
1528
+ "version": "3.12.7"
1529
+ }
1530
+ },
1531
+ "nbformat": 4,
1532
+ "nbformat_minor": 5
1533
+ }