Sazzz02 committed on
Commit
0b792da
Β·
verified Β·
1 Parent(s): 777b714

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +900 -293
app.py CHANGED
@@ -1,28 +1,7 @@
1
- """
2
- ╔══════════════════════════════════════════════════════════════════╗
3
- β•‘ Cross-Medical-System Drug Recommendation Engine β•‘
4
- β•‘ Master's Thesis Project β€” Hugging Face Gradio App β•‘
5
- β•‘ Medical Systems: Allopathic | Ayurvedic | Unani | β•‘
6
- β•‘ Homeopathic | Herbal β•‘
7
- β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
8
-
9
- HOW TO DEPLOY ON HUGGING FACE SPACES:
10
- 1. Create a new Space β†’ SDK: Gradio
11
- 2. Upload: app.py, requirements.txt, and the entire models/ folder
12
- 3. The Space will auto-install requirements and launch
13
-
14
- FOLDER STRUCTURE on HF Space:
15
- app.py
16
- requirements.txt
17
- models/
18
- tfidf_vectorizer.pkl
19
- tfidf_matrix.pkl
20
- svd_reducer.pkl
21
- kmeans_model.pkl
22
- drug_database.csv
23
- model_metadata.json
24
- """
25
 
 
 
 
26
  import gradio as gr
27
  import pandas as pd
28
  import numpy as np
@@ -31,22 +10,147 @@ import json
31
  import os
32
  import re
33
  import warnings
 
 
 
 
34
 
35
  warnings.filterwarnings("ignore")
36
 
37
- # ─── Load Models ────────────────────────────────────────────────────────────
38
- MODEL_DIR = os.path.join(os.path.dirname(__file__), "models")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
 
 
40
 
41
  def load_models():
42
- print("Loading models from PKL files...")
43
- vectorizer = joblib.load(os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl"))
44
- tfidf_matrix = joblib.load(os.path.join(MODEL_DIR, "tfidf_matrix.pkl"))
45
- drug_db = pd.read_csv(os.path.join(MODEL_DIR, "drug_database.csv"))
46
  with open(os.path.join(MODEL_DIR, "model_metadata.json")) as f:
47
- metadata = json.load(f)
48
- print(f"βœ… Loaded {len(drug_db):,} drugs | {tfidf_matrix.shape[1]} features")
49
- return vectorizer, tfidf_matrix, drug_db, metadata
50
 
51
 
52
  try:
@@ -54,338 +158,841 @@ try:
54
  vectorizer, tfidf_matrix, drug_db, metadata = load_models()
55
  MEDICAL_SYSTEMS = ["All Systems"] + sorted(drug_db["medical_system"].unique().tolist())
56
  MODEL_LOADED = True
57
- except Exception as e:
58
- print(f"Model load error: {e}")
59
  MODEL_LOADED = False
60
  MEDICAL_SYSTEMS = ["All Systems"]
 
 
61
 
62
 
63
- # ─── Core Recommendation Function ───────────────────────────────────────────
 
 
64
 
65
- def recommend_drugs(query: str, medical_system: str, top_n: int, min_score: float):
66
- """
67
- Core recommendation engine using TF-IDF + Cosine Similarity.
68
- Returns a formatted DataFrame of recommendations.
69
- """
70
- if not MODEL_LOADED:
71
- return pd.DataFrame({"Error": ["Models not loaded. Check /models folder."]}), "❌ Models not loaded"
72
 
73
- if not query or not query.strip():
74
- return pd.DataFrame({"Info": ["Please enter a drug name or query."]}), "⚠️ Empty query"
75
 
76
- # Clean query
77
- query_clean = re.sub(r"[^a-z0-9\s\+\-\.]", " ", query.lower().strip())
78
- query_clean = re.sub(r"\s+", " ", query_clean).strip()
 
 
79
 
80
- # Vectorize
81
- q_vec = vectorizer.transform([query_clean])
82
- sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
83
-
84
- # Apply system filter
85
- if medical_system and medical_system != "All Systems":
86
- mask = drug_db["medical_system"] == medical_system
87
  sims_work = sims.copy()
88
- sims_work[~mask] = 0
89
  else:
90
  sims_work = sims
91
 
92
- # Get top indices
93
- top_idx = sims_work.argsort()[-(top_n * 3):][::-1]
94
- top_idx = [i for i in top_idx if sims[i] >= min_score][:top_n]
95
 
96
- if not top_idx:
97
- return (
98
- pd.DataFrame({"Result": [f"No results found above similarity threshold {min_score}."
99
- f" Try lowering threshold or broader query."]}),
100
- f"⚠️ No results for '{query}'"
 
 
 
 
 
 
 
 
 
 
 
101
  )
102
 
103
- results = drug_db.iloc[top_idx][[
104
- "brand_name", "generic_name", "dosage_form", "strength",
105
- "medical_system", "manufacturer"
106
  ]].copy()
107
- results["similarity_score"] = sims[top_idx].round(4)
108
- results = results.sort_values("similarity_score", ascending=False).reset_index(drop=True)
109
- results.index += 1
110
- results.index.name = "Rank"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
- # Rename columns for display
113
- results.columns = ["Brand Name", "Generic Name", "Dosage Form",
114
- "Strength", "Medical System", "Manufacturer", "Score"]
115
 
116
- n_systems = results["Medical System"].nunique()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  summary = (
118
- f"βœ… Found **{len(results)}** drugs"
119
- f"{' in ' + medical_system if medical_system != 'All Systems' else ' across ' + str(n_systems) + ' medical systems'}"
120
- f" for query: **'{query}'**"
121
  )
122
- return results, summary
123
 
124
 
125
- def cross_system_compare(query: str, top_per_system: int):
126
- """
127
- Return best N results from EACH medical system β€” the core thesis contribution.
128
- """
129
- if not MODEL_LOADED:
130
- return pd.DataFrame({"Error": ["Models not loaded."]}), "❌ Models not loaded"
131
- if not query or not query.strip():
132
- return pd.DataFrame({"Info": ["Please enter a query."]}), "⚠️ Empty query"
133
-
134
- systems = [s for s in drug_db["medical_system"].unique()]
135
- all_results = []
136
-
137
- query_clean = re.sub(r"[^a-z0-9\s\+\-\.]", " ", query.lower().strip())
138
- q_vec = vectorizer.transform([query_clean])
139
- sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
140
-
141
- for system in systems:
142
- mask = drug_db["medical_system"] == system
143
- sims_sys = sims.copy()
144
- sims_sys[~mask] = 0
145
- top_idx = sims_sys.argsort()[-top_per_system:][::-1]
146
- top_idx = [i for i in top_idx if sims[i] > 0.01][:top_per_system]
147
- if top_idx:
148
- sub = drug_db.iloc[top_idx][["brand_name", "generic_name",
149
- "dosage_form", "strength",
150
- "medical_system", "manufacturer"]].copy()
151
- sub["similarity_score"] = sims[top_idx].round(4)
152
- all_results.append(sub)
153
-
154
- if not all_results:
155
- return pd.DataFrame({"Result": ["No cross-system results found."]}), "No results"
156
-
157
- combined = pd.concat(all_results, ignore_index=True)
158
- combined = combined.sort_values(["medical_system", "similarity_score"], ascending=[True, False])
159
- combined.index = range(1, len(combined) + 1)
160
- combined.index.name = "Rank"
161
- combined.columns = ["Brand Name", "Generic Name", "Dosage Form",
162
- "Strength", "Medical System", "Manufacturer", "Score"]
163
-
164
- summary = f"βœ… Cross-system comparison for **'{query}'** β€” {len(combined)} drugs across {len(systems)} systems"
165
- return combined, summary
166
-
167
-
168
- def get_stats():
169
- """Return dataset statistics as markdown."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  if not MODEL_LOADED:
171
  return "Models not loaded."
172
- sys_dist = drug_db["medical_system"].value_counts()
173
- dosage_dist = drug_db["dosage_form"].value_counts().head(8)
174
-
175
- stats_md = f"""
176
- ## πŸ“Š Dataset Statistics
177
 
178
  | Metric | Value |
179
  |--------|-------|
180
- | Total Drugs | {len(drug_db):,} |
181
- | Medical Systems | {drug_db['medical_system'].nunique()} |
182
- | Unique Manufacturers | {drug_db['manufacturer'].nunique():,} |
183
- | Unique Brand Names | {drug_db['brand_name'].nunique():,} |
184
- | TF-IDF Features | {metadata.get('n_features', 10000):,} |
185
- | Silhouette Score | {metadata.get('silhouette_score', 'N/A')} |
186
-
187
- ### πŸ₯ Medical System Distribution
 
188
  """
189
- for sys, cnt in sys_dist.items():
190
- pct = cnt / len(drug_db) * 100
191
- bar = "β–ˆ" * int(pct / 2)
192
- stats_md += f"\n- **{sys}**: {cnt:,} ({pct:.1f}%) `{bar}`"
 
 
 
 
193
 
194
- stats_md += "\n\n### πŸ’Š Top Dosage Forms\n"
195
- for dosage, cnt in dosage_dist.items():
196
- stats_md += f"\n- {dosage}: {cnt:,}"
197
 
198
- return stats_md
 
 
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- # ─── Gradio UI ──────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
- EXAMPLE_QUERIES = [
204
- ["Azithromycin 500mg", "All Systems", 10, 0.05],
205
- ["paracetamol fever tablet", "Ayurvedic", 8, 0.05],
206
- ["omeprazole capsule 20mg", "Allopathic", 10, 0.1],
207
- ["blood pressure tablet", "Homeopathic", 6, 0.05],
208
- ["herbal digestive liquid", "Herbal", 5, 0.05],
209
- ["antibiotic suspension", "Unani", 6, 0.05],
210
- ]
211
 
212
- CROSS_SYSTEM_EXAMPLES = [
213
- ["Azithromycin antibiotic tablet", 3],
214
- ["fever pain relief", 2],
215
- ["digestive stomach", 2],
216
- ["blood pressure hypertension", 2],
217
- ]
218
 
219
- CSS = """
220
- .gradio-container { max-width: 1100px; margin: auto; font-family: 'Segoe UI', sans-serif; }
221
- .header-box { background: linear-gradient(135deg, #1a237e, #283593);
222
- color: white; padding: 24px; border-radius: 12px; margin-bottom: 16px; }
223
- .stat-box { background: #f8f9fa; border-radius: 8px; padding: 12px; }
224
- footer { display: none !important; }
225
- """
226
 
227
- with gr.Blocks(css=CSS, title="πŸ’Š Drug Recommendation System") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  gr.HTML("""
230
- <div class="header-box">
231
- <h1 style="margin:0; font-size:1.8em;">πŸ’Š Cross-Medical-System Drug Recommender</h1>
232
- <p style="margin:8px 0 0; opacity:0.85; font-size:1.05em;">
233
- Master's Thesis β€” Intelligent Drug Formulation Recommendation System<br>
234
- <span style="font-size:0.9em;">Allopathic β€’ Ayurvedic β€’ Unani β€’ Homeopathic β€’ Herbal</span>
235
- </p>
236
  </div>
237
  """)
238
 
239
  with gr.Tabs():
240
 
241
- # ── Tab 1: Single System Recommendation ─────────────────────────
242
- with gr.TabItem("πŸ” Drug Recommender"):
243
- gr.Markdown("### Find drugs by name, generic compound, or description")
244
-
245
- with gr.Row():
246
- with gr.Column(scale=3):
247
- query_input = gr.Textbox(
248
- label="πŸ”Ž Search Query",
249
- placeholder="e.g. Azithromycin 500mg tablet, fever pain, omeprazole capsule...",
250
- lines=1
251
- )
252
- with gr.Column(scale=2):
253
- system_filter = gr.Dropdown(
254
- choices=MEDICAL_SYSTEMS,
255
- value="All Systems",
256
- label="πŸ₯ Medical System Filter"
257
- )
258
 
259
  with gr.Row():
260
- top_n_slider = gr.Slider(
261
- minimum=3, maximum=25, value=10, step=1,
262
- label="πŸ“‹ Number of Results"
263
- )
264
- min_score_slider = gr.Slider(
265
- minimum=0.01, maximum=0.5, value=0.05, step=0.01,
266
- label="🎯 Minimum Similarity Score"
267
- )
268
-
269
- recommend_btn = gr.Button("πŸš€ Get Recommendations", variant="primary", size="lg")
270
- summary_box = gr.Markdown(label="Summary")
271
- results_table = gr.DataFrame(
272
  label="πŸ“‹ Recommended Drugs",
273
- wrap=True,
274
- interactive=False
275
  )
276
 
277
- gr.Examples(
278
- examples=EXAMPLE_QUERIES,
279
- inputs=[query_input, system_filter, top_n_slider, min_score_slider],
280
- label="πŸ“Œ Quick Examples β€” Click to Try"
281
  )
282
 
283
- recommend_btn.click(
284
- fn=recommend_drugs,
285
- inputs=[query_input, system_filter, top_n_slider, min_score_slider],
286
- outputs=[results_table, summary_box]
287
- )
 
 
288
 
289
- # ── Tab 2: Cross-System Comparison ──────────────────────────────
290
- with gr.TabItem("πŸ”„ Cross-System Comparison"):
 
 
291
  gr.Markdown("""
292
- ### Compare the same drug/query across ALL 5 medical systems simultaneously
293
- > This is the **core thesis contribution** β€” finding equivalent treatments across Allopathic, Ayurvedic, Unani, Homeopathic, and Herbal systems.
 
294
  """)
295
 
296
- with gr.Row():
297
- cross_query = gr.Textbox(
298
- label="πŸ”Ž Drug / Condition Query",
299
- placeholder="e.g. fever tablet, antibiotic, digestive...",
300
- lines=1,
301
- scale=4
302
- )
303
- top_per_sys = gr.Slider(
304
- minimum=1, maximum=5, value=3, step=1,
305
- label="Results per System",
306
- scale=2
307
- )
308
-
309
- compare_btn = gr.Button("πŸ”„ Compare Across All Systems", variant="primary", size="lg")
310
  cross_summary = gr.Markdown()
311
- cross_table = gr.DataFrame(label="🌐 Cross-System Drug Comparison", wrap=True, interactive=False)
312
-
313
- gr.Examples(
314
- examples=CROSS_SYSTEM_EXAMPLES,
315
- inputs=[cross_query, top_per_sys],
316
- label="πŸ“Œ Quick Examples"
317
  )
318
 
319
  compare_btn.click(
320
  fn=cross_system_compare,
321
- inputs=[cross_query, top_per_sys],
322
- outputs=[cross_table, cross_summary]
323
  )
324
 
325
- # ── Tab 3: Dataset Stats ─────────────────────────────────────────
326
- with gr.TabItem("πŸ“Š Dataset Statistics"):
327
- stats_output = gr.Markdown()
328
- refresh_btn = gr.Button("πŸ”„ Load Statistics", variant="secondary")
329
- refresh_btn.click(fn=get_stats, inputs=[], outputs=[stats_output])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
331
- # ── Tab 4: About ─────────────────────────────────────────────────
332
- with gr.TabItem("πŸ“š About / Thesis"):
333
  gr.Markdown("""
334
- ## πŸ“– About This Project
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
- ### Thesis Title
337
- **Intelligent Cross-Medical-System Drug Recommendation Using NLP and Similarity-Based Learning**
 
338
 
339
- ### Problem Statement
340
- Healthcare practitioners often need to recommend drug alternatives across different medical traditions
341
- (Allopathic, Ayurvedic, Unani, Homeopathic, Herbal), especially in regions like South Asia where
342
- multiple medical systems coexist. No unified digital tool existed for this task.
343
 
344
- ### Methodology
345
- | Component | Technique |
346
- |-----------|-----------|
347
- | Text Feature Extraction | TF-IDF (1,2-gram, 10,000 features) |
348
- | Similarity Engine | Cosine Similarity |
349
- | Dimensionality Reduction | Truncated SVD (50 components) |
350
- | Drug Clustering | K-Means (K=10) |
351
- | Evaluation Metric | Precision@K, Silhouette Score |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
  ### Dataset
354
- - **Source**: Bangladesh National Drug Registry (via Kaggle)
355
- - **Size**: 53,584 drug records
356
- - **Systems**: Allopathic (36k), Unani (8.5k), Ayurvedic (5.3k), Homeopathic (2.6k), Herbal (1k)
357
- - **Fields**: Brand Name, Generic Name, Strength, Dosage Form, Manufacturer
358
-
359
- ### Key Contributions
360
- 1. **First unified cross-medical-system recommender** for South Asian drug registry
361
- 2. **NLP-driven**: TF-IDF bigrams handle compound drug names (e.g., "Diphenhydramine + Zinc Acetate")
362
- 3. **Clustering analysis** reveals natural drug groupings across cultural medical traditions
363
- 4. **Deployable**: Fast PKL-based inference, <100ms per query
364
-
365
- ### Model Files
366
- ```
367
- models/
368
- β”œβ”€β”€ tfidf_vectorizer.pkl β€” Fitted TF-IDF transformer
369
- β”œβ”€β”€ tfidf_matrix.pkl β€” Pre-computed drug feature matrix
370
- β”œβ”€β”€ svd_reducer.pkl β€” SVD dimensionality reducer
371
- β”œβ”€β”€ kmeans_model.pkl β€” K-Means cluster assignments
372
- └── drug_database.csv β€” Processed drug database
373
- ```
374
-
375
- ### How to Cite
376
- ```
377
- Author, (2024). Cross-Medical-System Drug Recommendation Engine.
378
- Master's Thesis, [University Name].
379
- Dataset: https://www.kaggle.com/datasets/shuvokumarbasak2030/drug-pharma-new-dataset
380
- ```
381
  """)
382
 
383
  gr.HTML("""
384
- <div style="text-align:center; padding:12px; color:#666; font-size:0.85em; margin-top:10px;">
385
- πŸŽ“ Master's Thesis Project | Drug Recommendation System |
386
- Built with TF-IDF + Cosine Similarity | Hugging Face Spaces
 
387
  </div>
388
  """)
389
 
 
 
390
  if __name__ == "__main__":
391
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ # ═══════════════════════════════════════════════════════════════════
3
+ # IMPORTS
4
+ # ═══════════════════════════════════════════════════════════════════
5
  import gradio as gr
6
  import pandas as pd
7
  import numpy as np
 
10
  import os
11
  import re
12
  import warnings
13
+ import requests
14
+ import plotly.graph_objects as go
15
+ import plotly.express as px
16
+ from plotly.subplots import make_subplots
17
 
18
  warnings.filterwarnings("ignore")
19
 
20
+ # ═══════════════════════════════════════════════════════════════════
21
+ # CONSTANTS
22
+ # ═══════════════════════════════════════════════════════════════════
23
+
24
+ OPENFDA_BASE = "https://api.fda.gov/drug"
25
+ MODEL_DIR = os.path.join(os.path.dirname(__file__), "models")
26
+
27
+ # System colour palette — used consistently across all charts
28
+ SYSTEM_COLORS = {
29
+ "Allopathic": "#3b82f6", # blue
30
+ "Unani": "#f97316", # orange
31
+ "Ayurvedic": "#22c55e", # green
32
+ "Homeopathic": "#a855f7", # purple
33
+ "Herbal": "#ef4444", # red
34
+ }
35
+
36
+ # ─── 30 Curated Drug Options ─────────────────────────────────────
37
+ # Key = display label shown in the Gradio Dropdown
38
+ # Value = exact TF-IDF search query passed to recommend()
39
+ # Changing this dict is the ONLY place you need to add/remove options.
40
+ DRUG_OPTIONS = {
41
+ # ── Antibiotics ──────────────────────────────────────────────
42
+ "🦠 Azithromycin β€” Antibiotic (Respiratory)": "Azithromycin 500mg tablet",
43
+ "🦠 Amoxicillin β€” Antibiotic (Broad Spectrum)": "Amoxicillin 500mg capsule",
44
+ "🦠 Ciprofloxacin β€” Antibiotic (UTI/Infection)": "Ciprofloxacin 500mg tablet",
45
+ "🦠 Metronidazole β€” Antibiotic (Anaerobic)": "Metronidazole 400mg tablet",
46
+ "🦠 Ceftriaxone β€” Antibiotic (Injection)": "Ceftriaxone 1gm injection",
47
+ "🦠 Levofloxacin β€” Antibiotic (Pneumonia)": "Levofloxacin 500mg tablet",
48
+ # ── Pain & Fever ─────────────────────────────────────────────
49
+ "πŸ€’ Paracetamol β€” Fever & Pain Relief": "Paracetamol 500mg tablet",
50
+ "πŸ€’ Diclofenac β€” Anti-inflammatory": "Diclofenac Sodium 50mg tablet",
51
+ "πŸ€’ Naproxen β€” Pain Relief (Joints)": "Naproxen 250mg tablet",
52
+ "πŸ€’ Ketorolac β€” Strong Painkiller (Injection)": "Ketorolac 30mg injection",
53
+ # ── Heart & Blood Pressure ───────────────────────────────────
54
+ "πŸ’“ Amlodipine β€” Blood Pressure": "Amlodipine 5mg tablet",
55
+ "πŸ’“ Atorvastatin β€” Cholesterol": "Atorvastatin 20mg tablet",
56
+ "πŸ’“ Losartan β€” Hypertension": "Losartan Potassium 50mg tablet",
57
+ "πŸ’“ Metoprolol β€” Heart Rate / Beta Blocker": "Metoprolol 50mg tablet",
58
+ # ── Diabetes ─────────────────────────────────────────────────
59
+ "🩺 Metformin β€” Type 2 Diabetes": "Metformin Hydrochloride 500mg tablet",
60
+ "🩺 Glibenclamide β€” Blood Sugar Control": "Glibenclamide 5mg tablet",
61
+ # ── Respiratory & Allergy ────────────────────────────────────
62
+ "🫁 Salbutamol β€” Asthma / Bronchospasm": "Salbutamol 2mg tablet syrup",
63
+ "🫁 Montelukast β€” Asthma / Allergy": "Montelukast 10mg tablet",
64
+ "🫁 Fexofenadine β€” Allergy / Antihistamine": "Fexofenadine Hydrochloride 120mg tablet",
65
+ "🫁 Cetirizine β€” Allergy / Antihistamine": "Cetirizine Dihydrochloride 10mg tablet",
66
+ # ── Neuro / Mental Health ────────────────────────────────────
67
+ "🧠 Pregabalin β€” Nerve Pain / Anxiety": "Pregabalin 75mg capsule",
68
+ "🧠 Clonazepam β€” Anxiety / Seizure": "Clonazepam 0.5mg tablet",
69
+ # ── GI / Stomach ─────────────────────────────────────────────
70
+ "πŸ«ƒ Omeprazole β€” Acid Reflux / Ulcer": "Omeprazole 20mg capsule",
71
+ "πŸ«ƒ Esomeprazole β€” GERD / Acid": "Esomeprazole 40mg capsule",
72
+ "πŸ«ƒ Domperidone β€” Nausea / Vomiting": "Domperidone 10mg tablet",
73
+ "πŸ«ƒ Ondansetron β€” Nausea (Chemotherapy)": "Ondansetron 4mg tablet",
74
+ # ── Anti-infective / Antifungal ──────────────────────────────
75
+ "🌿 Albendazole β€” Deworming": "Albendazole 400mg tablet",
76
+ "🌿 Fluconazole β€” Antifungal": "Fluconazole 150mg capsule",
77
+ # ── Vitamins & Supplements ───────────────────────────────────
78
+ "πŸ’Š Vitamin D3 β€” Bone / Immunity": "Cholecalciferol Vitamin D3 tablet",
79
+ "πŸ’Š Zinc + Multivitamin β€” Immunity": "Zinc Nicotinamide Pyridoxine vitamin tablet",
80
+ }
81
+
82
+ DROPDOWN_LABELS = list(DRUG_OPTIONS.keys())
83
+
84
+
85
+ # ═══════════════════════════════════════════════════════════════════
86
+ # ✅ BUG FIX — build_drug_text
87
+ # ═══════════════════════════════════════════════════════════════════
88
+ # THIS FUNCTION IS THE CORE FIX.
89
+ #
90
+ # OLD behaviour (buggy):
91
+ # All systems used: GenericName + Dosage + Strength + System
92
+ # For Ayurvedic/Unani/Homeopathic/Herbal, Generic Name is NULL in
93
+ # the dataset, so the code fell back to Brand Name.
94
+ # Brand names like "Feverfit", "Paincap", "Paralead" contain tokens
95
+ # like "fever", "pain", "para" → TF-IDF wrongly matched these when
96
+ # a user searched "paracetamol fever tablet".
97
+ #
98
+ # NEW behaviour (fixed):
99
+ # Allopathic → GenericName + Dosage + Strength + "allopathic"
100
+ # (uses the real pharmaceutical compound)
101
+ # Non-allopathic → Dosage + Strength + SystemName ONLY
102
+ # (brand name noise removed entirely)
103
+ #
104
+ # Result: "paracetamol fever tablet" now returns ONLY Allopathic
105
+ # compounds like Paracetamol, Acetaminophen — no more "Feverfit".
106
+ # ─────────────────────────────────────────────────────────────────
107
+
108
+ def _clean(t) -> str:
109
+ """Lowercase, remove special chars, collapse whitespace."""
110
+ if pd.isna(t):
111
+ return ""
112
+ t = re.sub(r"[^a-z0-9\s\+\-\.]", " ", str(t).lower())
113
+ return re.sub(r"\s+", " ", t).strip()
114
+
115
+
116
+ def build_drug_text(row) -> str:
117
+ """
118
+ βœ… FIXED version of drug_text construction.
119
+
120
+ Allopathic β†’ rich text: compound + dosage + strength + system
121
+ All others β†’ lean text: dosage + strength + system (NO brand name)
122
+ """
123
+ if row["medical_system"] == "Allopathic":
124
+ return " ".join(filter(None, [
125
+ _clean(row.get("Generic Name", "")),
126
+ _clean(row.get("Dosages Description", "")),
127
+ _clean(str(row.get("Strength", ""))),
128
+ "allopathic",
129
+ ]))
130
+ else:
131
+ # Non-allopathic: Generic Name is always NULL in this dataset.
132
+ # Using Brand Name as fallback was the source of the bug.
133
+ # We intentionally exclude it here.
134
+ return " ".join(filter(None, [
135
+ _clean(row.get("Dosages Description", "")),
136
+ _clean(str(row.get("Strength", ""))),
137
+ _clean(row.get("medical_system", "")),
138
+ ]))
139
+
140
 
141
+ # ═══════════════════════════════════════════════════════════════════
142
+ # LOAD PKL MODELS
143
+ # ═══════════════════════════════════════════════════════════════════
144
 
145
  def load_models():
146
+ print("Loading models …")
147
+ vec = joblib.load(os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl"))
148
+ mat = joblib.load(os.path.join(MODEL_DIR, "tfidf_matrix.pkl"))
149
+ db = pd.read_csv(os.path.join(MODEL_DIR, "drug_database.csv"))
150
  with open(os.path.join(MODEL_DIR, "model_metadata.json")) as f:
151
+ meta = json.load(f)
152
+ print(f"βœ… {len(db):,} drugs Β· {mat.shape[1]:,} features loaded")
153
+ return vec, mat, db, meta
154
 
155
 
156
  try:
 
158
  vectorizer, tfidf_matrix, drug_db, metadata = load_models()
159
  MEDICAL_SYSTEMS = ["All Systems"] + sorted(drug_db["medical_system"].unique().tolist())
160
  MODEL_LOADED = True
161
+ except Exception as exc:
162
+ print(f"Model load failed: {exc}")
163
  MODEL_LOADED = False
164
  MEDICAL_SYSTEMS = ["All Systems"]
165
+ drug_db = pd.DataFrame()
166
+ metadata = {}
167
 
168
 
169
+ # ═══════════════════════════════════════════════════════════════════
170
+ # CORE RECOMMENDER
171
+ # ═══════════════════════════════════════════════════════════════════
172
 
173
+ def _get_query(drug_label: str) -> str:
174
+ """Map dropdown label β†’ TF-IDF search query."""
175
+ return DRUG_OPTIONS.get(drug_label, drug_label)
 
 
 
 
176
 
 
 
177
 
178
+ def _run_similarity(query: str, system_filter: str, top_n: int, min_score: float):
179
+ """Inner similarity search. Returns (indices, scores)."""
180
+ q_clean = _clean(query)
181
+ q_vec = vectorizer.transform([q_clean])
182
+ sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
183
 
184
+ if system_filter and system_filter != "All Systems":
185
+ mask = drug_db["medical_system"] == system_filter
 
 
 
 
 
186
  sims_work = sims.copy()
187
+ sims_work[~mask] = 0.0
188
  else:
189
  sims_work = sims
190
 
191
+ candidate_idx = sims_work.argsort()[-(top_n * 4):][::-1]
192
+ filtered_idx = [i for i in candidate_idx if sims[i] >= min_score][:top_n]
193
+ return filtered_idx, sims
194
 
195
+
196
+ def recommend_from_selection(drug_label: str, system_filter: str,
197
+ top_n: int, min_score: float):
198
+ """Tab 1 β€” Dataset recommendations from PKL model."""
199
+ if not MODEL_LOADED:
200
+ return None, "❌ Models not loaded. Ensure `/models` folder is present."
201
+ if not drug_label:
202
+ return None, "⚠️ Please select a drug from the dropdown."
203
+
204
+ query = _get_query(drug_label)
205
+ idx, sims = _run_similarity(query, system_filter, top_n, min_score)
206
+
207
+ if not idx:
208
+ return None, (
209
+ f"⚠️ No results above similarity score **{min_score}**. "
210
+ "Try lowering the threshold slider."
211
  )
212
 
213
+ out = drug_db.iloc[idx][[
214
+ "brand_name", "generic_name", "dosage_form",
215
+ "strength", "medical_system", "manufacturer",
216
  ]].copy()
217
+ out["similarity_score"] = [round(float(sims[i]), 4) for i in idx]
218
+ out = out.sort_values("similarity_score", ascending=False).reset_index(drop=True)
219
+ out.index = range(1, len(out) + 1)
220
+ out.index.name = "Rank"
221
+ out.columns = [
222
+ "Brand Name", "Generic Name", "Dosage Form",
223
+ "Strength", "Medical System", "Manufacturer", "Score",
224
+ ]
225
+
226
+ sys_counts = out["Medical System"].value_counts()
227
+ sys_str = " Β· ".join(f"**{k}** {v}" for k, v in sys_counts.items())
228
+ label_short = drug_label.split("β€”")[0].strip()
229
+ summary = (
230
+ f"### βœ… {len(out)} results for {label_short}\n\n"
231
+ f"{sys_str}\n\n"
232
+ f"*Query used: `{query}`*"
233
+ )
234
+ return out, summary
235
 
 
 
 
236
 
237
+ def cross_system_compare(drug_label: str, top_per_system: int):
238
+ """Tab 2 β€” Best N drugs from every system side by side."""
239
+ if not MODEL_LOADED:
240
+ return None, "❌ Models not loaded."
241
+ if not drug_label:
242
+ return None, "⚠️ Select a drug first."
243
+
244
+ query = _get_query(drug_label)
245
+ q_clean = _clean(query)
246
+ q_vec = vectorizer.transform([q_clean])
247
+ sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
248
+
249
+ rows = []
250
+ for system in sorted(drug_db["medical_system"].unique()):
251
+ mask = drug_db["medical_system"] == system
252
+ s = sims.copy(); s[~mask] = 0.0
253
+ idx = [i for i in s.argsort()[-top_per_system:][::-1] if sims[i] > 0.01]
254
+ for i in idx:
255
+ r = drug_db.iloc[i]
256
+ rows.append({
257
+ "Medical System": r["medical_system"],
258
+ "Brand Name": r["brand_name"],
259
+ "Generic Name": r["generic_name"],
260
+ "Dosage Form": r["dosage_form"],
261
+ "Strength": r["strength"],
262
+ "Score": round(float(sims[i]), 4),
263
+ })
264
+
265
+ if not rows:
266
+ return None, "No cross-system results found."
267
+
268
+ df = pd.DataFrame(rows).sort_values(
269
+ ["Medical System", "Score"], ascending=[True, False]
270
+ ).reset_index(drop=True)
271
+ df.index = range(1, len(df) + 1)
272
+ df.index.name = "Rank"
273
+
274
+ label_short = drug_label.split("β€”")[0].strip()
275
  summary = (
276
+ f"### 🌐 Cross-system: {label_short}\n\n"
277
+ f"Top **{top_per_system}** per system Β· {len(df)} total drugs Β· "
278
+ f"{df['Medical System'].nunique()} systems"
279
  )
280
+ return df, summary
281
 
282
 
283
+ # ═══════════════════════════════════════════════════════════════════
284
+ # OPENFDA API HELPERS
285
+ # ═══════════════════════════════════════════════════════════════════
286
+
287
+ def _openfda(endpoint: str, params: dict, timeout: int = 10) -> dict:
288
+ try:
289
+ r = requests.get(
290
+ f"{OPENFDA_BASE}/{endpoint}.json",
291
+ params=params, timeout=timeout,
292
+ headers={"User-Agent": "DrugRecommenderThesis/3.0"},
293
+ )
294
+ if r.status_code == 200:
295
+ return r.json()
296
+ return {"error": f"HTTP {r.status_code}", "message": r.text[:200]}
297
+ except requests.exceptions.Timeout:
298
+ return {"error": "timeout", "message": "OpenFDA timed out β€” try again."}
299
+ except requests.exceptions.ConnectionError:
300
+ return {"error": "connection", "message": "Cannot reach OpenFDA. Check internet."}
301
+ except Exception as e:
302
+ return {"error": "unknown", "message": str(e)}
303
+
304
+
305
+ def _extract_generic(drug_label: str) -> str:
306
+ """'🦠 Azithromycin β€” Antibiotic' β†’ 'Azithromycin'"""
307
+ raw = drug_label.split("β€”")[0]
308
+ cleaned = re.sub(r"[^\w\s]", "", raw).strip()
309
+ words = cleaned.split()
310
+ return words[0] if words else cleaned
311
+
312
+
313
+ # ─── Tab 3: FDA Drug Label ───────────────────────────────────────
314
+ def get_fda_label(drug_label: str) -> str:
315
+ if not drug_label:
316
+ return "⚠️ Select a drug first."
317
+ generic = _extract_generic(drug_label)
318
+ data = _openfda("label", {"search": f"openfda.generic_name:{generic}", "limit": 1})
319
+
320
+ if "error" in data:
321
+ return (
322
+ f"### ⚠️ OpenFDA: {data['message']}\n\n"
323
+ f"*`{generic}` may not be in the US FDA database β€” "
324
+ "OpenFDA covers US-approved drugs only.*"
325
+ )
326
+
327
+ results = data.get("results", [])
328
+ if not results:
329
+ return f"ℹ️ No FDA label found for **{generic}**."
330
+
331
+ r = results[0]
332
+ ofd = r.get("openfda", {})
333
+ lines = [
334
+ f"## πŸ’Š FDA Label: {generic.title()}",
335
+ "_Source: U.S. Food & Drug Administration Β· OpenFDA_\n",
336
+ ]
337
+
338
+ def _add(key, title):
339
+ v = ofd.get(key, [])
340
+ if v:
341
+ lines.append(f"**{title}:** {', '.join(v[:5])}")
342
+
343
+ _add("brand_name", "Brand Names (US)")
344
+ _add("manufacturer_name", "Manufacturer")
345
+ _add("route", "Route")
346
+ lines.append("")
347
+
348
+ SECTIONS = [
349
+ ("indications_and_usage", "πŸ“‹ Indications & Usage", 700),
350
+ ("warnings", "⚠️ Warnings", 500),
351
+ ("dosage_and_administration", "πŸ’‰ Dosage & Administration", 500),
352
+ ("adverse_reactions", "πŸ”΄ Adverse Reactions", 400),
353
+ ("drug_interactions", "πŸ”— Drug Interactions", 400),
354
+ ("contraindications", "🚫 Contraindications", 400),
355
+ ]
356
+ for field, heading, limit in SECTIONS:
357
+ val = r.get(field, [])
358
+ if val:
359
+ lines.append(f"### {heading}")
360
+ lines.append(val[0][:limit] + ("…" if len(val[0]) > limit else "") + "\n")
361
+
362
+ lines.append("---")
363
+ lines.append(
364
+ "*Data from [OpenFDA](https://open.fda.gov) Β· "
365
+ "For research purposes only Β· Not clinical advice*"
366
+ )
367
+ return "\n".join(lines)
368
+
369
+
370
+ # ─── Tab 4: FAERS Adverse Events ────────────────────────────────
371
def get_fda_adverse_events(drug_label: str):
    """Fetch the most-reported FAERS adverse reactions for the selected drug.

    Args:
        drug_label: Dropdown label of the selected drug. It is reduced to a
            generic name with ``_extract_generic`` before querying.

    Returns:
        A ``(table, summary)`` pair. ``table`` is a DataFrame with columns
        "Adverse Reaction" and "Report Count", ranked from 1, or ``None`` when
        there is nothing to show. ``summary`` is Markdown text.
    """
    if not drug_label:
        return None, "⚠️ Select a drug first."

    generic = _extract_generic(drug_label)
    data = _openfda("event", {
        "search": f"patient.drug.medicinalproduct:{generic}",
        "count": "patient.reaction.reactionmeddrapt.exact",
        "limit": 15,
    })
    if "error" in data:
        return None, f"### ⚠️ FAERS: {data['message']}"

    results = data.get("results", [])
    if not results:
        return None, f"ℹ️ No FAERS data for **{generic}**."

    # Count queries return records shaped {"term": ..., "count": ...}.
    # Passing the display names as `columns=` to the constructor would select
    # keys that do not exist and give an all-NaN table, so load the real keys
    # first and rename afterwards.
    df = (
        pd.DataFrame(results)
        .reindex(columns=["term", "count"])
        .rename(columns={"term": "Adverse Reaction", "count": "Report Count"})
    )
    df["Report Count"] = (
        pd.to_numeric(df["Report Count"], errors="coerce").fillna(0).astype(int)
    )
    df = df.sort_values("Report Count", ascending=False).reset_index(drop=True)
    df.index = range(1, len(df) + 1)
    df.index.name = "Rank"

    total = int(df["Report Count"].sum())
    summary = (
        f"### 📊 FAERS Adverse Events: **{generic.title()}**\n\n"
        f"Top {len(df)} reactions · **{total:,} total reports** in FDA database\n\n"
        "*Source: FDA Adverse Event Reporting System (FAERS) via OpenFDA*"
    )
    return df, summary
399
+
400
+
401
+ # ─── Tab 4: NDC Lookup ──────────────────────────────────────────
402
def get_fda_ndc(drug_label: str):
    """Look up National Drug Code (NDC) products for the selected drug.

    Args:
        drug_label: Dropdown label of the selected drug. It is reduced to a
            generic name with ``_extract_generic`` before querying.

    Returns:
        A ``(table, summary)`` pair. ``table`` is a DataFrame with one row per
        product returned by OpenFDA (at most 10 are requested), or ``None``
        when there is nothing to show. ``summary`` is Markdown text.
    """
    if not drug_label:
        return None, "⚠️ Select a drug."

    generic = _extract_generic(drug_label)
    data = _openfda("ndc", {"search": f"generic_name:{generic}", "limit": 10})
    if "error" in data:
        return None, f"### ⚠️ NDC: {data['message']}"

    results = data.get("results", [])
    if not results:
        return None, f"ℹ️ No NDC data for **{generic}**."

    missing = "—"  # placeholder shown for any field absent from a record
    rows = [
        {
            "Brand Name": product.get("brand_name", missing),
            "Generic Name": product.get("generic_name", missing),
            "Dosage Form": product.get("dosage_form", missing),
            # An empty route list falls back to the same placeholder as the
            # other columns instead of rendering a blank cell.
            "Route": ", ".join(product.get("route") or []) or missing,
            "Manufacturer": product.get("labeler_name", missing),
            "Product Type": product.get("product_type", missing),
            "NDC Code": product.get("product_ndc", missing),
        }
        for product in results
    ]

    df = pd.DataFrame(rows)
    df.index = range(1, len(df) + 1)
    df.index.name = "#"

    summary = (
        f"### 🏷️ NDC Registry: **{generic.title()}**\n\n"
        f"**{len(df)} products** in US National Drug Code directory\n\n"
        "*Source: FDA NDC Database via OpenFDA*"
    )
    return df, summary
434
+
435
+
436
+ # ═══════════════════════════════════════════════════════════════════
437
+ # ✨ CHARTS β€” 5 unique visuals for the medical system overview tab
438
+ # ═══════════════════════════════════════════════════════════════════
439
+
440
+ # Precompute all chart data at startup (fast, in-memory)
441
# Aggregate the chart inputs once at import time; the chart functions below
# only read these module-level tables.
if not MODEL_LOADED or drug_db.empty:
    # Nothing to plot: single-slice placeholders keep the simple charts
    # renderable, and the empty frame makes the crosstab-based charts bail out.
    _sys_counts = pd.Series({"No data": 1})
    _dosage_top10 = pd.Series({"No data": 1})
    _mfr_top15 = pd.Series({"No data": 1})
    _sys_dosage = pd.DataFrame()
else:
    _sys_counts = drug_db["medical_system"].value_counts()
    _dosage_top10 = drug_db["dosage_form"].value_counts().head(10)
    _mfr_top15 = drug_db["manufacturer"].value_counts().head(15)
    # System x dosage-form counts, restricted to the eight most common forms.
    _sys_dosage = pd.crosstab(
        drug_db["medical_system"], drug_db["dosage_form"]
    )[_dosage_top10.index[:8]]
452
+
453
+
454
def _sys_colors(labels):
    """Return the SYSTEM_COLORS entry for each label, or "#64748b" when absent."""
    palette = []
    for name in labels:
        palette.append(SYSTEM_COLORS.get(name, "#64748b"))
    return palette
456
+
457
+
458
+ # ── Chart 1: Donut β€” Drug share per medical system ───────────────
459
def chart_donut():
    """Return a donut chart of the drug count per medical system."""
    system_names = _sys_counts.index.tolist()
    drug_totals = _sys_counts.values.tolist()

    slice_style = dict(
        colors=_sys_colors(system_names),
        line=dict(color="#ffffff", width=2.5),
    )
    pie = go.Pie(
        labels=system_names,
        values=drug_totals,
        hole=0.55,
        marker=slice_style,
        textinfo="label+percent",
        textfont=dict(size=13),
        hovertemplate="<b>%{label}</b><br>%{value:,} drugs<br>%{percent}<extra></extra>",
    )

    # Grand total printed in the hole of the donut.
    centre_note = dict(
        text=f"<b>{_sys_counts.sum():,}</b><br>Total Drugs",
        x=0.5, y=0.5, font=dict(size=15), showarrow=False,
    )
    heading = dict(
        text="<b>Drug Distribution Across 5 Medical Systems</b>",
        x=0.5, xanchor="center", font=dict(size=17),
    )

    figure = go.Figure(pie)
    figure.update_layout(
        title=heading,
        annotations=[centre_note],
        legend=dict(orientation="h", y=-0.08, x=0.5, xanchor="center"),
        height=420,
        margin=dict(t=60, b=40, l=20, r=20),
        paper_bgcolor="white", plot_bgcolor="white",
    )
    return figure
488
+
489
+
490
+ # ── Chart 2: Horizontal bar β€” Top 10 dosage forms ────────────────
491
def chart_dosage_bar():
    """Return a horizontal bar chart of the most common dosage forms.

    Bars are drawn smallest-first so the largest ends up at the top of the
    chart. Each bar is shaded from the "Blues" colour scale by its own value,
    so every bar gets a colour no matter how many bars there are. The previous
    fixed palette slice held at most seven colours for up to ten bars.
    """
    labels = _dosage_top10.index.tolist()[::-1]
    values = _dosage_top10.values.tolist()[::-1]
    peak = max(values) if values else 1

    fig = go.Figure(go.Bar(
        y=labels, x=values,
        orientation="h",
        marker=dict(
            color=values,
            colorscale="Blues",
            # Start the scale below zero so the smallest bar is still a
            # visible blue rather than the near-white end of the ramp.
            cmin=-0.4 * peak,
            cmax=peak,
            showscale=False,
        ),
        text=[f" {v:,}" for v in values],
        textposition="outside",
        hovertemplate="<b>%{y}</b>: %{x:,} drugs<extra></extra>",
    ))
    fig.update_layout(
        title=dict(
            text="<b>Top 10 Dosage Forms</b>",
            x=0.5, xanchor="center", font=dict(size=17),
        ),
        xaxis=dict(title="Number of Drugs", showgrid=True, gridcolor="#f0f0f0"),
        yaxis=dict(title=""),
        height=420,
        margin=dict(t=60, b=40, l=160, r=60),
        paper_bgcolor="white", plot_bgcolor="white",
    )
    return fig
516
+
517
+
518
+ # ── Chart 3: Grouped bar β€” Dosage form per system ────────────────
519
def chart_system_dosage_grouped():
    """Return a grouped bar chart: one bar group per medical system, one bar per dosage form.

    Returns an empty figure when the system x dosage table has no data.
    """
    if _sys_dosage.empty:
        return go.Figure()

    forms = _sys_dosage.columns.tolist()
    swatches = px.colors.qualitative.Pastel[:len(forms)]

    figure = go.Figure()
    # zip() stops at the shorter sequence, so forms beyond the palette are skipped.
    for form, swatch in zip(forms, swatches):
        bars = go.Bar(
            name=form,
            x=_sys_dosage.index.tolist(),
            y=_sys_dosage[form].tolist(),
            marker_color=swatch,
            hovertemplate=f"<b>{form}</b><br>%{{x}}: %{{y:,}}<extra></extra>",
        )
        figure.add_trace(bars)

    heading = dict(
        text="<b>Dosage Form Breakdown per Medical System</b>",
        x=0.5, xanchor="center", font=dict(size=17),
    )
    figure.update_layout(
        barmode="group",
        title=heading,
        xaxis=dict(title="Medical System"),
        yaxis=dict(title="Drug Count", showgrid=True, gridcolor="#f0f0f0"),
        legend=dict(title="Dosage Form", orientation="h", y=-0.22, x=0.5, xanchor="center"),
        height=460,
        margin=dict(t=60, b=100, l=60, r=20),
        paper_bgcolor="white", plot_bgcolor="white",
    )
    return figure
550
+
551
+
552
+ # ── Chart 4: Treemap β€” Manufacturer Γ— System ─────────────────────
553
def chart_treemap():
    """Return a treemap of the five largest manufacturers within each medical system.

    Returns an empty figure when the model data is unavailable. This uses the
    same condition as the module-level chart precomputation, so a missing
    dataset cannot make this chart raise and take the whole dashboard down
    with it.
    """
    if not MODEL_LOADED or drug_db.empty:
        return go.Figure()

    top_mfr = (
        drug_db.groupby(["medical_system", "manufacturer"])
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )
    # Rows are already sorted by count, so head(5) per group keeps each
    # system's five largest manufacturers.
    top_mfr = top_mfr.groupby("medical_system").head(5).reset_index(drop=True)

    fig = px.treemap(
        top_mfr,
        path=["medical_system", "manufacturer"],
        values="count",
        color="medical_system",
        color_discrete_map=SYSTEM_COLORS,
        custom_data=["count"],
    )
    fig.update_traces(
        hovertemplate="<b>%{label}</b><br>Products: %{customdata[0]:,}<extra></extra>",
        textfont=dict(size=12),
    )
    fig.update_layout(
        title=dict(
            text="<b>Top Manufacturers by Medical System (Treemap)</b>",
            x=0.5, xanchor="center", font=dict(size=17),
        ),
        height=480,
        margin=dict(t=60, b=20, l=20, r=20),
        paper_bgcolor="white",
    )
    return fig
581
+
582
+
583
+ # ── Chart 5: Radar β€” System profile across dosage dimensions ─────
584
def _rgba_fill(color: str, alpha: float = 0.15) -> str:
    """Return *color* as an ``rgba(r,g,b,a)`` string with the given opacity.

    Accepts ``#rgb`` / ``#rrggbb`` hex and ``rgb(...)`` / ``rgba(...)`` strings.
    Any other format is returned unchanged.
    """
    text = color.strip()
    if text.startswith("#"):
        digits = text[1:]
        if len(digits) == 3:
            digits = "".join(ch * 2 for ch in digits)
        if len(digits) != 6:
            return color
        try:
            red, green, blue = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
        except ValueError:
            return color
        return f"rgba({red},{green},{blue},{alpha})"
    if text.startswith(("rgb(", "rgba(")) and text.endswith(")"):
        channels = text[text.index("(") + 1:-1].split(",")[:3]
        if len(channels) == 3:
            return f"rgba({','.join(part.strip() for part in channels)},{alpha})"
    return color


def chart_radar():
    """Return a radar chart comparing each medical system's dosage-form profile.

    Only the dosage categories present in the precomputed system x dosage
    table are plotted. An empty figure is returned when none are present.
    """
    dosage_categories = ["Tablet", "Capsule", "Liquid", "Injection", "Syrup"]
    available_cats = [c for c in dosage_categories if c in _sys_dosage.columns]
    if not available_cats:
        return go.Figure()

    sub = _sys_dosage[available_cats]
    # Normalise each system (row) to 0-100 relative to its own most common
    # form. Dividing by the column maximum instead would pin the largest
    # system at 100 on every axis and flatten all the others.
    # Rows that are all zero become NaN (0/0) and are reset to 0.
    sub_n = sub.div(sub.max(axis=1), axis=0).fillna(0) * 100

    fig = go.Figure()
    for system in sub_n.index:
        vals = sub_n.loc[system].tolist()
        color = SYSTEM_COLORS.get(system, "#64748b")
        fig.add_trace(go.Scatterpolar(
            # Repeat the first point so the polygon closes.
            r=vals + [vals[0]],
            theta=available_cats + [available_cats[0]],
            fill="toself",
            # Explicit rgba() rather than appending hex alpha digits:
            # 8-digit hex may not be accepted by every Plotly version.
            fillcolor=_rgba_fill(color, 0.15),
            line=dict(color=color, width=2),
            name=system,
            hovertemplate="<b>" + system + "</b><br>%{theta}: %{r:.0f}%<extra></extra>",
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True, range=[0, 110],
                tickfont=dict(size=10), gridcolor="#e5e7eb",
            ),
            angularaxis=dict(tickfont=dict(size=12)),
            bgcolor="white",
        ),
        title=dict(
            text="<b>Medical System Profile — Dosage Form Radar</b>",
            x=0.5, xanchor="center", font=dict(size=17),
        ),
        showlegend=True,
        legend=dict(orientation="h", y=-0.12, x=0.5, xanchor="center"),
        height=460,
        margin=dict(t=60, b=80, l=60, r=60),
        paper_bgcolor="white", plot_bgcolor="white",
    )
    return fig
628
+
629
+
630
def build_all_charts():
    """Return the five chart figures as a tuple: donut, bar, grouped bar, treemap, radar."""
    builders = (
        chart_donut,
        chart_dosage_bar,
        chart_system_dosage_grouped,
        chart_treemap,
        chart_radar,
    )
    return tuple(build() for build in builders)
639
+
640
+
641
+ # ═══════════════════════════════════════════════════════════════════
642
+ # STATS TEXT
643
+ # ═══════════════════════════════════════════════════════════════════
644
def get_stats() -> str:
    """Return a Markdown summary of the loaded drug dataset.

    The summary holds a metrics table, the per-system distribution with a
    text bar (one block per 3 percentage points), and the ten most common
    dosage forms. Returns a short notice when the models are not loaded.
    """
    if not MODEL_LOADED:
        return "Models not loaded."

    total = len(drug_db)
    sys_dist = drug_db["medical_system"].value_counts()
    dosage_dist = drug_db["dosage_form"].value_counts().head(10)

    parts = [
        "## 📊 Dataset Statistics\n",
        "\n",
        "| Metric | Value |\n",
        "|--------|-------|\n",
        f"| **Total Drugs** | {total:,} |\n",
        f"| **Medical Systems** | {drug_db['medical_system'].nunique()} |\n",
        f"| **Unique Manufacturers** | {drug_db['manufacturer'].nunique():,} |\n",
        f"| **Unique Brand Names** | {drug_db['brand_name'].nunique():,} |\n",
        f"| **TF-IDF Features** | {metadata.get('n_features', 10000):,} |\n",
        f"| **Silhouette Score** | {metadata.get('silhouette_score', 'N/A')} |\n",
        "| **Bug Fix Applied** | Non-allopathic brand names excluded from TF-IDF |\n",
        "\n",
        "### 🏥 Medical Systems\n",
    ]

    for system, count in sys_dist.items():
        pct = count / total * 100
        bar = "█" * int(pct / 3)
        parts.append(f"\n- **{system}**: {count:,} ({pct:.1f}%) `{bar}`")

    parts.append("\n\n### 💊 Top 10 Dosage Forms\n")
    parts.extend(f"\n- {form}: {count:,}" for form, count in dosage_dist.items())
    return "".join(parts)
671
 
 
 
 
672
 
673
+ # ═══════════════════════════════════════════════════════════════════
674
+ # GRADIO UI
675
+ # ═══════════════════════════════════════════════════════════════════
676
 
677
# Page-level styles passed to gr.Blocks(css=...): container width and font,
# the hero banner, the system badges, the fix-note box, and a hidden footer.
CSS = """
.gradio-container {
    max-width: 1080px !important;
    margin: auto !important;
    font-family: 'Segoe UI', system-ui, sans-serif !important;
}
.hero {
    background: linear-gradient(135deg, #0f172a 0%, #1e1b4b 55%, #0f172a 100%);
    border: 1px solid rgba(99,102,241,0.35);
    border-radius: 16px;
    padding: 28px 32px 22px;
    margin-bottom: 18px;
    text-align: center;
}
.sbadge {
    display: inline-block; border-radius: 999px;
    padding: 4px 13px; font-size: 12px; margin: 3px;
}
.fix-note {
    background: rgba(34,197,94,0.08);
    border: 1px solid rgba(34,197,94,0.25);
    border-radius: 10px; padding: 11px 16px;
    font-size: 13px; margin: 8px 0 12px;
}
footer { display: none !important; }
"""

# Hero banner shown at the top of the page. The per-system counts are static
# text, not computed from the loaded dataset.
HEADER_HTML = """
<div class="hero">
  <h1 style="color:white;font-size:2em;margin:0 0 8px;font-weight:800;letter-spacing:-0.5px;">
    💊 Cross-Medical-System Drug Recommender
  </h1>
  <p style="color:#94a3b8;margin:0 0 14px;font-size:1rem;">
    53,581 drugs · NLP-Powered · Master's Thesis · + Live OpenFDA API
  </p>
  <div>
    <span class="sbadge" style="background:rgba(59,130,246,.15);border:1px solid rgba(59,130,246,.3);color:#93c5fd;">🔵 Allopathic 36,251</span>
    <span class="sbadge" style="background:rgba(249,115,22,.12);border:1px solid rgba(249,115,22,.3);color:#fdba74;">🟠 Unani 8,460</span>
    <span class="sbadge" style="background:rgba(34,197,94,.12);border:1px solid rgba(34,197,94,.3);color:#86efac;">🟢 Ayurvedic 5,262</span>
    <span class="sbadge" style="background:rgba(168,85,247,.12);border:1px solid rgba(168,85,247,.3);color:#d8b4fe;">🟣 Homeopathic 2,580</span>
    <span class="sbadge" style="background:rgba(239,68,68,.1);border:1px solid rgba(239,68,68,.3);color:#fca5a5;">🔴 Herbal 1,028</span>
    <span class="sbadge" style="background:rgba(16,185,129,.1);border:1px solid rgba(16,185,129,.3);color:#6ee7b7;">🇺🇸 + OpenFDA API</span>
  </div>
</div>
"""

# Notice shown on the recommendations tab describing the TF-IDF matching fix.
FIX_NOTE_HTML = """
<div class="fix-note">
  <strong>✅ Bug fix applied:</strong> Non-allopathic drugs (Ayurvedic, Unani, Homeopathic, Herbal)
  no longer appear in Allopathic compound searches.
  Brand names like <em>"Feverfit"</em> or <em>"Paincap"</em> are no longer used as TF-IDF tokens —
  only the pharmaceutical compound name (Generic Name) drives matching for Allopathic drugs.
</div>
"""
731
 
732
# Top-level Gradio layout: one shared drug selector feeding seven tabs.
# Tabs 1-2 query the local model, tabs 3-4 call OpenFDA, tabs 5-6 show
# dataset charts and statistics, and tab 7 is static documentation.
with gr.Blocks(css=CSS, title="💊 Drug Recommender v3", theme=gr.themes.Soft()) as demo:

    gr.HTML(HEADER_HTML)

    # ── Global selector — shared by all 4 data tabs ──
    gr.Markdown("### 👇 Step 1: Select a drug — then use any tab below")

    with gr.Row():
        with gr.Column(scale=5):
            drug_selector = gr.Dropdown(
                choices=DROPDOWN_LABELS,
                # Guard against an empty option list instead of raising IndexError.
                value=DROPDOWN_LABELS[0] if DROPDOWN_LABELS else None,
                label="💊 Select Drug / Category (30 options)",
                info="Antibiotics · Pain · Heart · Diabetes · Respiratory · Neuro · GI · Antifungal · Vitamins",
                interactive=True,
            )
        with gr.Column(scale=2):
            system_filter = gr.Dropdown(
                choices=MEDICAL_SYSTEMS,
                value="All Systems",
                label="🏥 Medical System Filter",
                info="Optional — narrows results",
            )

    gr.HTML("""
    <div style="background:#f8fafc;border:1px solid #e2e8f0;border-radius:10px;
                padding:11px 16px;font-size:13px;margin:6px 0 14px;color:#475569;">
      <strong>Two data sources:</strong>
      Tabs 1–2 query your <strong>local PKL model</strong> (53k drugs). &nbsp;|&nbsp;
      Tabs 3–4 call <strong>OpenFDA live API</strong> for real-time FDA data.
      Both use the same dropdown above.
    </div>
    """)

    with gr.Tabs():

        # ── TAB 1 — Dataset Recommendations ──
        with gr.TabItem("🔍 Dataset Recommendations"):
            gr.HTML(FIX_NOTE_HTML)
            gr.Markdown("""
            Finds similar drugs using **TF-IDF cosine similarity** across all 53,581 records.
            The dropdown selection maps to a precise search query — no typing needed.
            """)

            with gr.Row():
                top_n = gr.Slider(3, 25, value=10, step=1,
                                  label="📋 Number of Results")
                min_score = gr.Slider(0.01, 0.50, value=0.05, step=0.01,
                                      label="🎯 Min Similarity Score")

            rec_btn = gr.Button("🚀 Get Recommendations", variant="primary", size="lg")
            rec_summary = gr.Markdown()
            rec_table = gr.DataFrame(
                label="📋 Recommended Drugs",
                wrap=True, interactive=False,
            )

            rec_btn.click(
                fn=recommend_from_selection,
                inputs=[drug_selector, system_filter, top_n, min_score],
                outputs=[rec_table, rec_summary],
            )

            gr.Markdown("""
            ---
            **How matching works per system:**
            - **Allopathic** → matched by *Generic Name* compound (e.g. "Paracetamol") ✅
            - **Ayurvedic / Unani / Homeopathic / Herbal** → matched by *dosage form + system*
              (no generic compound data exists in this dataset for these systems)
            """)

        # ── TAB 2 — Cross-System Comparison ──
        with gr.TabItem("🌐 Cross-System Compare"):
            gr.Markdown("""
            ### 🏆 Core Thesis Feature
            Best results from **every medical tradition** side by side.
            Bridges Allopathic ↔ Ayurvedic ↔ Unani ↔ Homeopathic ↔ Herbal.
            """)

            top_per_sys = gr.Slider(1, 5, value=3, step=1,
                                    label="Results per Medical System")
            compare_btn = gr.Button("🔄 Compare All 5 Systems",
                                    variant="primary", size="lg")
            cross_summary = gr.Markdown()
            cross_table = gr.DataFrame(
                label="🌐 All 5 Medical Systems — Side by Side",
                wrap=True, interactive=False,
            )

            compare_btn.click(
                fn=cross_system_compare,
                inputs=[drug_selector, top_per_sys],
                outputs=[cross_table, cross_summary],
            )

        # ── TAB 3 — OpenFDA Drug Label (Live) ──
        with gr.TabItem("🇺🇸 FDA Label (Live)"):
            gr.Markdown("""
            ### Official FDA Drug Label — fetched live from OpenFDA
            Returns indications, warnings, dosage, adverse reactions, and drug interactions
            directly from the US Food & Drug Administration.

            > 🔌 **API:** [OpenFDA /drug/label](https://open.fda.gov/apis/drug/label/) · Free · No key required
            """)

            fda_label_btn = gr.Button("🔍 Fetch FDA Drug Label",
                                      variant="primary", size="lg")
            fda_label_result = gr.Markdown()

            fda_label_btn.click(
                fn=get_fda_label,
                inputs=[drug_selector],
                outputs=[fda_label_result],
            )

            gr.Markdown("""
            ---
            ⚠️ *OpenFDA covers US-approved drugs. Bangladesh dataset drugs may use
            different brand names or may not be in FDA records — this is expected.*
            """)

        # ── TAB 4 — FDA Adverse Events + NDC (Live) ──
        with gr.TabItem("⚠️ Adverse Events + NDC (Live)"):
            gr.Markdown("""
            ### FDA FAERS Adverse Events + National Drug Code Registry
            - **FAERS** — real patient-reported side effects from millions of reports
            - **NDC** — manufacturer, packaging, and product type data
            """)

            with gr.Row():
                ae_btn = gr.Button("📊 Fetch Adverse Events (FAERS)",
                                   variant="primary")
                ndc_btn = gr.Button("🏷️ Lookup NDC Directory",
                                    variant="secondary")

            ae_summary = gr.Markdown()
            ae_table = gr.DataFrame(
                label="⚠️ Top Adverse Reactions (Real FDA Data)",
                wrap=True, interactive=False,
            )

            gr.HTML("<hr style='margin:14px 0;border-color:#e2e8f0;'>")

            ndc_summary = gr.Markdown()
            ndc_table = gr.DataFrame(
                label="🏷️ NDC Product Registry",
                wrap=True, interactive=False,
            )

            ae_btn.click(
                fn=get_fda_adverse_events,
                inputs=[drug_selector],
                outputs=[ae_table, ae_summary],
            )
            ndc_btn.click(
                fn=get_fda_ndc,
                inputs=[drug_selector],
                outputs=[ndc_table, ndc_summary],
            )

            gr.Markdown("""
            ---
            > **APIs:** [OpenFDA FAERS](https://open.fda.gov/apis/drug/event/) ·
            > [OpenFDA NDC](https://open.fda.gov/apis/drug/ndc/) · Both free, no key.
            """)

        # ── TAB 5 — Visual Charts (5 unique plots) ──
        with gr.TabItem("📊 Visual Charts"):
            gr.Markdown("""
            ### 📊 Five unique visualisations of the 53,581-drug dataset
            Click **Load All Charts** to render the full dashboard.
            """)

            load_charts_btn = gr.Button("📊 Load All Charts",
                                        variant="primary", size="lg")

            with gr.Row():
                p1 = gr.Plot(label="① Drug Share by Medical System (Donut)")
                p2 = gr.Plot(label="② Top 10 Dosage Forms (Bar)")

            with gr.Row():
                p3 = gr.Plot(label="③ Dosage Form per System (Grouped Bar)")
                p4 = gr.Plot(label="④ Top Manufacturers Treemap")

            with gr.Row():
                p5 = gr.Plot(label="⑤ System Profile — Dosage Radar")

            # Output order must match the tuple returned by build_all_charts().
            load_charts_btn.click(
                fn=build_all_charts,
                inputs=[],
                outputs=[p1, p2, p3, p4, p5],
            )

        # ── TAB 6 — Stats text ──
        with gr.TabItem("📈 Dataset Stats"):
            load_stats_btn = gr.Button("📈 Load Statistics", variant="secondary")
            stats_output = gr.Markdown()
            load_stats_btn.click(fn=get_stats, inputs=[], outputs=[stats_output])

        # ── TAB 7 — About / Code Reference ──
        with gr.TabItem("📚 About / Code"):
            gr.Markdown("""
            ## 📖 About This Project

            **Thesis:** Intelligent Cross-Medical-System Drug Recommendation Using NLP

            ### Where the key code changes live in `app.py`

            | What changed | Function / location |
            |---|---|
            | ✅ Bug fix — non-allopathic brand name excluded | `build_drug_text()` ~line 100 |
            | ✨ 30-option dropdown | `DRUG_OPTIONS` dict ~line 60 |
            | 🇺🇸 FDA Drug Label API | `get_fda_label()` |
            | ⚠️ FDA FAERS adverse events | `get_fda_adverse_events()` |
            | 🏷️ FDA NDC lookup | `get_fda_ndc()` |
            | 📊 Donut chart | `chart_donut()` |
            | 📊 Bar chart | `chart_dosage_bar()` |
            | 📊 Grouped bar | `chart_system_dosage_grouped()` |
            | 📊 Treemap | `chart_treemap()` |
            | 📊 Radar chart | `chart_radar()` |

            ### Technical Stack

            | Layer | Technology |
            |---|---|
            | NLP | TF-IDF bigrams, 10,000 features, sublinear TF |
            | Similarity | Cosine Similarity |
            | Clustering | SVD (50d) + K-Means (K=10) |
            | External APIs | OpenFDA label · FAERS · NDC |
            | Visualisation | Plotly (donut, bar, grouped bar, treemap, radar) |
            | Deployment | Hugging Face Spaces · Gradio 4 |

            ### Dataset
            53,581 records · Bangladesh National Drug Registry ·
            [Kaggle link](https://www.kaggle.com/datasets/shuvokumarbasak2030/drug-pharma-new-dataset)

            ---
            ⚠️ *Research and educational purposes only. Not clinical advice.*
            """)

    gr.HTML("""
    <div style="text-align:center;padding:14px;color:#94a3b8;font-size:12px;
                border-top:1px solid #e2e8f0;margin-top:12px;">
      💊 Cross-Medical-System Drug Recommender v3.0 · Master's Thesis ·
      53,581 drugs · TF-IDF + Cosine Similarity · OpenFDA API · Plotly Charts
    </div>
    """)
993
 
994
+
995
+ # ═══════════════════════════════════════════════════════════════════
996
if __name__ == "__main__":
    # Serve on all network interfaces at port 7860, without a public share
    # link, and surface handler errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)