Spaces:
Paused
Paused
| import pandas as pd | |
| import numpy as np | |
| import streamlit as st | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
| from symspellpy import SymSpell, Verbosity | |
| # ---------------------- | |
| # Data Preparation | |
| # ---------------------- | |
def preprocess_data(file_path):
    """Load the medicine CSV and derive the text columns used by the app.

    All columns whose name starts with ``use``, ``substitute`` or
    ``sideEffect`` are merged per row into the comma-separated columns
    ``uses``, ``substitutes`` and ``side_effects`` (missing values are
    skipped).  The ``name``, ``uses``, ``Chemical Class`` and
    ``Therapeutic Class`` columns are then lower-cased and stripped of
    punctuation.

    Args:
        file_path: Path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        A new DataFrame with the columns ``id``, ``name``, ``uses``,
        ``substitutes``, ``side_effects``, ``Habit Forming``,
        ``Therapeutic Class`` and ``Action Class``.

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    df = pd.read_csv(file_path)

    # Snapshot the CSV's own columns so the prefix match is done once per
    # prefix (not once per row) and never picks up the derived columns
    # added below.
    source_columns = list(df.columns)

    def combine_columns(prefix):
        """Join the non-missing values of every ``prefix*`` column per row."""
        cols = [col for col in source_columns if col.startswith(prefix)]
        # dtype=object keeps each cell's original value so str() renders it
        # the same way regardless of the other columns' dtypes.
        rows = df[cols].to_numpy(dtype=object)
        combined = [
            ', '.join(str(value) for value in row if pd.notna(value))
            for row in rows
        ]
        return pd.Series(combined, index=df.index)

    df['uses'] = combine_columns('use')
    df['substitutes'] = combine_columns('substitute')
    df['side_effects'] = combine_columns('sideEffect')

    # Normalise free text: lower-case and drop everything that is neither a
    # word character nor whitespace.  Raw string avoids the invalid-escape
    # warning that '\w' triggers in a normal string literal.
    text_columns = ['name', 'uses', 'Chemical Class', 'Therapeutic Class']
    for col in text_columns:
        df[col] = df[col].str.lower().str.replace(r'[^\w\s]', '', regex=True)

    # Return an independent copy so callers can add columns without
    # triggering pandas' chained-assignment warnings.
    return df[['id', 'name', 'uses', 'substitutes', 'side_effects',
               'Habit Forming', 'Therapeutic Class', 'Action Class']].copy()
| # ---------------------- | |
| # Embedding & FAISS Setup | |
| # ---------------------- | |
def setup_faiss(df):
    """Embed every row's ``uses`` text and index the vectors with FAISS.

    Args:
        df: DataFrame with a ``uses`` column holding the text to embed.

    Returns:
        Tuple ``(model, index)``: the loaded ``all-MiniLM-L6-v2``
        SentenceTransformer and a flat L2 FAISS index containing one
        vector per row of ``df``, added in row order.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    use_texts = df['uses'].tolist()
    vectors = encoder.encode(use_texts, show_progress_bar=True)

    # The index width is taken from the embedding matrix's second axis.
    l2_index = faiss.IndexFlatL2(vectors.shape[1])
    l2_index.add(vectors)
    return encoder, l2_index
| # ---------------------- | |
| # Spelling Correction | |
| # ---------------------- | |
def setup_spell_checker():
    """Create a SymSpell instance loaded with the English frequency dictionary.

    The dictionary is looked up first in the current working directory
    (the original behaviour) and, failing that, inside the installed
    ``symspellpy`` package, which ships the same file.
    ``SymSpell.load_dictionary`` reports a missing file by returning
    ``False`` rather than raising, so the result is checked explicitly.

    Returns:
        The ``SymSpell`` instance.  If no dictionary could be loaded a
        ``RuntimeWarning`` is emitted and the instance is returned empty,
        in which case lookups yield no corrections.
    """
    import os
    import warnings

    import symspellpy

    dictionary_name = 'frequency_dictionary_en_82_765.txt'
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

    candidate_paths = (
        dictionary_name,
        os.path.join(os.path.dirname(symspellpy.__file__), dictionary_name),
    )
    loaded = any(
        sym_spell.load_dictionary(path, term_index=0, count_index=1)
        for path in candidate_paths
    )
    if not loaded:
        # Keep the app usable without spell correction instead of crashing.
        warnings.warn(
            f"Spelling dictionary '{dictionary_name}' not found; "
            "spell correction is disabled.",
            RuntimeWarning,
        )
    return sym_spell
| # ---------------------- | |
| # Severity Analysis | |
| # ---------------------- | |
# Weight assigned to each recognised side effect; effects that are not
# listed contribute 0 to the score.
SEVERITY_RANK = {
    'vomiting': 3, 'nausea': 3, 'diarrhea': 3,
    'dizziness': 2, 'headache': 2, 'palpitations': 2,
    'rash': 1, 'itching': 1, 'fatigue': 1,
}


def severity_score(side_effects):
    """Return the summed severity weight of a comma-separated effect list.

    Each entry is stripped and lower-cased before being looked up in
    ``SEVERITY_RANK``; unknown or blank entries count as 0.

    Args:
        side_effects: Comma-separated side-effect names.  Non-string
            values (``None``, NaN from a DataFrame cell) are treated as
            "no side effects".

    Returns:
        The integer sum of the matched weights (0 when nothing matches).
    """
    if not isinstance(side_effects, str):
        # Missing cells arrive as None/NaN; they have nothing to split.
        return 0
    return sum(
        SEVERITY_RANK.get(effect.strip().lower(), 0)
        for effect in side_effects.split(',')
    )
| # ---------------------- | |
| # Drug Comparison | |
| # ---------------------- | |
def compare_drugs(df, drug1, drug2):
    """Build a side-by-side comparison table for two drugs.

    Names are matched case-insensitively against ``df['name']``; when a
    name matches several rows the first one is used.

    Args:
        df: DataFrame with ``name``, ``uses``, ``substitutes``,
            ``side_effects`` and ``Therapeutic Class`` columns.
        drug1: Name of the first drug; also used as its column label.
        drug2: Name of the second drug; also used as its column label.
            If it is identical to ``drug1`` the second column is labelled
            ``"<name> (2)"`` so that it is not silently overwritten.

    Returns:
        A DataFrame with an ``Attribute`` column plus one column per drug,
        or an empty DataFrame when either drug is missing or not a string.
    """
    if not isinstance(drug1, str) or not isinstance(drug2, str):
        return pd.DataFrame()

    # Lower-case the name column once and reuse it for both lookups.
    names = df['name'].str.lower()
    matches1 = df[names == drug1.lower()]
    matches2 = df[names == drug2.lower()]
    if matches1.empty or matches2.empty:
        return pd.DataFrame()

    d1, d2 = matches1.iloc[0], matches2.iloc[0]
    fields = ['uses', 'substitutes', 'side_effects', 'Therapeutic Class']

    # Dict keys must be unique: comparing a drug with itself would
    # otherwise collapse both columns into one.
    label2 = drug2 if drug2 != drug1 else f"{drug2} (2)"
    return pd.DataFrame({
        'Attribute': ['Uses', 'Substitutes', 'Side Effects', 'Therapeutic Class'],
        drug1: [d1[field] for field in fields],
        label2: [d2[field] for field in fields],
    })
| # ---------------------- | |
| # Streamlit App | |
| # ---------------------- | |
@st.cache_resource
def _load_resources(file_path='medicine_dataset.csv'):
    """Load the dataset, embedding model, FAISS index and spell checker once.

    Streamlit re-executes the script on every widget interaction; caching
    avoids re-reading the CSV and re-embedding every row each time.  The
    returned objects are shared between reruns and must not be mutated.
    """
    df = preprocess_data(file_path)
    model, faiss_index = setup_faiss(df)
    return df, model, faiss_index, setup_spell_checker()


def _correct_spelling(sym_spell, query):
    """Spell-correct ``query`` word by word, keeping unknown words as typed.

    ``SymSpell.lookup`` works on a single term, so the query is split on
    whitespace; ``include_unknown=True`` returns the word itself when no
    dictionary entry is within the edit distance.
    """
    corrected_words = []
    for word in query.split():
        suggestions = sym_spell.lookup(
            word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True
        )
        corrected_words.append(suggestions[0].term if suggestions else word)
    return ' '.join(corrected_words)


def _search_medicines(df, model, faiss_index, query, therapeutic_class, top_k=5):
    """Return up to ``top_k`` nearest medicines, sorted by ascending severity.

    When a therapeutic-class filter is active, more neighbours are fetched
    so that filtering does not leave the result list needlessly empty.
    """
    filtering = therapeutic_class != 'All'
    # Never ask FAISS for more neighbours than there are rows: it pads the
    # result with -1, which ``iloc`` would read as "last row".
    k = min(len(df), top_k * 10 if filtering else top_k)
    if k == 0:
        return df.iloc[0:0].assign(severity=[])

    query_embedding = model.encode([query])
    _, neighbours = faiss_index.search(query_embedding, k=k)
    positions = [int(i) for i in neighbours[0] if i >= 0]

    results = df.iloc[positions]
    if filtering:
        results = results[results['Therapeutic Class'] == therapeutic_class]
    # Copy before adding a column so the shared, cached frame is untouched.
    results = results.head(top_k).copy()
    results['severity'] = results['side_effects'].apply(severity_score)
    return results.sort_values('severity', ascending=True)


def main():
    """Render the Streamlit UI: semantic medicine search and drug comparison."""
    st.title("🧬 MedSearch NLP: Medicine Recommender System")

    # Load data and models (cached across reruns)
    df, model, faiss_index, sym_spell = _load_resources()

    # User input section
    query = st.text_input("Describe your symptoms or medical need:")
    therapeutic_class = st.selectbox(
        "Filter by Therapeutic Class (optional):",
        ['All'] + sorted(df['Therapeutic Class'].dropna().unique().tolist())
    )

    # Process query and show results
    if query and query.strip():
        # Spelling correction; only announce it when something changed.
        corrected = _correct_spelling(sym_spell, query)
        if corrected.lower() != ' '.join(query.lower().split()):
            st.info(f"Did you mean: '{corrected}'?")
        query = corrected

        results = _search_medicines(df, model, faiss_index, query, therapeutic_class)

        # Display results
        st.subheader("Recommended Medicines")
        if results.empty:
            st.info("No matching medicines found for this query and filter.")
        for _, row in results.iterrows():
            with st.expander(f"💊 {row['name']} (Severity: {row['severity']})"):
                cols = st.columns(3)
                cols[0].write(f"**Uses:** {row['uses']}")
                cols[1].write(f"**Substitutes:** {row['substitutes']}")
                cols[2].write(f"**Side Effects:** {row['side_effects']}")
                cols2 = st.columns(2)
                cols2[0].write(f"Therapeutic Class: {row['Therapeutic Class']}")
                cols2[1].write(f"Habit Forming: {row['Habit Forming']}")

    # Drug comparison section
    st.subheader("🔍 Drug Comparison Tool")
    # Drop missing names: NaN is not a selectable drug and cannot be compared.
    drug_list = df['name'].dropna().unique().tolist()
    if not drug_list:
        # selectbox(index=0) is invalid for an empty option list.
        st.warning("No drugs available for comparison")
        return

    col1, col2 = st.columns(2)
    with col1:
        drug1 = st.selectbox("Select first drug:", drug_list, index=0)
    with col2:
        drug2 = st.selectbox("Select second drug:", drug_list,
                             index=1 if len(drug_list) > 1 else 0)

    comparison_df = compare_drugs(df, drug1, drug2)
    if not comparison_df.empty:
        st.table(comparison_df.style.set_properties(**{
            'white-space': 'pre-wrap',
            'text-align': 'left'
        }))
    else:
        st.warning("One or both selected drugs not found in database")


if __name__ == "__main__":
    main()