"""Streamlit app: Assignment 3 - Clinical Text Analysis.

Three tasks over a small hard-coded clinical dataset:
  1. TF-IDF vs. one-hot (binary count) feature extraction and sparsity stats.
  2. TruncatedSVD 2-D projection of both feature spaces, colored by disease.
  3. KNN (cosine) and logistic-regression classification with stratified CV.
"""

import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.neighbors import KNeighborsClassifier

st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analysis")


# ======================================
# Data Loading and Preprocessing
# ======================================
def load_data():
    """Return the hard-coded 12-record clinical dataset as a DataFrame.

    Each record stores "Risk Factors", "Symptoms" and "Signs" as *string
    representations* of Python lists (parsed later with ast.literal_eval)
    plus a "Disease" class label (4 classes, 3 records each).
    """
    data = [
        {"id": 1, "Risk Factors": "['smoking', 'obesity']", "Symptoms": "['chest pain', 'shortness of breath']", "Signs": "['elevated blood pressure']", "Disease": "Cardiovascular"},
        {"id": 2, "Risk Factors": "['alcohol']", "Symptoms": "['confusion', 'headache']", "Signs": "['abnormal brain imaging']", "Disease": "Neurological"},
        {"id": 3, "Risk Factors": "['smoking']", "Symptoms": "['cough', 'wheezing']", "Signs": "['reduced lung capacity']", "Disease": "Respiratory"},
        {"id": 4, "Risk Factors": "['unhealthy diet']", "Symptoms": "['abdominal pain']", "Signs": "['elevated liver enzymes']", "Disease": "Gastrointestinal"},
        {"id": 5, "Risk Factors": "['genetics']", "Symptoms": "['dizziness']", "Signs": "['irregular heartbeat']", "Disease": "Cardiovascular"},
        {"id": 6, "Risk Factors": "['stress']", "Symptoms": "['anxiety', 'fatigue']", "Signs": "['high cortisol']", "Disease": "Neurological"},
        {"id": 7, "Risk Factors": "['smoking', 'exposure to pollutants']", "Symptoms": "['persistent cough']", "Signs": "['inflamed airways']", "Disease": "Respiratory"},
        {"id": 8, "Risk Factors": "['poor diet']", "Symptoms": "['nausea']", "Signs": "['abnormal digestion']", "Disease": "Gastrointestinal"},
        {"id": 9, "Risk Factors": "['smoking']", "Symptoms": "['chest pain']", "Signs": "['elevated blood pressure']", "Disease": "Cardiovascular"},
        {"id": 10, "Risk Factors": "['alcohol']", "Symptoms": "['memory loss']", "Signs": "['brain atrophy']", "Disease": "Neurological"},
        {"id": 11, "Risk Factors": "['smoking']", "Symptoms": "['shortness of breath']", "Signs": "['reduced lung function']", "Disease": "Respiratory"},
        {"id": 12, "Risk Factors": "['unhealthy diet']", "Symptoms": "['diarrhea']", "Signs": "['dehydration']", "Disease": "Gastrointestinal"},
    ]
    return pd.DataFrame(data)


def preprocess_text_columns(df):
    """Add '<col>_combined' columns: each list-string parsed and space-joined.

    Works on a copy so the caller's DataFrame is not mutated; NaN cells
    become empty strings. Returns the augmented DataFrame.
    """
    df = df.copy()
    for col in ["Risk Factors", "Symptoms", "Signs"]:
        df[col + '_combined'] = df[col].apply(
            # literal_eval turns "['a', 'b']" back into a list; join for vectorizers
            lambda x: " ".join(ast.literal_eval(x)) if pd.notnull(x) else ""
        )
    return df


def vectorize_columns(df):
    """Vectorize the three combined text columns with TF-IDF and binary counts.

    Each column gets its own vectorizer (so vocabularies stay per-column);
    the per-column matrices are horizontally stacked into one sparse matrix
    per scheme.

    Returns:
        (tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs) where the
        vocab dicts map column name -> array of feature names.
    """
    cols = ["Risk Factors", "Symptoms", "Signs"]
    tfidf_matrices, onehot_matrices = [], []
    tfidf_vocabs, onehot_vocabs = {}, {}
    for col in cols:
        text_data = df[col + '_combined']

        tfidf_vec = TfidfVectorizer()
        tfidf_matrix = tfidf_vec.fit_transform(text_data)
        tfidf_matrices.append(tfidf_matrix)
        tfidf_vocabs[col] = tfidf_vec.get_feature_names_out()

        # binary=True makes CountVectorizer emit 0/1 presence = one-hot encoding
        count_vec = CountVectorizer(binary=True)
        onehot_matrix = count_vec.fit_transform(text_data)
        onehot_matrices.append(onehot_matrix)
        onehot_vocabs[col] = count_vec.get_feature_names_out()

    return hstack(tfidf_matrices), hstack(onehot_matrices), tfidf_vocabs, onehot_vocabs


# ======================================
# Task 1
# ======================================
def task1_feature_extraction():
    """Render Task 1: show raw data, both feature matrices, and sparsity stats."""
    st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")
    df = preprocess_text_columns(load_data())
    st.dataframe(df[["id", "Risk Factors", "Symptoms", "Signs", "Disease"]])

    tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs = vectorize_columns(df)

    st.write("### TF-IDF Combined Matrix")
    st.dataframe(pd.DataFrame(tfidf_matrix.toarray()))
    st.write("### One-Hot Combined Matrix")
    st.dataframe(pd.DataFrame(onehot_matrix.toarray()))

    def matrix_stats(matrix, name):
        # Sparsity = percentage of zero entries in the sparse matrix.
        total_elements = matrix.shape[0] * matrix.shape[1]
        nonzero = matrix.nnz
        sparsity = 100 * (1 - nonzero / total_elements)
        st.write(f"**{name} Shape:** {matrix.shape}, **Sparsity:** {sparsity:.2f}%")

    st.subheader("Matrix Statistics:")
    matrix_stats(tfidf_matrix, "TF-IDF")
    matrix_stats(onehot_matrix, "One-Hot")
    st.write("**Total Unique TF-IDF Features:**", sum(len(v) for v in tfidf_vocabs.values()))
    st.write("**Total Unique One-Hot Features:**", sum(len(v) for v in onehot_vocabs.values()))


# ======================================
# Task 2
# ======================================
def task2_dimensionality_reduction():
    """Render Task 2: SVD 2-D scatter plots of both feature spaces by disease."""
    st.header("Task 2: Dimensionality Reduction and Visualization")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)

    # TruncatedSVD works directly on sparse matrices (unlike PCA).
    svd_tfidf = TruncatedSVD(n_components=2, random_state=42)
    tfidf_2d = svd_tfidf.fit_transform(tfidf_matrix)
    svd_onehot = TruncatedSVD(n_components=2, random_state=42)
    onehot_2d = svd_onehot.fit_transform(onehot_matrix)

    target = df["Disease"]
    diseases = target.unique()

    fig1, ax1 = plt.subplots()
    for disease in diseases:
        idx = target == disease
        ax1.scatter(tfidf_2d[idx, 0], tfidf_2d[idx, 1], label=disease, s=80)
    ax1.set_title("TF-IDF 2D Projection")
    ax1.legend()
    st.pyplot(fig1)

    fig2, ax2 = plt.subplots()
    for disease in diseases:
        idx = target == disease
        ax2.scatter(onehot_2d[idx, 0], onehot_2d[idx, 1], label=disease, s=80)
    ax2.set_title("One-Hot 2D Projection")
    ax2.legend()
    st.pyplot(fig2)

    st.write("**TF-IDF Explained Variance Ratio:**", svd_tfidf.explained_variance_ratio_)
    st.write("**One-Hot Explained Variance Ratio:**", svd_onehot.explained_variance_ratio_)


# ======================================
# Task 3
# ======================================
def evaluate_model(X, y, model, name):
    """Cross-validate `model` on (X, y) and render mean CV metrics.

    Uses 3-fold stratified CV (the dataset has exactly 3 samples per class,
    so each fold keeps one sample of every class). Macro averaging with
    zero_division=0 avoids warnings when a fold misses a class prediction.
    """
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro', zero_division=0),
        'recall': make_scorer(recall_score, average='macro', zero_division=0),
        'f1': make_scorer(f1_score, average='macro', zero_division=0),
    }
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    results = cross_validate(model, X, y, cv=cv, scoring=scoring)
    st.write(f"### {name}")
    for metric in scoring:
        st.write(f"**{metric.capitalize()}:** {np.mean(results[f'test_{metric}']):.2f}")


def task3_classification():
    """Render Task 3: KNN (several k, cosine metric) and logistic regression on TF-IDF."""
    st.header("Task 3: Classification with KNN and Logistic Regression")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
    y = df["Disease"]

    st.subheader("KNN on TF-IDF")
    for k in [3, 5, 7]:
        # Cosine distance suits TF-IDF vectors (direction matters, not magnitude).
        model = KNeighborsClassifier(n_neighbors=k, metric='cosine')
        evaluate_model(tfidf_matrix, y, model, f"KNN (k={k}, Cosine)")

    st.subheader("Logistic Regression on TF-IDF")
    logreg = LogisticRegression(max_iter=1000)
    evaluate_model(tfidf_matrix, y, logreg, "Logistic Regression")


# ======================================
# Sidebar Navigation
# ======================================
task = st.sidebar.radio(
    "Select Task",
    ["Task 1: Feature Extraction", "Task 2: Dimensionality Reduction", "Task 3: Classification"],
)

if task == "Task 1: Feature Extraction":
    task1_feature_extraction()
elif task == "Task 2: Dimensionality Reduction":
    task2_dimensionality_reduction()
elif task == "Task 3: Classification":
    task3_classification()