# Spaces: Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import ast | |
| import matplotlib.pyplot as plt | |
| from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
| from sklearn.decomposition import TruncatedSVD | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import cross_validate, StratifiedKFold | |
| from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score | |
| from scipy.sparse import hstack | |
# Use the wide page layout and set the page title for the app.
st.set_page_config(
    page_title="Assignment 3 - Clinical Text Analysis",
    layout="wide",
)
# ======================================
# Data Loading and Preprocessing
# ======================================
def load_data():
    """Return the built-in sample of twelve clinical records as a DataFrame.

    Columns are ``id`` (int), ``Risk Factors``, ``Symptoms``, ``Signs`` and
    ``Disease``.  The three text columns hold a Python list literal encoded
    as a string (e.g. ``"['smoking', 'obesity']"``), not an actual list.
    """
    columns = ["id", "Risk Factors", "Symptoms", "Signs", "Disease"]
    records = [
        (1, "['smoking', 'obesity']",
         "['chest pain', 'shortness of breath']",
         "['elevated blood pressure']", "Cardiovascular"),
        (2, "['alcohol']",
         "['confusion', 'headache']",
         "['abnormal brain imaging']", "Neurological"),
        (3, "['smoking']",
         "['cough', 'wheezing']",
         "['reduced lung capacity']", "Respiratory"),
        (4, "['unhealthy diet']",
         "['abdominal pain']",
         "['elevated liver enzymes']", "Gastrointestinal"),
        (5, "['genetics']",
         "['dizziness']",
         "['irregular heartbeat']", "Cardiovascular"),
        (6, "['stress']",
         "['anxiety', 'fatigue']",
         "['high cortisol']", "Neurological"),
        (7, "['smoking', 'exposure to pollutants']",
         "['persistent cough']",
         "['inflamed airways']", "Respiratory"),
        (8, "['poor diet']",
         "['nausea']",
         "['abnormal digestion']", "Gastrointestinal"),
        (9, "['smoking']",
         "['chest pain']",
         "['elevated blood pressure']", "Cardiovascular"),
        (10, "['alcohol']",
         "['memory loss']",
         "['brain atrophy']", "Neurological"),
        (11, "['smoking']",
         "['shortness of breath']",
         "['reduced lung function']", "Respiratory"),
        (12, "['unhealthy diet']",
         "['diarrhea']",
         "['dehydration']", "Gastrointestinal"),
    ]
    return pd.DataFrame(records, columns=columns)
def _join_terms(cell):
    """Collapse one raw cell into a single space-separated string."""
    # Containers are handled first: pd.isna() on a list returns an array,
    # which cannot be used as an ``if`` condition.
    if isinstance(cell, (list, tuple, set)):
        return " ".join(str(term) for term in cell)
    if pd.isna(cell):
        return ""
    text = str(cell).strip()
    if not text:
        # ast.literal_eval("") raises SyntaxError; an empty cell has no terms.
        return ""
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Not a Python literal: keep the cell as plain free text.
        return text
    if isinstance(parsed, (list, tuple, set)):
        return " ".join(str(term) for term in parsed)
    # A bare scalar literal (e.g. "'chest pain'") is used as-is rather than
    # being joined character by character.
    return str(parsed)


def preprocess_text_columns(df):
    """Add a ``<column>_combined`` text column for each clinical text column.

    For ``Risk Factors``, ``Symptoms`` and ``Signs``, every cell is turned
    into one space-separated string of its terms.  Cells may be a string
    holding a Python list literal, an already-parsed list/tuple/set, plain
    text, an empty string, or a missing value (which becomes ``""``).

    Args:
        df: DataFrame containing the three text columns.

    Returns:
        The same DataFrame object, modified in place with the three new
        ``*_combined`` columns.
    """
    for col in ("Risk Factors", "Symptoms", "Signs"):
        df[col + "_combined"] = df[col].apply(_join_terms)
    return df
def vectorize_columns(df):
    """Vectorize each ``*_combined`` text column with TF-IDF and binary counts.

    A separate ``TfidfVectorizer`` and ``CountVectorizer(binary=True)`` is
    fitted per column (``Risk Factors``, ``Symptoms``, ``Signs``), and the
    per-column matrices are stacked side by side.

    Args:
        df: DataFrame that already has the three ``*_combined`` columns.

    Returns:
        A 4-tuple ``(tfidf, onehot, tfidf_vocabs, onehot_vocabs)``: the two
        horizontally stacked sparse matrices, followed by two dicts mapping
        each source column name to its fitted feature names.
    """
    tfidf_blocks = []
    onehot_blocks = []
    tfidf_vocabs = {}
    onehot_vocabs = {}

    for name in ("Risk Factors", "Symptoms", "Signs"):
        corpus = df[name + '_combined']

        weighted = TfidfVectorizer()
        tfidf_blocks.append(weighted.fit_transform(corpus))
        tfidf_vocabs[name] = weighted.get_feature_names_out()

        presence = CountVectorizer(binary=True)
        onehot_blocks.append(presence.fit_transform(corpus))
        onehot_vocabs[name] = presence.get_feature_names_out()

    tfidf_all = hstack(tfidf_blocks)
    onehot_all = hstack(onehot_blocks)
    return tfidf_all, onehot_all, tfidf_vocabs, onehot_vocabs
# ======================================
# Task 1
# ======================================
def task1_feature_extraction():
    """Render Task 1: show the data, both feature matrices and their stats.

    Displays the raw records, the dense form of the combined TF-IDF and
    one-hot matrices, each matrix's shape and sparsity percentage, and the
    summed per-column vocabulary sizes.
    """
    st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")

    records = preprocess_text_columns(load_data())
    st.dataframe(records[["id", "Risk Factors", "Symptoms", "Signs", "Disease"]])

    tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs = vectorize_columns(records)

    for heading, features in (
        ("### TF-IDF Combined Matrix", tfidf_matrix),
        ("### One-Hot Combined Matrix", onehot_matrix),
    ):
        st.write(heading)
        st.dataframe(pd.DataFrame(features.toarray()))

    def matrix_stats(matrix, name):
        # Sparsity is the percentage of cells that are not stored non-zeros.
        n_rows, n_cols = matrix.shape
        cell_count = n_rows * n_cols
        sparsity = 100 * (1 - matrix.nnz / cell_count)
        st.write(f"**{name} Shape:** {matrix.shape}, **Sparsity:** {sparsity:.2f}%")

    st.subheader("Matrix Statistics:")
    for features, label in ((tfidf_matrix, "TF-IDF"), (onehot_matrix, "One-Hot")):
        matrix_stats(features, label)

    tfidf_total = sum(map(len, tfidf_vocabs.values()))
    onehot_total = sum(map(len, onehot_vocabs.values()))
    st.write("**Total Unique TF-IDF Features:**", tfidf_total)
    st.write("**Total Unique One-Hot Features:**", onehot_total)
# ======================================
# Task 2
# ======================================
def _scatter_by_disease(points, labels, title):
    """Render one 2-D scatter plot with a separate series per disease label."""
    fig, ax = plt.subplots()
    try:
        for disease in labels.unique():
            # Boolean row mask selecting the records with this label.
            mask = (labels == disease).to_numpy()
            ax.scatter(points[mask, 0], points[mask, 1], label=disease, s=80)
        ax.set_title(title)
        ax.legend()
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates until it is closed; without
        # this, figures pile up each time the script is rerun.
        plt.close(fig)


def task2_dimensionality_reduction():
    """Render Task 2: project both feature matrices to 2-D and plot them.

    Each of the TF-IDF and one-hot matrices is reduced with its own
    ``TruncatedSVD(n_components=2, random_state=42)``, plotted as a scatter
    coloured by ``Disease``, and followed by the explained variance ratios.
    """
    st.header("Task 2: Dimensionality Reduction and Visualization")

    df = preprocess_text_columns(load_data())
    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)

    svd_tfidf = TruncatedSVD(n_components=2, random_state=42)
    tfidf_2d = svd_tfidf.fit_transform(tfidf_matrix)
    svd_onehot = TruncatedSVD(n_components=2, random_state=42)
    onehot_2d = svd_onehot.fit_transform(onehot_matrix)

    target = df["Disease"]
    _scatter_by_disease(tfidf_2d, target, "TF-IDF 2D Projection")
    _scatter_by_disease(onehot_2d, target, "One-Hot 2D Projection")

    st.write("**TF-IDF Explained Variance Ratio:**", svd_tfidf.explained_variance_ratio_)
    st.write("**One-Hot Explained Variance Ratio:**", svd_onehot.explained_variance_ratio_)
# ======================================
# Task 3
# ======================================
def evaluate_model(X, y, model, name):
    """Cross-validate ``model`` and write its mean scores to the page.

    Uses shuffled 3-fold stratified cross-validation (``random_state=42``)
    and reports accuracy plus macro-averaged precision, recall and F1
    (with ``zero_division=0``), each averaged over the folds.

    Args:
        X: Feature matrix passed to ``cross_validate``.
        y: Class labels.
        model: Estimator to evaluate.
        name: Heading shown above the scores.
    """
    macro = {"average": "macro", "zero_division": 0}
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, **macro),
        'recall': make_scorer(recall_score, **macro),
        'f1': make_scorer(f1_score, **macro),
    }

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    results = cross_validate(model, X, y, cv=folds, scoring=scoring)

    st.write(f"### {name}")
    for metric in scoring:
        fold_scores = results[f'test_{metric}']
        mean_score = np.mean(fold_scores)
        st.write(f"**{metric.capitalize()}:** {mean_score:.2f}")
def task3_classification():
    """Render Task 3: evaluate KNN and logistic regression on TF-IDF features.

    KNN is evaluated with cosine distance for k = 3, 5 and 7, followed by a
    logistic regression with ``max_iter=1000``; each model's scores are
    written by ``evaluate_model``.
    """
    st.header("Task 3: Classification with KNN and Logistic Regression")

    records = preprocess_text_columns(load_data())
    # Only the TF-IDF matrix is used here; the other outputs are discarded.
    features, _, _, _ = vectorize_columns(records)
    labels = records["Disease"]

    st.subheader("KNN on TF-IDF")
    for k in (3, 5, 7):
        knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
        evaluate_model(features, labels, knn, f"KNN (k={k}, Cosine)")

    st.subheader("Logistic Regression on TF-IDF")
    evaluate_model(
        features,
        labels,
        LogisticRegression(max_iter=1000),
        "Logistic Regression",
    )
# ======================================
# Sidebar Navigation
# ======================================
# Map each sidebar label to the function that renders that task's page.
task_pages = {
    "Task 1: Feature Extraction": task1_feature_extraction,
    "Task 2: Dimensionality Reduction": task2_dimensionality_reduction,
    "Task 3: Classification": task3_classification,
}

task = st.sidebar.radio("Select Task", list(task_pages))

# An unrecognised selection renders nothing.
render_page = task_pages.get(task)
if render_page is not None:
    render_page()