# Assignment03 / app.py — Streamlit clinical text analysis app
# (Hugging Face Space by ZainabEman; last update commit 677ed4f, "Update app.py")
import streamlit as st
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack
# Global page configuration: wide layout and the browser-tab title for the app.
st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analysis")
# ======================================
# Data Loading and Preprocessing
# ======================================
def load_data():
    """Return the hard-coded 12-record clinical dataset as a DataFrame.

    The list-valued fields ("Risk Factors", "Symptoms", "Signs") are stored as
    *string* representations of Python lists; downstream code parses them with
    ``ast.literal_eval``.
    """
    columns = ["id", "Risk Factors", "Symptoms", "Signs", "Disease"]
    rows = [
        (1, "['smoking', 'obesity']", "['chest pain', 'shortness of breath']",
         "['elevated blood pressure']", "Cardiovascular"),
        (2, "['alcohol']", "['confusion', 'headache']",
         "['abnormal brain imaging']", "Neurological"),
        (3, "['smoking']", "['cough', 'wheezing']",
         "['reduced lung capacity']", "Respiratory"),
        (4, "['unhealthy diet']", "['abdominal pain']",
         "['elevated liver enzymes']", "Gastrointestinal"),
        (5, "['genetics']", "['dizziness']",
         "['irregular heartbeat']", "Cardiovascular"),
        (6, "['stress']", "['anxiety', 'fatigue']",
         "['high cortisol']", "Neurological"),
        (7, "['smoking', 'exposure to pollutants']", "['persistent cough']",
         "['inflamed airways']", "Respiratory"),
        (8, "['poor diet']", "['nausea']",
         "['abnormal digestion']", "Gastrointestinal"),
        (9, "['smoking']", "['chest pain']",
         "['elevated blood pressure']", "Cardiovascular"),
        (10, "['alcohol']", "['memory loss']",
         "['brain atrophy']", "Neurological"),
        (11, "['smoking']", "['shortness of breath']",
         "['reduced lung function']", "Respiratory"),
        (12, "['unhealthy diet']", "['diarrhea']",
         "['dehydration']", "Gastrointestinal"),
    ]
    return pd.DataFrame(rows, columns=columns)
def preprocess_text_columns(df):
    """Return a copy of *df* with a ``<col>_combined`` text column per list column.

    For each of "Risk Factors", "Symptoms" and "Signs", the stored Python-list
    literal (e.g. ``"['a', 'b']"``) is parsed with ``ast.literal_eval`` and its
    items are joined into one whitespace-separated string suitable for
    scikit-learn text vectorizers. Null entries map to the empty string.

    Fix: the original mutated the caller's DataFrame in place; all callers use
    the return value, so working on a copy is a safe, backward-compatible fix.
    """
    df = df.copy()  # do not mutate the caller's frame
    for col in ("Risk Factors", "Symptoms", "Signs"):
        df[col + '_combined'] = df[col].apply(
            lambda x: " ".join(ast.literal_eval(x)) if pd.notnull(x) else ""
        )
    return df
def vectorize_columns(df):
    """Vectorize the three ``*_combined`` text columns two ways.

    Fits a fresh ``TfidfVectorizer`` and a binary ``CountVectorizer`` per
    column, then horizontally stacks the per-column matrices.

    Returns a 4-tuple:
        (stacked TF-IDF sparse matrix,
         stacked binary one-hot sparse matrix,
         {column: TF-IDF feature names},
         {column: one-hot feature names})
    """
    tfidf_parts, onehot_parts = [], []
    tfidf_vocab_by_col, onehot_vocab_by_col = {}, {}
    for column in ("Risk Factors", "Symptoms", "Signs"):
        corpus = df[column + '_combined']

        tfidf = TfidfVectorizer()
        tfidf_parts.append(tfidf.fit_transform(corpus))
        tfidf_vocab_by_col[column] = tfidf.get_feature_names_out()

        onehot = CountVectorizer(binary=True)
        onehot_parts.append(onehot.fit_transform(corpus))
        onehot_vocab_by_col[column] = onehot.get_feature_names_out()
    return (hstack(tfidf_parts), hstack(onehot_parts),
            tfidf_vocab_by_col, onehot_vocab_by_col)
# ======================================
# Task 1
# ======================================
def task1_feature_extraction():
    """Render Task 1: dataset preview, both feature matrices, and their stats."""
    st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")
    df = preprocess_text_columns(load_data())
    st.dataframe(df[["id", "Risk Factors", "Symptoms", "Signs", "Disease"]])

    tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs = vectorize_columns(df)

    st.write("### TF-IDF Combined Matrix")
    st.dataframe(pd.DataFrame(tfidf_matrix.toarray()))
    st.write("### One-Hot Combined Matrix")
    st.dataframe(pd.DataFrame(onehot_matrix.toarray()))

    st.subheader("Matrix Statistics:")
    # Sparsity = share of zero entries in the sparse matrix, as a percentage.
    for name, matrix in (("TF-IDF", tfidf_matrix), ("One-Hot", onehot_matrix)):
        sparsity = 100 * (1 - matrix.nnz / (matrix.shape[0] * matrix.shape[1]))
        st.write(f"**{name} Shape:** {matrix.shape}, **Sparsity:** {sparsity:.2f}%")

    st.write("**Total Unique TF-IDF Features:**", sum(len(v) for v in tfidf_vocabs.values()))
    st.write("**Total Unique One-Hot Features:**", sum(len(v) for v in onehot_vocabs.values()))
# ======================================
# Task 2
# ======================================
def task2_dimensionality_reduction():
    """Render Task 2: 2-D SVD projections of both matrices plus variance ratios."""
    st.header("Task 2: Dimensionality Reduction and Visualization")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
    labels = df["Disease"]

    def _project_and_plot(matrix, title):
        # Reduce to two SVD components and scatter-plot one series per disease;
        # returns the fitted SVD so its explained variance can be reported.
        svd = TruncatedSVD(n_components=2, random_state=42)
        points = svd.fit_transform(matrix)
        fig, ax = plt.subplots()
        for disease in labels.unique():
            mask = labels == disease
            ax.scatter(points[mask, 0], points[mask, 1], label=disease, s=80)
        ax.set_title(title)
        ax.legend()
        st.pyplot(fig)
        return svd

    svd_tfidf = _project_and_plot(tfidf_matrix, "TF-IDF 2D Projection")
    svd_onehot = _project_and_plot(onehot_matrix, "One-Hot 2D Projection")

    st.write("**TF-IDF Explained Variance Ratio:**", svd_tfidf.explained_variance_ratio_)
    st.write("**One-Hot Explained Variance Ratio:**", svd_onehot.explained_variance_ratio_)
# ======================================
# Task 3
# ======================================
def evaluate_model(X, y, model, name):
    """Cross-validate *model* on (X, y) and render the mean of each metric.

    Uses 3-fold stratified CV (shuffled, fixed seed) and macro-averaged
    precision/recall/F1 with zero_division=0 to silence undefined-metric
    warnings on folds missing a class.
    """
    macro_kwargs = {'average': 'macro', 'zero_division': 0}
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, **macro_kwargs),
        'recall': make_scorer(recall_score, **macro_kwargs),
        'f1': make_scorer(f1_score, **macro_kwargs),
    }
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_validate(model, X, y, cv=folds, scoring=scoring)
    st.write(f"### {name}")
    for metric in scoring:
        st.write(f"**{metric.capitalize()}:** {scores[f'test_{metric}'].mean():.2f}")
def task3_classification():
    """Render Task 3: KNN (k=3,5,7, cosine) and logistic regression on TF-IDF."""
    st.header("Task 3: Classification with KNN and Logistic Regression")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, _, _, _ = vectorize_columns(df)
    labels = df["Disease"]

    st.subheader("KNN on TF-IDF")
    for neighbors in (3, 5, 7):
        knn = KNeighborsClassifier(n_neighbors=neighbors, metric='cosine')
        evaluate_model(tfidf_matrix, labels, knn, f"KNN (k={neighbors}, Cosine)")

    st.subheader("Logistic Regression on TF-IDF")
    evaluate_model(tfidf_matrix, labels,
                   LogisticRegression(max_iter=1000), "Logistic Regression")
# ======================================
# Sidebar Navigation
# ======================================
# Sidebar navigation: each radio label maps to the function that renders it.
_TASKS = {
    "Task 1: Feature Extraction": task1_feature_extraction,
    "Task 2: Dimensionality Reduction": task2_dimensionality_reduction,
    "Task 3: Classification": task3_classification,
}
task = st.sidebar.radio("Select Task", list(_TASKS))
_TASKS[task]()