import streamlit as st
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack
# Page-level configuration must run before any other Streamlit UI call.
st.set_page_config(page_title="Assignment 3 - Clinical Text Analysis", layout="wide")
# ======================================
# Data Loading and Preprocessing
# ======================================
def load_data():
    """Return the hard-coded clinical dataset as a pandas DataFrame.

    Each record stores the list-valued fields (Risk Factors, Symptoms,
    Signs) as stringified Python lists; parsing happens later in
    preprocess_text_columns.
    """
    columns = ["id", "Risk Factors", "Symptoms", "Signs", "Disease"]
    rows = [
        (1, "['smoking', 'obesity']", "['chest pain', 'shortness of breath']",
         "['elevated blood pressure']", "Cardiovascular"),
        (2, "['alcohol']", "['confusion', 'headache']",
         "['abnormal brain imaging']", "Neurological"),
        (3, "['smoking']", "['cough', 'wheezing']",
         "['reduced lung capacity']", "Respiratory"),
        (4, "['unhealthy diet']", "['abdominal pain']",
         "['elevated liver enzymes']", "Gastrointestinal"),
        (5, "['genetics']", "['dizziness']",
         "['irregular heartbeat']", "Cardiovascular"),
        (6, "['stress']", "['anxiety', 'fatigue']",
         "['high cortisol']", "Neurological"),
        (7, "['smoking', 'exposure to pollutants']", "['persistent cough']",
         "['inflamed airways']", "Respiratory"),
        (8, "['poor diet']", "['nausea']",
         "['abnormal digestion']", "Gastrointestinal"),
        (9, "['smoking']", "['chest pain']",
         "['elevated blood pressure']", "Cardiovascular"),
        (10, "['alcohol']", "['memory loss']",
         "['brain atrophy']", "Neurological"),
        (11, "['smoking']", "['shortness of breath']",
         "['reduced lung function']", "Respiratory"),
        (12, "['unhealthy diet']", "['diarrhea']",
         "['dehydration']", "Gastrointestinal"),
    ]
    return pd.DataFrame(rows, columns=columns)
def preprocess_text_columns(df):
    """Add a '<col>_combined' text column for each stringified-list column.

    Each cell like "['a', 'b']" becomes the space-joined string "a b";
    null cells become the empty string. Mutates and returns *df*.
    """
    def _cell_to_text(cell):
        if pd.isnull(cell):
            return ""
        # Cells are stringified Python lists; literal_eval parses them safely.
        return " ".join(ast.literal_eval(cell))

    for column in ("Risk Factors", "Symptoms", "Signs"):
        df[f"{column}_combined"] = df[column].apply(_cell_to_text)
    return df
def vectorize_columns(df):
    """Vectorize each '<col>_combined' text column two ways.

    Builds a per-column TF-IDF matrix and a binary (one-hot) count matrix,
    then horizontally stacks each family into one sparse matrix.

    Returns:
        (tfidf_stacked, onehot_stacked, tfidf_vocab, onehot_vocab) where the
        vocab dicts map column name -> feature-name array.
    """
    tfidf_parts, onehot_parts = [], []
    tfidf_vocab, onehot_vocab = {}, {}
    for column in ("Risk Factors", "Symptoms", "Signs"):
        texts = df[column + '_combined']

        tf_vec = TfidfVectorizer()
        tfidf_parts.append(tf_vec.fit_transform(texts))
        tfidf_vocab[column] = tf_vec.get_feature_names_out()

        # binary=True yields presence/absence (one-hot style) features.
        bin_vec = CountVectorizer(binary=True)
        onehot_parts.append(bin_vec.fit_transform(texts))
        onehot_vocab[column] = bin_vec.get_feature_names_out()
    return hstack(tfidf_parts), hstack(onehot_parts), tfidf_vocab, onehot_vocab
# ======================================
# Task 1
# ======================================
def task1_feature_extraction():
    """Render Task 1: TF-IDF vs. one-hot feature matrices and their stats."""
    st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")
    df = preprocess_text_columns(load_data())
    st.dataframe(df[["id", "Risk Factors", "Symptoms", "Signs", "Disease"]])
    tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs = vectorize_columns(df)

    st.write("### TF-IDF Combined Matrix")
    st.dataframe(pd.DataFrame(tfidf_matrix.toarray()))
    st.write("### One-Hot Combined Matrix")
    st.dataframe(pd.DataFrame(onehot_matrix.toarray()))

    def _show_stats(matrix, label):
        # Sparsity = percentage of zero entries in the sparse matrix.
        cell_count = matrix.shape[0] * matrix.shape[1]
        sparsity_pct = 100 * (1 - matrix.nnz / cell_count)
        st.write(f"**{label} Shape:** {matrix.shape}, **Sparsity:** {sparsity_pct:.2f}%")

    st.subheader("Matrix Statistics:")
    _show_stats(tfidf_matrix, "TF-IDF")
    _show_stats(onehot_matrix, "One-Hot")
    st.write("**Total Unique TF-IDF Features:**", sum(map(len, tfidf_vocabs.values())))
    st.write("**Total Unique One-Hot Features:**", sum(map(len, onehot_vocabs.values())))
# ======================================
# Task 2
# ======================================
def task2_dimensionality_reduction():
    """Render Task 2: 2-D SVD projections of both feature matrices."""
    st.header("Task 2: Dimensionality Reduction and Visualization")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
    labels = df["Disease"]

    def _project(matrix):
        # Truncated SVD works directly on sparse matrices (unlike PCA).
        reducer = TruncatedSVD(n_components=2, random_state=42)
        return reducer.fit_transform(matrix), reducer

    def _scatter(points, title):
        fig, ax = plt.subplots()
        for disease_name in labels.unique():
            mask = labels == disease_name
            ax.scatter(points[mask, 0], points[mask, 1], label=disease_name, s=80)
        ax.set_title(title)
        ax.legend()
        st.pyplot(fig)

    tfidf_2d, svd_tfidf = _project(tfidf_matrix)
    onehot_2d, svd_onehot = _project(onehot_matrix)
    _scatter(tfidf_2d, "TF-IDF 2D Projection")
    _scatter(onehot_2d, "One-Hot 2D Projection")
    st.write("**TF-IDF Explained Variance Ratio:**", svd_tfidf.explained_variance_ratio_)
    st.write("**One-Hot Explained Variance Ratio:**", svd_onehot.explained_variance_ratio_)
# ======================================
# Task 3
# ======================================
def evaluate_model(X, y, model, name):
    """Cross-validate *model* on (X, y) and render macro-averaged metrics.

    Uses stratified 3-fold CV so every fold keeps the class balance;
    zero_division=0 avoids warnings when a fold misses a class entirely.
    """
    metric_scorers = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro', zero_division=0),
        'recall': make_scorer(recall_score, average='macro', zero_division=0),
        'f1': make_scorer(f1_score, average='macro', zero_division=0),
    }
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_scores = cross_validate(model, X, y, cv=splitter, scoring=metric_scorers)
    st.write(f"### {name}")
    for metric in metric_scorers:
        st.write(f"**{metric.capitalize()}:** {np.mean(fold_scores[f'test_{metric}']):.2f}")
def task3_classification():
    """Render Task 3: KNN (several k) and logistic regression on TF-IDF."""
    st.header("Task 3: Classification with KNN and Logistic Regression")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, _onehot_matrix, _, _ = vectorize_columns(df)
    labels = df["Disease"]

    st.subheader("KNN on TF-IDF")
    # Cosine distance suits sparse TF-IDF vectors better than Euclidean.
    for k in (3, 5, 7):
        knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
        evaluate_model(tfidf_matrix, labels, knn, f"KNN (k={k}, Cosine)")

    st.subheader("Logistic Regression on TF-IDF")
    evaluate_model(tfidf_matrix, labels,
                   LogisticRegression(max_iter=1000), "Logistic Regression")
# ======================================
# Sidebar Navigation
# ======================================
# Map each sidebar label to its renderer; the radio always returns one key.
_TASKS = {
    "Task 1: Feature Extraction": task1_feature_extraction,
    "Task 2: Dimensionality Reduction": task2_dimensionality_reduction,
    "Task 3: Classification": task3_classification,
}
task = st.sidebar.radio("Select Task", list(_TASKS))
_TASKS[task]()