File size: 8,052 Bytes
728c9c8
 
 
 
 
 
 
 
 
 
 
677ed4f
728c9c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677ed4f
 
 
728c9c8
 
677ed4f
728c9c8
 
 
 
677ed4f
728c9c8
 
 
 
677ed4f
 
728c9c8
 
677ed4f
728c9c8
 
 
677ed4f
728c9c8
677ed4f
728c9c8
677ed4f
728c9c8
 
 
 
677ed4f
728c9c8
 
677ed4f
728c9c8
677ed4f
 
728c9c8
 
 
677ed4f
 
 
728c9c8
 
677ed4f
728c9c8
 
677ed4f
 
728c9c8
677ed4f
728c9c8
 
677ed4f
728c9c8
 
677ed4f
 
728c9c8
677ed4f
728c9c8
 
 
677ed4f
728c9c8
 
 
677ed4f
728c9c8
 
 
677ed4f
728c9c8
 
 
677ed4f
 
 
728c9c8
 
677ed4f
728c9c8
677ed4f
728c9c8
677ed4f
 
 
 
728c9c8
677ed4f
 
 
 
 
728c9c8
677ed4f
 
 
 
 
 
 
 
 
 
 
 
 
 
728c9c8
 
677ed4f
728c9c8
677ed4f
728c9c8
677ed4f
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import streamlit as st
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack

# Configure the Streamlit page once, before any other st.* call renders output.
st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analysis")

# ======================================
# Data Loading and Preprocessing
# ======================================
def load_data():
    """Build the toy clinical dataset used throughout the app.

    Returns
    -------
    pd.DataFrame
        12 rows with columns ``id``, ``Risk Factors``, ``Symptoms``,
        ``Signs`` and ``Disease``. The three middle columns hold
        *stringified* Python lists (e.g. ``"['smoking']"``) that are parsed
        later by ``preprocess_text_columns``; ``Disease`` is the label.
    """
    columns = ["id", "Risk Factors", "Symptoms", "Signs", "Disease"]
    rows = [
        (1, "['smoking', 'obesity']", "['chest pain', 'shortness of breath']",
         "['elevated blood pressure']", "Cardiovascular"),
        (2, "['alcohol']", "['confusion', 'headache']",
         "['abnormal brain imaging']", "Neurological"),
        (3, "['smoking']", "['cough', 'wheezing']",
         "['reduced lung capacity']", "Respiratory"),
        (4, "['unhealthy diet']", "['abdominal pain']",
         "['elevated liver enzymes']", "Gastrointestinal"),
        (5, "['genetics']", "['dizziness']",
         "['irregular heartbeat']", "Cardiovascular"),
        (6, "['stress']", "['anxiety', 'fatigue']",
         "['high cortisol']", "Neurological"),
        (7, "['smoking', 'exposure to pollutants']", "['persistent cough']",
         "['inflamed airways']", "Respiratory"),
        (8, "['poor diet']", "['nausea']",
         "['abnormal digestion']", "Gastrointestinal"),
        (9, "['smoking']", "['chest pain']",
         "['elevated blood pressure']", "Cardiovascular"),
        (10, "['alcohol']", "['memory loss']",
         "['brain atrophy']", "Neurological"),
        (11, "['smoking']", "['shortness of breath']",
         "['reduced lung function']", "Respiratory"),
        (12, "['unhealthy diet']", "['diarrhea']",
         "['dehydration']", "Gastrointestinal"),
    ]
    return pd.DataFrame(rows, columns=columns)

def preprocess_text_columns(df):
    """Parse the stringified list columns into space-joined text columns.

    For each of ``Risk Factors``, ``Symptoms`` and ``Signs`` (cells like
    ``"['a', 'b']"``), adds a ``<col>_combined`` column containing the
    parsed items joined by single spaces (``"a b"``). Null cells become
    the empty string.

    Fix: operates on a copy so the caller's DataFrame is no longer mutated
    in place (the original appended columns to the argument itself).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the three list-string columns named above.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the three ``*_combined`` columns added.

    Raises
    ------
    ValueError / SyntaxError
        Propagated from ``ast.literal_eval`` if a cell is not a valid
        Python literal.
    """
    df = df.copy()  # don't mutate the caller's frame
    for col in ["Risk Factors", "Symptoms", "Signs"]:
        df[col + '_combined'] = df[col].apply(
            lambda x: " ".join(ast.literal_eval(x)) if pd.notnull(x) else ""
        )
    return df

def vectorize_columns(df):
    """Vectorize the three ``*_combined`` text columns two ways.

    Each column is fitted with its own ``TfidfVectorizer`` and its own
    binary ``CountVectorizer`` (one-hot); the per-column sparse matrices
    are then stacked horizontally into one matrix per scheme.

    Returns
    -------
    tuple
        ``(tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs)``
        where the vocab dicts map column name -> feature-name array.
    """
    tfidf_parts = []
    onehot_parts = []
    tfidf_vocabs = {}
    onehot_vocabs = {}

    for column in ("Risk Factors", "Symptoms", "Signs"):
        corpus = df[column + '_combined']

        tfidf = TfidfVectorizer()
        tfidf_parts.append(tfidf.fit_transform(corpus))
        tfidf_vocabs[column] = tfidf.get_feature_names_out()

        onehot = CountVectorizer(binary=True)
        onehot_parts.append(onehot.fit_transform(corpus))
        onehot_vocabs[column] = onehot.get_feature_names_out()

    return hstack(tfidf_parts), hstack(onehot_parts), tfidf_vocabs, onehot_vocabs

# ======================================
# Task 1
# ======================================
def task1_feature_extraction():
    """Render Task 1: build TF-IDF and one-hot features and show their stats."""
    st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")
    df = preprocess_text_columns(load_data())
    st.dataframe(df[["id", "Risk Factors", "Symptoms", "Signs", "Disease"]])

    tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs = vectorize_columns(df)

    st.write("### TF-IDF Combined Matrix")
    st.dataframe(pd.DataFrame(tfidf_matrix.toarray()))
    st.write("### One-Hot Combined Matrix")
    st.dataframe(pd.DataFrame(onehot_matrix.toarray()))

    def show_stats(matrix, label):
        # Sparsity = share of zero cells in the matrix, as a percentage.
        cells = matrix.shape[0] * matrix.shape[1]
        pct_zero = 100 * (1 - matrix.nnz / cells)
        st.write(f"**{label} Shape:** {matrix.shape}, **Sparsity:** {pct_zero:.2f}%")

    st.subheader("Matrix Statistics:")
    show_stats(tfidf_matrix, "TF-IDF")
    show_stats(onehot_matrix, "One-Hot")

    # Totals are summed per-column vocab sizes (a term in two columns counts twice).
    st.write("**Total Unique TF-IDF Features:**", sum(map(len, tfidf_vocabs.values())))
    st.write("**Total Unique One-Hot Features:**", sum(map(len, onehot_vocabs.values())))

# ======================================
# Task 2
# ======================================
def task2_dimensionality_reduction():
    """Render Task 2: 2-D TruncatedSVD projections of both feature spaces."""
    st.header("Task 2: Dimensionality Reduction and Visualization")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)

    labels = df["Disease"]
    classes = labels.unique()

    def project_and_plot(matrix, title):
        # Reduce the sparse features to 2 components and scatter by class.
        svd = TruncatedSVD(n_components=2, random_state=42)
        coords = svd.fit_transform(matrix)
        fig, ax = plt.subplots()
        for cls in classes:
            mask = labels == cls
            ax.scatter(coords[mask, 0], coords[mask, 1], label=cls, s=80)
        ax.set_title(title)
        ax.legend()
        st.pyplot(fig)
        return svd

    svd_tfidf = project_and_plot(tfidf_matrix, "TF-IDF 2D Projection")
    svd_onehot = project_and_plot(onehot_matrix, "One-Hot 2D Projection")

    st.write("**TF-IDF Explained Variance Ratio:**", svd_tfidf.explained_variance_ratio_)
    st.write("**One-Hot Explained Variance Ratio:**", svd_onehot.explained_variance_ratio_)

# ======================================
# Task 3
# ======================================
def evaluate_model(X, y, model, name):
    """Cross-validate *model* on (X, y) and render the mean of each metric.

    Uses 3-fold stratified CV (shuffled, fixed seed) with accuracy plus
    macro-averaged precision, recall and F1 (``zero_division=0`` so folds
    missing a class score 0 rather than raising a warning).
    """
    scorers = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro', zero_division=0),
        'recall': make_scorer(recall_score, average='macro', zero_division=0),
        'f1': make_scorer(f1_score, average='macro', zero_division=0),
    }
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_validate(model, X, y, cv=folds, scoring=scorers)
    st.write(f"### {name}")
    for metric_name in scorers:
        mean_score = scores[f'test_{metric_name}'].mean()
        st.write(f"**{metric_name.capitalize()}:** {mean_score:.2f}")

def task3_classification():
    """Render Task 3: cross-validated KNN and logistic regression on TF-IDF."""
    st.header("Task 3: Classification with KNN and Logistic Regression")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
    labels = df["Disease"]

    st.subheader("KNN on TF-IDF")
    for neighbors in (3, 5, 7):
        knn = KNeighborsClassifier(n_neighbors=neighbors, metric='cosine')
        evaluate_model(tfidf_matrix, labels, knn, f"KNN (k={neighbors}, Cosine)")

    st.subheader("Logistic Regression on TF-IDF")
    evaluate_model(tfidf_matrix, labels,
                   LogisticRegression(max_iter=1000), "Logistic Regression")

# ======================================
# Sidebar Navigation
# ======================================
# Map each sidebar label to its renderer; st.sidebar.radio returns one label.
_TASKS = {
    "Task 1: Feature Extraction": task1_feature_extraction,
    "Task 2: Dimensionality Reduction": task2_dimensionality_reduction,
    "Task 3: Classification": task3_classification,
}

task = st.sidebar.radio("Select Task", list(_TASKS))
_TASKS[task]()