ZainabEman committed on
Commit
677ed4f
·
verified ·
1 Parent(s): 7378255

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -218
app.py CHANGED
@@ -9,6 +9,7 @@ from sklearn.decomposition import TruncatedSVD
9
  from sklearn.neighbors import KNeighborsClassifier
10
  from sklearn.linear_model import LogisticRegression
11
  from sklearn.model_selection import cross_validate, StratifiedKFold
 
12
  from scipy.sparse import hstack
13
 
14
  st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analysis")
@@ -17,8 +18,6 @@ st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analy
17
  # Data Loading and Preprocessing
18
  # ======================================
19
  def load_data():
20
- # Simulated clinical dataset with stringified lists for demonstration.
21
- # (In practice, replace this with reading the actual dataset.)
22
  data = [
23
  {"id": 1, "Risk Factors": "['smoking', 'obesity']",
24
  "Symptoms": "['chest pain', 'shortness of breath']",
@@ -60,284 +59,133 @@ def load_data():
60
  return pd.DataFrame(data)
61
 
62
  def preprocess_text_columns(df):
63
- # Convert each stringified list to an actual list, then join items into a single space-separated string.
64
  for col in ["Risk Factors", "Symptoms", "Signs"]:
65
  df[col + '_combined'] = df[col].apply(lambda x: " ".join(ast.literal_eval(x)) if pd.notnull(x) else "")
66
  return df
67
 
68
- # ======================================
69
- # Vectorization: TF-IDF and One-Hot Encoding
70
- # ======================================
71
  def vectorize_columns(df):
72
  cols = ["Risk Factors", "Symptoms", "Signs"]
73
- tfidf_matrices = []
74
- onehot_matrices = []
75
- tfidf_vocabs = {}
76
- onehot_vocabs = {}
77
-
78
  for col in cols:
79
  text_data = df[col + '_combined']
80
-
81
- # TF-IDF vectorization
82
  tfidf_vec = TfidfVectorizer()
83
  tfidf_matrix = tfidf_vec.fit_transform(text_data)
84
  tfidf_matrices.append(tfidf_matrix)
85
  tfidf_vocabs[col] = tfidf_vec.get_feature_names_out()
86
-
87
- # One-hot encoding using CountVectorizer (binary=True)
88
  count_vec = CountVectorizer(binary=True)
89
  onehot_matrix = count_vec.fit_transform(text_data)
90
  onehot_matrices.append(onehot_matrix)
91
  onehot_vocabs[col] = count_vec.get_feature_names_out()
92
-
93
- tfidf_combined = hstack(tfidf_matrices)
94
- onehot_combined = hstack(onehot_matrices)
95
-
96
- return tfidf_combined, onehot_combined, tfidf_vocabs, onehot_vocabs
97
 
98
  # ======================================
99
- # Task 1: Feature Extraction and Encoding Comparison
100
  # ======================================
101
  def task1_feature_extraction():
102
  st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")
103
- st.write("""
104
- **Steps:**
105
-
106
- 1. Parse the stringified lists for "Risk Factors", "Symptoms", and "Signs".
107
- 2. Convert each list into a single string.
108
- 3. Apply TF-IDF vectorization (using TfidfVectorizer) on each column separately.
109
- 4. Apply one-hot encoding (using CountVectorizer with binary=True) on the same columns.
110
- 5. Combine the matrices and compare shapes, sparsity, and the number of unique features.
111
- """)
112
-
113
- df = load_data()
114
- df = preprocess_text_columns(df)
115
- st.write("### Input Data")
116
  st.dataframe(df[["id", "Risk Factors", "Symptoms", "Signs", "Disease"]])
117
-
118
  tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs = vectorize_columns(df)
119
-
120
- # Display the matrices (dense format for small datasets)
121
  st.write("### TF-IDF Combined Matrix")
122
  st.dataframe(pd.DataFrame(tfidf_matrix.toarray()))
123
-
124
  st.write("### One-Hot Combined Matrix")
125
  st.dataframe(pd.DataFrame(onehot_matrix.toarray()))
126
-
127
  def matrix_stats(matrix, name):
128
  total_elements = matrix.shape[0] * matrix.shape[1]
129
- nonzero = matrix.nnz if hasattr(matrix, 'nnz') else np.count_nonzero(matrix)
130
  sparsity = 100 * (1 - nonzero / total_elements)
131
- st.write(f"**{name} Matrix Shape:** {matrix.shape}")
132
- st.write(f"**{name} Sparsity:** {sparsity:.2f}%")
133
-
134
  st.subheader("Matrix Statistics:")
135
  matrix_stats(tfidf_matrix, "TF-IDF")
136
  matrix_stats(onehot_matrix, "One-Hot")
137
-
138
- total_tfidf_features = sum(len(v) for v in tfidf_vocabs.values())
139
- total_onehot_features = sum(len(v) for v in onehot_vocabs.values())
140
- st.write("**Total Unique TF-IDF Features:**", total_tfidf_features)
141
- st.write("**Total Unique One-Hot Features:**", total_onehot_features)
142
 
143
  # ======================================
144
- # Task 2: Dimensionality Reduction and Visualization
145
  # ======================================
146
  def task2_dimensionality_reduction():
147
- st.header("Task 2: Dimensionality Reduction and 2D Visualization")
148
- st.write("""
149
- **Steps:**
150
-
151
- 1. Use Truncated SVD (for sparse matrices) to reduce dimensions of both TF-IDF and One-Hot feature matrices to 2 components.
152
- 2. Compare the explained variance ratios.
153
- 3. Visualize the 2D projections with points color-coded by the disease category.
154
- """)
155
-
156
- df = load_data()
157
- df = preprocess_text_columns(df)
158
  tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
159
-
160
- # Dimensionality reduction for TF-IDF
161
  svd_tfidf = TruncatedSVD(n_components=2, random_state=42)
162
  tfidf_2d = svd_tfidf.fit_transform(tfidf_matrix)
163
- st.write("**TF-IDF Explained Variance Ratio (2 components):**", svd_tfidf.explained_variance_ratio_)
164
-
165
- # Dimensionality reduction for One-Hot
166
  svd_onehot = TruncatedSVD(n_components=2, random_state=42)
167
  onehot_2d = svd_onehot.fit_transform(onehot_matrix)
168
- st.write("**One-Hot Explained Variance Ratio (2 components):**", svd_onehot.explained_variance_ratio_)
169
-
170
- target = df['Disease']
171
  diseases = target.unique()
172
-
173
- # Plot for TF-IDF
174
  fig1, ax1 = plt.subplots()
175
  for disease in diseases:
176
  idx = target == disease
177
- ax1.scatter(tfidf_2d[idx, 0], tfidf_2d[idx, 1],
178
- label=disease, s=80)
179
  ax1.set_title("TF-IDF 2D Projection")
180
- ax1.set_xlabel("Component 1")
181
- ax1.set_ylabel("Component 2")
182
  ax1.legend()
183
  st.pyplot(fig1)
184
-
185
- # Plot for One-Hot
186
  fig2, ax2 = plt.subplots()
187
  for disease in diseases:
188
  idx = target == disease
189
- ax2.scatter(onehot_2d[idx, 0], onehot_2d[idx, 1],
190
- label=disease, s=80)
191
  ax2.set_title("One-Hot 2D Projection")
192
- ax2.set_xlabel("Component 1")
193
- ax2.set_ylabel("Component 2")
194
  ax2.legend()
195
  st.pyplot(fig2)
196
-
197
- st.write("""
198
- **Discussion:**
199
- Compare the two plots above to see which encoding method (TF-IDF or One-Hot) produces clusters that are more separable based on the disease categories.
200
- """)
201
 
202
  # ======================================
203
- # Task 3: Classification Using KNN and Logistic Regression
204
  # ======================================
205
- def task3_classification():
206
- st.header("Task 3: Train KNN and Logistic Regression Models")
207
- st.write("""
208
- **KNN Classification:**
209
- Evaluate KNN using k = 3, 5, 7 and distance metrics: Euclidean, Manhattan, and Cosine.
210
- Use cross-validation to report Accuracy, Precision, Recall, and F1-score.
211
-
212
- **Logistic Regression Classification:**
213
- Train Logistic Regression using cross-validation and compare its performance (Accuracy and F1-score) with KNN.
214
- """)
215
-
216
- df = load_data()
217
- df = preprocess_text_columns(df)
218
- tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
219
- y = df['Disease']
220
-
221
- # Determine appropriate number of folds based on minimum class count.
222
- min_count = y.value_counts().min()
223
- n_splits = min(5, min_count) # Using smaller splits if classes have fewer than 5 samples.
224
- st.write(f"**Using {n_splits}-fold cross-validation (based on minimum class count of {min_count}).**")
225
- cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
226
-
227
  scoring = {
228
- 'accuracy': 'accuracy',
229
- 'precision': 'precision_macro',
230
- 'recall': 'recall_macro',
231
- 'f1': 'f1_macro'
232
  }
233
-
234
- knn_results = []
235
- distance_metrics = ['euclidean', 'manhattan', 'cosine']
236
- k_values = [3, 5, 7]
237
-
238
- # Evaluate KNN for both encoding methods.
239
- for encoding, X in [('TF-IDF', tfidf_matrix), ('One-Hot', onehot_matrix)]:
240
- for metric in distance_metrics:
241
- for k in k_values:
242
- # For cosine distance, use the 'brute' algorithm.
243
- if metric == 'cosine':
244
- knn = KNeighborsClassifier(n_neighbors=k, metric=metric, algorithm='brute')
245
- else:
246
- knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
247
-
248
- scores = cross_validate(knn, X, y, cv=cv, scoring=scoring, n_jobs=-1)
249
- knn_results.append({
250
- "Encoding": encoding,
251
- "Model": "KNN",
252
- "Parameter": f"k={k}, metric={metric}",
253
- "Accuracy": np.mean(scores['test_accuracy']),
254
- "Precision": np.mean(scores['test_precision']),
255
- "Recall": np.mean(scores['test_recall']),
256
- "F1": np.mean(scores['test_f1'])
257
- })
258
-
259
- knn_df = pd.DataFrame(knn_results)
260
- st.subheader("KNN Classification Results")
261
- st.dataframe(knn_df)
262
-
263
- # Evaluate Logistic Regression for both encoding methods.
264
- lr_results = []
265
- for encoding, X in [('TF-IDF', tfidf_matrix), ('One-Hot', onehot_matrix)]:
266
- lr = LogisticRegression(max_iter=1000, random_state=42)
267
- scores = cross_validate(lr, X, y, cv=cv, scoring=scoring, n_jobs=-1)
268
- lr_results.append({
269
- "Encoding": encoding,
270
- "Model": "Logistic Regression",
271
- "Parameter": "Default",
272
- "Accuracy": np.mean(scores['test_accuracy']),
273
- "Precision": np.mean(scores['test_precision']),
274
- "Recall": np.mean(scores['test_recall']),
275
- "F1": np.mean(scores['test_f1'])
276
- })
277
- lr_df = pd.DataFrame(lr_results)
278
- st.subheader("Logistic Regression Classification Results")
279
- st.dataframe(lr_df)
280
-
281
- st.write("""
282
- **Discussion:**
283
- - Compare the performance of KNN with different values of k and different distance metrics.
284
- - Compare the results for TF-IDF vs. One-Hot encoding.
285
- - Examine how Logistic Regression performs relative to KNN.
286
- """)
287
 
288
- # ======================================
289
- # Task 4: Critical Analysis Report
290
- # ======================================
291
- def task4_critical_analysis():
292
- st.header("Task 4: Critical Analysis Report")
293
- st.markdown("""
294
- ### Critical Analysis
295
-
296
- **1. Encoding Comparison: TF-IDF vs. One-Hot**
297
- - **TF-IDF Advantages:**
298
- - Weights terms according to their frequency relative to all documents, emphasizing informative words.
299
- - Down-weights common terms, which can be beneficial in highlighting key clinical features.
300
- - **One-Hot Advantages:**
301
- - Provides a simple, interpretable representation where each feature signifies the presence or absence of a term.
302
-
303
- **2. Clinical Relevance of the Results**
304
- - **TF-IDF Clusters:**
305
- - May reveal clusters that align with clinical disease categories by emphasizing significant symptom patterns.
306
- - Could help in differential diagnosis if clusters clearly separate conditions (e.g., Cardiovascular vs. Neurological).
307
- - **One-Hot Clusters:**
308
- - Although simpler, one-hot encoding may be sufficient when dataset size is small or when interpretability is a primary concern.
309
-
310
- **3. Limitations of Both Methods**
311
- - **TF-IDF Limitations:**
312
- - Does not capture word order or context.
313
- - Sensitive to minor variations in spelling or term usage.
314
- - **One-Hot Limitations:**
315
- - Can lead to very high-dimensional and sparse feature spaces.
316
- - Lacks a weighting mechanism, treating all words as equally important.
317
-
318
- **Conclusion:**
319
- The choice between TF-IDF and one-hot encoding depends on the application context. In clinical text analysis, TF-IDF may provide an advantage by emphasizing key symptoms, while one-hot encoding remains valuable for its simplicity and interpretability.
320
- """)
321
 
322
  # ======================================
323
- # Main App Navigation
324
  # ======================================
325
- def main():
326
- st.sidebar.title("Assignment 3 Tasks")
327
- task = st.sidebar.radio("Choose Task",
328
- ("Task 1: Feature Extraction",
329
- "Task 2: Dimensionality Reduction",
330
- "Task 3: Classification Models",
331
- "Task 4: Critical Analysis"))
332
-
333
- if task == "Task 1: Feature Extraction":
334
- task1_feature_extraction()
335
- elif task == "Task 2: Dimensionality Reduction":
336
- task2_dimensionality_reduction()
337
- elif task == "Task 3: Classification Models":
338
- task3_classification()
339
- elif task == "Task 4: Critical Analysis":
340
- task4_critical_analysis()
341
 
342
- if __name__ == "__main__":
343
- main()
 
 
 
 
 
9
  from sklearn.neighbors import KNeighborsClassifier
10
  from sklearn.linear_model import LogisticRegression
11
  from sklearn.model_selection import cross_validate, StratifiedKFold
12
+ from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
13
  from scipy.sparse import hstack
14
 
15
  st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analysis")
 
18
  # Data Loading and Preprocessing
19
  # ======================================
20
  def load_data():
 
 
21
  data = [
22
  {"id": 1, "Risk Factors": "['smoking', 'obesity']",
23
  "Symptoms": "['chest pain', 'shortness of breath']",
 
59
  return pd.DataFrame(data)
60
 
def preprocess_text_columns(df):
    """Add a ``<column>_combined`` text column for each clinical list column.

    For "Risk Factors", "Symptoms" and "Signs", every cell is expected to hold
    a stringified Python list (e.g. ``"['smoking', 'obesity']"``). The list is
    parsed and its items are joined with single spaces.

    Args:
        df: DataFrame containing the three source columns.

    Returns:
        A copy of ``df`` with the three ``*_combined`` columns added. The
        input frame is left unmodified.
    """

    def _join_items(value):
        # Cells that already hold a parsed sequence are used directly.
        if isinstance(value, (list, tuple)):
            items = value
        elif isinstance(value, str) and value.strip():
            try:
                items = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: keep the raw text instead of crashing.
                return value.strip()
            if not isinstance(items, (list, tuple)):
                # A scalar literal (e.g. "'fever'") is a single item; joining
                # it directly would split a string into characters.
                items = [items]
        else:
            # None, NaN, empty strings and other non-text cells.
            return ""
        # str() guards against non-string list items such as numbers.
        return " ".join(str(item) for item in items)

    # Work on a copy so the caller's frame is not mutated as a side effect.
    result = df.copy()
    for col in ["Risk Factors", "Symptoms", "Signs"]:
        result[col + "_combined"] = result[col].apply(_join_items)
    return result
65
 
 
 
 
def vectorize_columns(df):
    """Vectorize the three combined text columns with TF-IDF and binary counts.

    For each of "Risk Factors", "Symptoms" and "Signs", the matching
    ``<column>_combined`` column is fitted with its own ``TfidfVectorizer``
    and its own ``CountVectorizer(binary=True)``. The per-column matrices are
    then stacked horizontally in that column order.

    Args:
        df: DataFrame that already contains the ``*_combined`` columns.

    Returns:
        A 4-tuple ``(tfidf, onehot, tfidf_vocabs, onehot_vocabs)``: the two
        stacked sparse matrices, and two dicts mapping each source column
        name to the feature names learned for it.
    """
    tfidf_blocks, tfidf_vocabs = [], {}
    onehot_blocks, onehot_vocabs = [], {}

    for name in ("Risk Factors", "Symptoms", "Signs"):
        documents = df[f"{name}_combined"]

        weighted = TfidfVectorizer()
        tfidf_blocks.append(weighted.fit_transform(documents))
        tfidf_vocabs[name] = weighted.get_feature_names_out()

        presence = CountVectorizer(binary=True)
        onehot_blocks.append(presence.fit_transform(documents))
        onehot_vocabs[name] = presence.get_feature_names_out()

    return hstack(tfidf_blocks), hstack(onehot_blocks), tfidf_vocabs, onehot_vocabs
 
 
 
85
 
86
  # ======================================
87
+ # Task 1
88
  # ======================================
def task1_feature_extraction():
    """Render the Task 1 page: input data, both encodings and their statistics.

    Loads and preprocesses the dataset, shows the raw columns, displays the
    dense TF-IDF and one-hot matrices, then reports each matrix's shape,
    sparsity percentage and total number of per-column features.
    """
    st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")
    frame = preprocess_text_columns(load_data())
    st.dataframe(frame[["id", "Risk Factors", "Symptoms", "Signs", "Disease"]])

    tfidf, onehot, tfidf_vocabs, onehot_vocabs = vectorize_columns(frame)
    encodings = (
        ("TF-IDF", tfidf, tfidf_vocabs),
        ("One-Hot", onehot, onehot_vocabs),
    )

    # Dense views of both stacked matrices.
    for label, matrix, _ in encodings:
        st.write(f"### {label} Combined Matrix")
        st.dataframe(pd.DataFrame(matrix.toarray()))

    st.subheader("Matrix Statistics:")
    for label, matrix, _ in encodings:
        n_rows, n_cols = matrix.shape
        # Sparsity is the percentage of cells that are not stored non-zeros.
        sparsity = 100 * (1 - matrix.nnz / (n_rows * n_cols))
        st.write(f"**{label} Shape:** {matrix.shape}, **Sparsity:** {sparsity:.2f}%")

    # Feature totals are summed over the per-column vocabularies.
    for label, _, vocabs in encodings:
        st.write(f"**Total Unique {label} Features:**", sum(map(len, vocabs.values())))
 
 
113
 
114
  # ======================================
115
+ # Task 2
116
  # ======================================
def _scatter_by_label(points, labels, title):
    """Draw one 2-D scatter plot with a legend entry per distinct label.

    Args:
        points: 2-column array of projected coordinates, one row per sample.
        labels: pandas Series of class labels aligned row-by-row with points.
        title: Title shown above the plot.
    """
    fig, ax = plt.subplots()
    try:
        for label in labels.unique():
            # Convert the boolean Series to a plain array so the NumPy
            # indexing does not depend on pandas index handling.
            mask = (labels == label).to_numpy()
            ax.scatter(points[mask, 0], points[mask, 1], label=label, s=80)
        ax.set_title(title)
        ax.legend()
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates alive until it is closed;
        # without this each Streamlit rerun would leak two figures.
        plt.close(fig)


def task2_dimensionality_reduction():
    """Render the Task 2 page: 2-component SVD projections of both encodings.

    Reduces the TF-IDF and one-hot matrices to two components with
    ``TruncatedSVD(random_state=42)``, plots each projection coloured by the
    "Disease" column, then reports both explained variance ratios.
    """
    st.header("Task 2: Dimensionality Reduction and Visualization")
    df = preprocess_text_columns(load_data())
    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)

    svd_tfidf = TruncatedSVD(n_components=2, random_state=42)
    tfidf_2d = svd_tfidf.fit_transform(tfidf_matrix)

    svd_onehot = TruncatedSVD(n_components=2, random_state=42)
    onehot_2d = svd_onehot.fit_transform(onehot_matrix)

    target = df["Disease"]
    _scatter_by_label(tfidf_2d, target, "TF-IDF 2D Projection")
    _scatter_by_label(onehot_2d, target, "One-Hot 2D Projection")

    st.write("**TF-IDF Explained Variance Ratio:**", svd_tfidf.explained_variance_ratio_)
    st.write("**One-Hot Explained Variance Ratio:**", svd_onehot.explained_variance_ratio_)
 
 
149
 
150
  # ======================================
151
+ # Task 3
152
  # ======================================
def evaluate_model(X, y, model, name, n_splits=3):
    """Cross-validate ``model`` and write its mean scores to the page.

    Reports macro-averaged precision, recall and F1 plus accuracy, each as
    the mean over the stratified folds, under a ``### <name>`` heading.

    Args:
        X: Feature matrix accepted by ``cross_validate``.
        y: Class labels, one per row of ``X``.
        model: Unfitted scikit-learn estimator.
        name: Heading text shown above the scores.
        n_splits: Upper bound on the number of stratified folds. The actual
            number is lowered to the size of the rarest class, because a
            stratified split cannot place a class in more folds than it has
            samples.

    Returns:
        None. Results are written with Streamlit only.
    """
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro', zero_division=0),
        'recall': make_scorer(recall_score, average='macro', zero_division=0),
        'f1': make_scorer(f1_score, average='macro', zero_division=0)
    }

    # Clamp the fold count to the rarest class, as the pre-commit code did.
    class_sizes = pd.Series(y).value_counts()
    folds = min(n_splits, int(class_sizes.min())) if len(class_sizes) else 0
    if folds < 2:
        # Cross-validation needs at least two folds; report instead of crashing.
        st.write(f"### {name}")
        st.warning(
            "Not enough samples per class for cross-validation "
            "(each class needs at least 2 samples)."
        )
        return

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    results = cross_validate(model, X, y, cv=cv, scoring=scoring)
    st.write(f"### {name}")
    for metric in scoring:
        st.write(f"**{metric.capitalize()}:** {np.mean(results[f'test_{metric}']):.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
def task3_classification():
    """Render the Task 3 page: KNN and Logistic Regression on TF-IDF features.

    Evaluates cosine-distance KNN classifiers with k = 3, 5 and 7, followed
    by a Logistic Regression model (``max_iter=1000``), all against the
    "Disease" column using ``evaluate_model``.
    """
    st.header("Task 3: Classification with KNN and Logistic Regression")
    frame = preprocess_text_columns(load_data())
    # Only the TF-IDF matrix (first element of the result) is used here.
    tfidf_features = vectorize_columns(frame)[0]
    labels = frame["Disease"]

    st.subheader("KNN on TF-IDF")
    knn_models = {
        f"KNN (k={k}, Cosine)": KNeighborsClassifier(n_neighbors=k, metric='cosine')
        for k in (3, 5, 7)
    }
    for title, knn in knn_models.items():
        evaluate_model(tfidf_features, labels, knn, title)

    st.subheader("Logistic Regression on TF-IDF")
    evaluate_model(
        tfidf_features,
        labels,
        LogisticRegression(max_iter=1000),
        "Logistic Regression",
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
  # ======================================
182
+ # Sidebar Navigation
183
  # ======================================
# Map each sidebar label to the function that renders that task's page.
_TASK_PAGES = {
    "Task 1: Feature Extraction": task1_feature_extraction,
    "Task 2: Dimensionality Reduction": task2_dimensionality_reduction,
    "Task 3: Classification": task3_classification,
}

task = st.sidebar.radio("Select Task", list(_TASK_PAGES))

# A selection with no matching page renders nothing.
_render_page = _TASK_PAGES.get(task)
if _render_page is not None:
    _render_page()