Spaces:

ZainabEman
/

Assignment03

Sleeping

App Files Files Community

ZainabEman committed on May 12, 2025

Commit

677ed4f

verified ·

1 Parent(s): 7378255

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -218

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ from sklearn.decomposition import TruncatedSVD
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import cross_validate, StratifiedKFold
 from scipy.sparse import hstack
 st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analysis")
@@ -17,8 +18,6 @@ st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analy
 # Data Loading and Preprocessing
 # ======================================
 def load_data():
-    # Simulated clinical dataset with stringified lists for demonstration.
-    # (In practice, replace this with reading the actual dataset.)
     data = [
         {"id": 1, "Risk Factors": "['smoking', 'obesity']",
          "Symptoms": "['chest pain', 'shortness of breath']",
@@ -60,284 +59,133 @@ def load_data():
     return pd.DataFrame(data)
 def preprocess_text_columns(df):
-    # Convert each stringified list to an actual list, then join items into a single space-separated string.
     for col in ["Risk Factors", "Symptoms", "Signs"]:
         df[col + '_combined'] = df[col].apply(lambda x: " ".join(ast.literal_eval(x)) if pd.notnull(x) else "")
     return df
-# ======================================
-# Vectorization: TF-IDF and One-Hot Encoding
-# ======================================
 def vectorize_columns(df):
     cols = ["Risk Factors", "Symptoms", "Signs"]
-    tfidf_matrices = []
-    onehot_matrices = []
-    tfidf_vocabs = {}
-    onehot_vocabs = {}
     for col in cols:
         text_data = df[col + '_combined']
-        # TF-IDF vectorization
         tfidf_vec = TfidfVectorizer()
         tfidf_matrix = tfidf_vec.fit_transform(text_data)
         tfidf_matrices.append(tfidf_matrix)
         tfidf_vocabs[col] = tfidf_vec.get_feature_names_out()
-        # One-hot encoding using CountVectorizer (binary=True)
         count_vec = CountVectorizer(binary=True)
         onehot_matrix = count_vec.fit_transform(text_data)
         onehot_matrices.append(onehot_matrix)
         onehot_vocabs[col] = count_vec.get_feature_names_out()
-    tfidf_combined = hstack(tfidf_matrices)
-    onehot_combined = hstack(onehot_matrices)
-    return tfidf_combined, onehot_combined, tfidf_vocabs, onehot_vocabs
 # ======================================
-# Task 1: Feature Extraction and Encoding Comparison
 # ======================================
 def task1_feature_extraction():
     st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")
-    st.write("""
-    **Steps:**
-    1. Parse the stringified lists for "Risk Factors", "Symptoms", and "Signs".
-    2. Convert each list into a single string.
-    3. Apply TF-IDF vectorization (using TfidfVectorizer) on each column separately.
-    4. Apply one-hot encoding (using CountVectorizer with binary=True) on the same columns.
-    5. Combine the matrices and compare shapes, sparsity, and the number of unique features.
-    """)
-    df = load_data()
-    df = preprocess_text_columns(df)
-    st.write("### Input Data")
     st.dataframe(df[["id", "Risk Factors", "Symptoms", "Signs", "Disease"]])
     tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs = vectorize_columns(df)
-    # Display the matrices (dense format for small datasets)
     st.write("### TF-IDF Combined Matrix")
     st.dataframe(pd.DataFrame(tfidf_matrix.toarray()))
     st.write("### One-Hot Combined Matrix")
     st.dataframe(pd.DataFrame(onehot_matrix.toarray()))
     def matrix_stats(matrix, name):
         total_elements = matrix.shape[0] * matrix.shape[1]
-        nonzero = matrix.nnz if hasattr(matrix, 'nnz') else np.count_nonzero(matrix)
         sparsity = 100 * (1 - nonzero / total_elements)
-        st.write(f"**{name} Matrix Shape:** {matrix.shape}")
-        st.write(f"**{name} Sparsity:** {sparsity:.2f}%")
     st.subheader("Matrix Statistics:")
     matrix_stats(tfidf_matrix, "TF-IDF")
     matrix_stats(onehot_matrix, "One-Hot")
-    total_tfidf_features = sum(len(v) for v in tfidf_vocabs.values())
-    total_onehot_features = sum(len(v) for v in onehot_vocabs.values())
-    st.write("**Total Unique TF-IDF Features:**", total_tfidf_features)
-    st.write("**Total Unique One-Hot Features:**", total_onehot_features)
 # ======================================
-# Task 2: Dimensionality Reduction and Visualization
 # ======================================
 def task2_dimensionality_reduction():
-    st.header("Task 2: Dimensionality Reduction and 2D Visualization")
-    st.write("""
-    **Steps:**
-    1. Use Truncated SVD (for sparse matrices) to reduce dimensions of both TF-IDF and One-Hot feature matrices to 2 components.
-    2. Compare the explained variance ratios.
-    3. Visualize the 2D projections with points color-coded by the disease category.
-    """)
-    df = load_data()
-    df = preprocess_text_columns(df)
     tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
-    # Dimensionality reduction for TF-IDF
     svd_tfidf = TruncatedSVD(n_components=2, random_state=42)
     tfidf_2d = svd_tfidf.fit_transform(tfidf_matrix)
-    st.write("**TF-IDF Explained Variance Ratio (2 components):**", svd_tfidf.explained_variance_ratio_)
-    # Dimensionality reduction for One-Hot
     svd_onehot = TruncatedSVD(n_components=2, random_state=42)
     onehot_2d = svd_onehot.fit_transform(onehot_matrix)
-    st.write("**One-Hot Explained Variance Ratio (2 components):**", svd_onehot.explained_variance_ratio_)
-    target = df['Disease']
     diseases = target.unique()
-    # Plot for TF-IDF
     fig1, ax1 = plt.subplots()
     for disease in diseases:
         idx = target == disease
-        ax1.scatter(tfidf_2d[idx, 0], tfidf_2d[idx, 1],
-                    label=disease, s=80)
     ax1.set_title("TF-IDF 2D Projection")
-    ax1.set_xlabel("Component 1")
-    ax1.set_ylabel("Component 2")
     ax1.legend()
     st.pyplot(fig1)
-    # Plot for One-Hot
     fig2, ax2 = plt.subplots()
     for disease in diseases:
         idx = target == disease
-        ax2.scatter(onehot_2d[idx, 0], onehot_2d[idx, 1],
-                    label=disease, s=80)
     ax2.set_title("One-Hot 2D Projection")
-    ax2.set_xlabel("Component 1")
-    ax2.set_ylabel("Component 2")
     ax2.legend()
     st.pyplot(fig2)
-    st.write("""
-    **Discussion:**
-    Compare the two plots above to see which encoding method (TF-IDF or One-Hot) produces clusters that are more separable based on the disease categories.
-    """)
 # ======================================
-# Task 3: Classification Using KNN and Logistic Regression
 # ======================================
-def task3_classification():
-    st.header("Task 3: Train KNN and Logistic Regression Models")
-    st.write("""
-    **KNN Classification:**
-    Evaluate KNN using k = 3, 5, 7 and distance metrics: Euclidean, Manhattan, and Cosine.
-    Use cross-validation to report Accuracy, Precision, Recall, and F1-score.
-    **Logistic Regression Classification:**
-    Train Logistic Regression using cross-validation and compare its performance (Accuracy and F1-score) with KNN.
-    """)
-    df = load_data()
-    df = preprocess_text_columns(df)
-    tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
-    y = df['Disease']
-    # Determine appropriate number of folds based on minimum class count.
-    min_count = y.value_counts().min()
-    n_splits = min(5, min_count)  # Using smaller splits if classes have fewer than 5 samples.
-    st.write(f"**Using {n_splits}-fold cross-validation (based on minimum class count of {min_count}).**")
-    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
     scoring = {
-        'accuracy': 'accuracy',
-        'precision': 'precision_macro',
-        'recall': 'recall_macro',
-        'f1': 'f1_macro'
     }
-    knn_results = []
-    distance_metrics = ['euclidean', 'manhattan', 'cosine']
-    k_values = [3, 5, 7]
-    # Evaluate KNN for both encoding methods.
-    for encoding, X in [('TF-IDF', tfidf_matrix), ('One-Hot', onehot_matrix)]:
-        for metric in distance_metrics:
-            for k in k_values:
-                # For cosine distance, use the 'brute' algorithm.
-                if metric == 'cosine':
-                    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, algorithm='brute')
-                else:
-                    knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
-                scores = cross_validate(knn, X, y, cv=cv, scoring=scoring, n_jobs=-1)
-                knn_results.append({
-                    "Encoding": encoding,
-                    "Model": "KNN",
-                    "Parameter": f"k={k}, metric={metric}",
-                    "Accuracy": np.mean(scores['test_accuracy']),
-                    "Precision": np.mean(scores['test_precision']),
-                    "Recall": np.mean(scores['test_recall']),
-                    "F1": np.mean(scores['test_f1'])
-                })
-    knn_df = pd.DataFrame(knn_results)
-    st.subheader("KNN Classification Results")
-    st.dataframe(knn_df)
-    # Evaluate Logistic Regression for both encoding methods.
-    lr_results = []
-    for encoding, X in [('TF-IDF', tfidf_matrix), ('One-Hot', onehot_matrix)]:
-        lr = LogisticRegression(max_iter=1000, random_state=42)
-        scores = cross_validate(lr, X, y, cv=cv, scoring=scoring, n_jobs=-1)
-        lr_results.append({
-            "Encoding": encoding,
-            "Model": "Logistic Regression",
-            "Parameter": "Default",
-            "Accuracy": np.mean(scores['test_accuracy']),
-            "Precision": np.mean(scores['test_precision']),
-            "Recall": np.mean(scores['test_recall']),
-            "F1": np.mean(scores['test_f1'])
-        })
-    lr_df = pd.DataFrame(lr_results)
-    st.subheader("Logistic Regression Classification Results")
-    st.dataframe(lr_df)
-    st.write("""
-    **Discussion:**
-    - Compare the performance of KNN with different values of k and different distance metrics.
-    - Compare the results for TF-IDF vs. One-Hot encoding.
-    - Examine how Logistic Regression performs relative to KNN.
-    """)
-# ======================================
-# Task 4: Critical Analysis Report
-# ======================================
-def task4_critical_analysis():
-    st.header("Task 4: Critical Analysis Report")
-    st.markdown("""
-    ### Critical Analysis
-    **1. Encoding Comparison: TF-IDF vs. One-Hot**
-    - **TF-IDF Advantages:**
-      - Weights terms according to their frequency relative to all documents, emphasizing informative words.
-      - Down-weights common terms, which can be beneficial in highlighting key clinical features.
-    - **One-Hot Advantages:**
-      - Provides a simple, interpretable representation where each feature signifies the presence or absence of a term.
-    **2. Clinical Relevance of the Results**
-    - **TF-IDF Clusters:**
-      - May reveal clusters that align with clinical disease categories by emphasizing significant symptom patterns.
-      - Could help in differential diagnosis if clusters clearly separate conditions (e.g., Cardiovascular vs. Neurological).
-    - **One-Hot Clusters:**
-      - Although simpler, one-hot encoding may be sufficient when dataset size is small or when interpretability is a primary concern.
-    **3. Limitations of Both Methods**
-    - **TF-IDF Limitations:**
-      - Does not capture word order or context.
-      - Sensitive to minor variations in spelling or term usage.
-    - **One-Hot Limitations:**
-      - Can lead to very high-dimensional and sparse feature spaces.
-      - Lacks a weighting mechanism, treating all words as equally important.
-    **Conclusion:**
-    The choice between TF-IDF and one-hot encoding depends on the application context. In clinical text analysis, TF-IDF may provide an advantage by emphasizing key symptoms, while one-hot encoding remains valuable for its simplicity and interpretability.
-    """)
 # ======================================
-# Main App Navigation
 # ======================================
-def main():
-    st.sidebar.title("Assignment 3 Tasks")
-    task = st.sidebar.radio("Choose Task",
-                              ("Task 1: Feature Extraction",
-                               "Task 2: Dimensionality Reduction",
-                               "Task 3: Classification Models",
-                               "Task 4: Critical Analysis"))
-    if task == "Task 1: Feature Extraction":
-        task1_feature_extraction()
-    elif task == "Task 2: Dimensionality Reduction":
-        task2_dimensionality_reduction()
-    elif task == "Task 3: Classification Models":
-        task3_classification()
-    elif task == "Task 4: Critical Analysis":
-        task4_critical_analysis()
-if __name__ == "__main__":
-    main()

 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import cross_validate, StratifiedKFold
+from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
 from scipy.sparse import hstack
 st.set_page_config(layout="wide", page_title="Assignment 3 - Clinical Text Analysis")
 # Data Loading and Preprocessing
 # ======================================
 def load_data():
     data = [
         {"id": 1, "Risk Factors": "['smoking', 'obesity']",
          "Symptoms": "['chest pain', 'shortness of breath']",
     return pd.DataFrame(data)
 def preprocess_text_columns(df):
     """Add a space-joined ``<column>_combined`` text column for each clinical field.
     Each cell of "Risk Factors", "Symptoms" and "Signs" is parsed with
     ``ast.literal_eval`` and its items are joined with single spaces.
     Null cells become the empty string.
     The frame is modified in place and also returned.
     """
     def _join_items(cell):
         # Null cells carry no terms, so they map to an empty document.
         if pd.isnull(cell):
             return ""
         return " ".join(ast.literal_eval(cell))
     for source_col in ("Risk Factors", "Symptoms", "Signs"):
         df[source_col + '_combined'] = df[source_col].apply(_join_items)
     return df
 def vectorize_columns(df):
     """Vectorize the three ``*_combined`` text columns with TF-IDF and binary counts.
     Every column gets its own freshly fitted ``TfidfVectorizer`` and
     ``CountVectorizer(binary=True)``; the per-column matrices are then
     stacked side by side with ``hstack``.
     Returns a 4-tuple: (stacked TF-IDF matrix, stacked binary matrix,
     dict of TF-IDF feature names per column, dict of binary feature names
     per column).
     """
     tfidf_blocks, onehot_blocks = [], []
     tfidf_vocabs, onehot_vocabs = {}, {}
     for field in ("Risk Factors", "Symptoms", "Signs"):
         documents = df[field + '_combined']
         # TF-IDF weighting for this field.
         tfidf = TfidfVectorizer()
         tfidf_blocks.append(tfidf.fit_transform(documents))
         tfidf_vocabs[field] = tfidf.get_feature_names_out()
         # Presence/absence encoding for the same field.
         presence = CountVectorizer(binary=True)
         onehot_blocks.append(presence.fit_transform(documents))
         onehot_vocabs[field] = presence.get_feature_names_out()
     tfidf_combined = hstack(tfidf_blocks)
     onehot_combined = hstack(onehot_blocks)
     return tfidf_combined, onehot_combined, tfidf_vocabs, onehot_vocabs
 # ======================================
+# Task 1
 # ======================================
 def task1_feature_extraction():
     """Render Task 1: show the input data, both encoded matrices, and their statistics.
     Loads and preprocesses the dataset, vectorizes it, then writes to the
     Streamlit page: the raw columns, the dense TF-IDF and one-hot matrices,
     each matrix's shape and sparsity percentage, and the summed vocabulary
     sizes per encoding.
     """
     st.header("Task 1: TF-IDF Feature Extraction and One-Hot Comparison")
     df = load_data()
     df = preprocess_text_columns(df)
     shown_columns = ["id", "Risk Factors", "Symptoms", "Signs", "Disease"]
     st.dataframe(df[shown_columns])
     tfidf_matrix, onehot_matrix, tfidf_vocabs, onehot_vocabs = vectorize_columns(df)
     encodings = (("TF-IDF", tfidf_matrix), ("One-Hot", onehot_matrix))
     # Dense views of both stacked matrices.
     for label, matrix in encodings:
         st.write(f"### {label} Combined Matrix")
         st.dataframe(pd.DataFrame(matrix.toarray()))
     st.subheader("Matrix Statistics:")
     for label, matrix in encodings:
         n_rows, n_cols = matrix.shape
         # Sparsity is the percentage of cells that are not stored non-zeros.
         sparsity = 100 * (1 - matrix.nnz / (n_rows * n_cols))
         st.write(f"**{label} Shape:** {matrix.shape}, **Sparsity:** {sparsity:.2f}%")
     tfidf_feature_total = sum(map(len, tfidf_vocabs.values()))
     onehot_feature_total = sum(map(len, onehot_vocabs.values()))
     st.write("**Total Unique TF-IDF Features:**", tfidf_feature_total)
     st.write("**Total Unique One-Hot Features:**", onehot_feature_total)
 # ======================================
+# Task 2
 # ======================================
 def _render_projection(points_2d, target, title):
     """Scatter a two-column projection, one color per label, and render it in Streamlit.
     ``points_2d`` is the array returned by ``TruncatedSVD.fit_transform``;
     ``target`` is the pandas Series of class labels aligned with its rows.
     """
     fig, ax = plt.subplots()
     for disease in target.unique():
         # Plain NumPy boolean mask so array indexing does not depend on the Series index.
         mask = (target == disease).to_numpy()
         ax.scatter(points_2d[mask, 0], points_2d[mask, 1], label=disease, s=80)
     ax.set_title(title)
     ax.legend()
     st.pyplot(fig)
     # pyplot keeps every figure registered until it is explicitly closed.
     # Streamlit reruns this script on each interaction, so unclosed figures pile up.
     plt.close(fig)
 def task2_dimensionality_reduction():
     """Render Task 2: project both encodings to 2-D with TruncatedSVD and plot them.
     Loads and vectorizes the dataset, reduces the TF-IDF and one-hot matrices
     to two components each (``random_state=42``), draws one scatter plot per
     encoding colored by the "Disease" column, and finally writes both
     explained-variance ratios.
     """
     st.header("Task 2: Dimensionality Reduction and Visualization")
     df = preprocess_text_columns(load_data())
     tfidf_matrix, onehot_matrix, _, _ = vectorize_columns(df)
     svd_tfidf = TruncatedSVD(n_components=2, random_state=42)
     tfidf_2d = svd_tfidf.fit_transform(tfidf_matrix)
     svd_onehot = TruncatedSVD(n_components=2, random_state=42)
     onehot_2d = svd_onehot.fit_transform(onehot_matrix)
     target = df["Disease"]
     _render_projection(tfidf_2d, target, "TF-IDF 2D Projection")
     _render_projection(onehot_2d, target, "One-Hot 2D Projection")
     st.write("**TF-IDF Explained Variance Ratio:**", svd_tfidf.explained_variance_ratio_)
     st.write("**One-Hot Explained Variance Ratio:**", svd_onehot.explained_variance_ratio_)
 # ======================================
+# Task 3
 # ======================================
def evaluate_model(X, y, model, name, *, n_splits=3, random_state=42):
    """Cross-validate ``model`` on ``(X, y)`` and write the mean scores to the page.

    Scores accuracy plus macro-averaged precision, recall and F1 using a
    shuffled ``StratifiedKFold``. The macro scorers pass ``zero_division=0``.

    Args:
        X: Feature matrix passed straight to ``cross_validate``.
        y: Class labels aligned with the rows of ``X``.
        model: Unfitted estimator to evaluate.
        name: Heading written above the metric lines.
        n_splits: Number of stratified folds (default 3, the previous
            hard-coded value).
        random_state: Seed for the fold shuffling (default 42, the previous
            hard-coded value).
    """
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro', zero_division=0),
        'recall': make_scorer(recall_score, average='macro', zero_division=0),
        'f1': make_scorer(f1_score, average='macro', zero_division=0)
    }
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    results = cross_validate(model, X, y, cv=cv, scoring=scoring)
    st.write(f"### {name}")
    for metric in scoring:
        # cross_validate stores each scorer's per-fold values under "test_<scorer name>".
        mean_score = np.mean(results[f'test_{metric}'])
        st.write(f"**{metric.capitalize()}:** {mean_score:.2f}")
def task3_classification():
    """Render Task 3: evaluate KNN and Logistic Regression on the TF-IDF features.

    Loads and vectorizes the dataset, then calls ``evaluate_model`` for
    cosine-metric KNN with k = 3, 5 and 7, followed by a
    ``LogisticRegression(max_iter=1000)`` model. Only the TF-IDF matrix is
    evaluated here.
    """
    st.header("Task 3: Classification with KNN and Logistic Regression")
    df = load_data()
    df = preprocess_text_columns(df)
    tfidf_matrix, _, _, _ = vectorize_columns(df)
    labels = df["Disease"]
    st.subheader("KNN on TF-IDF")
    for n_neighbors in (3, 5, 7):
        knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='cosine')
        evaluate_model(tfidf_matrix, labels, knn, f"KNN (k={n_neighbors}, Cosine)")
    st.subheader("Logistic Regression on TF-IDF")
    evaluate_model(tfidf_matrix, labels, LogisticRegression(max_iter=1000), "Logistic Regression")
 # ======================================
+# Sidebar Navigation
 # ======================================
# Map each sidebar label to the function that renders that task's page.
_TASK_RENDERERS = {
    "Task 1: Feature Extraction": task1_feature_extraction,
    "Task 2: Dimensionality Reduction": task2_dimensionality_reduction,
    "Task 3: Classification": task3_classification,
}
task = st.sidebar.radio("Select Task", list(_TASK_RENDERERS))
_render_selected = _TASK_RENDERERS.get(task)
# An unrecognized selection renders nothing, as with the original if/elif chain.
if _render_selected is not None:
    _render_selected()