jaker86 committed on
Commit
e791e5b
·
verified ·
1 Parent(s): ee936fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -108
app.py CHANGED
@@ -12,8 +12,14 @@ import matplotlib.pyplot as plt
12
  import seaborn as sns
13
  import io
14
 
 
 
 
 
 
 
15
  def update_dropdown(file):
16
- """Update the dropdown choices with column names from the uploaded file."""
17
  if file is None:
18
  return gr.Dropdown.update(choices=[], value=None)
19
  try:
@@ -24,12 +30,16 @@ def update_dropdown(file):
24
  else:
25
  return gr.Dropdown.update(choices=[], value=None)
26
  return gr.Dropdown.update(choices=list(df.columns), value=None)
27
- except Exception:
28
  return gr.Dropdown.update(choices=[], value=None)
29
 
30
  def analyze_file(file, label_col, n_clusters):
31
  """Analyze the uploaded file with ML techniques and return results and plots."""
32
- # Read the file based on its extension
 
 
 
 
33
  try:
34
  if file.name.endswith('.csv'):
35
  df = pd.read_csv(file.name)
@@ -40,147 +50,173 @@ def analyze_file(file, label_col, n_clusters):
40
  except Exception as e:
41
  return (f"Error reading file: {e}", None, None, None, None, None)
42
 
43
- # Validate label column
 
 
44
  if label_col not in df.columns:
45
- return (f"Label column '{label_col}' not found. Please select a valid column.", None, None, None, None, None)
46
-
47
- # Clean data and validate size
48
  df = df.dropna()
49
- if df.shape[0] < 10:
50
- return ("Not enough data rows (less than 10) after removing missing values.", None, None, None, None, None)
51
- if df.shape[1] < 2:
52
  return ("Need at least one feature and one label column.", None, None, None, None, None)
53
 
54
  # Separate features and target
55
  y = df[label_col]
56
  X = df.drop(columns=[label_col])
57
  X_processed = pd.get_dummies(X) # One-hot encode categorical features
 
 
 
 
58
  scaler = StandardScaler()
59
  X_scaled = scaler.fit_transform(X_processed)
60
 
61
  results_text = ""
62
  model_img = None
63
 
64
- # Prediction: regression or classification based on target type
65
- if pd.api.types.is_numeric_dtype(y):
66
- # Regression
67
- X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)
68
- model = RandomForestRegressor(random_state=42)
69
- model.fit(X_train, y_train)
70
- y_pred = model.predict(X_test)
71
- mse = mean_squared_error(y_test, y_pred)
72
- r2 = r2_score(y_test, y_pred)
73
- results_text += (
74
- "Regression Results (predicting numeric values):\n"
75
- f"- Mean Squared Error (MSE): {mse:.3f} (lower is better)\n"
76
- f"- Score: {r2:.3f} (0 to 1, higher is better)\n"
77
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  plt.figure(figsize=(8, 6))
79
- plt.scatter(y_test, y_pred, alpha=0.7)
80
- plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
81
- plt.xlabel("True Values")
82
- plt.ylabel("Predicted Values")
83
- plt.title("Regression: True vs Predicted")
84
  buf = io.BytesIO()
85
  plt.savefig(buf, format="png", bbox_inches="tight")
86
  plt.close()
87
  buf.seek(0)
88
- model_img = buf
89
- else:
90
- # Classification
91
- y_encoded, uniques = pd.factorize(y)
92
- X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=42)
93
- model = RandomForestClassifier(random_state=42)
94
- model.fit(X_train, y_train)
95
- y_pred = model.predict(X_test)
96
- cm = confusion_matrix(y_test, y_pred)
97
- cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
98
- results_text += "Classification Results (predicting categories):\n" + cr + "\n"
99
  plt.figure(figsize=(8, 6))
100
- sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=uniques, yticklabels=uniques)
101
- plt.xlabel("Predicted")
102
- plt.ylabel("True")
103
- plt.title("Confusion Matrix")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  buf = io.BytesIO()
105
  plt.savefig(buf, format="png", bbox_inches="tight")
106
  plt.close()
107
  buf.seek(0)
108
- model_img = buf
109
-
110
- # Feature importance (top 10)
111
- fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(10)
112
- plt.figure(figsize=(10, 6))
113
- sns.barplot(x=fi.values, y=fi.index)
114
- plt.title("Top 10 Feature Importances")
115
- plt.xlabel("Importance")
116
- plt.ylabel("Feature")
117
- buf = io.BytesIO()
118
- plt.savefig(buf, format="png", bbox_inches="tight")
119
- plt.close()
120
- buf.seek(0)
121
- fi_img = buf
122
-
123
- # KMeans clustering
124
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
125
- clusters_kmeans = kmeans.fit_predict(X_scaled)
126
- pca = PCA(n_components=2, random_state=42)
127
- X_pca = pca.fit_transform(X_scaled)
128
- explained_var = sum(pca.explained_variance_ratio_)
129
- plt.figure(figsize=(8, 6))
130
- scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans, cmap="viridis", alpha=0.7)
131
- plt.xlabel("PCA 1")
132
- plt.ylabel("PCA 2")
133
- plt.title(f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)")
134
- plt.colorbar(scatter, ticks=range(n_clusters))
135
- buf = io.BytesIO()
136
- plt.savefig(buf, format="png", bbox_inches="tight")
137
- plt.close()
138
- buf.seek(0)
139
- kmeans_img = buf
140
-
141
- # Agglomerative clustering
142
- agg = AgglomerativeClustering(n_clusters=n_clusters)
143
- clusters_agg = agg.fit_predict(X_scaled)
144
- plt.figure(figsize=(8, 6))
145
- scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_agg, cmap="plasma", alpha=0.7)
146
- plt.xlabel("PCA 1")
147
- plt.ylabel("PCA 2")
148
- plt.title(f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)")
149
- plt.colorbar(scatter, ticks=range(n_clusters))
150
- buf = io.BytesIO()
151
- plt.savefig(buf, format="png", bbox_inches="tight")
152
- plt.close()
153
- buf.seek(0)
154
- agg_img = buf
155
-
156
- # Differentiating features (top 10)
157
- f_scores, _ = f_classif(X_processed, clusters_kmeans)
158
- f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(10)
159
- plt.figure(figsize=(10, 6))
160
- sns.barplot(x=f_series.values, y=f_series.index, palette="mako")
161
- plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
162
- plt.xlabel("F-score")
163
- plt.ylabel("Feature")
164
- buf = io.BytesIO()
165
- plt.savefig(buf, format="png", bbox_inches="tight")
166
- plt.close()
167
- buf.seek(0)
168
- diff_img = buf
169
 
170
  return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
171
 
 
172
  with gr.Blocks() as demo:
173
  gr.Markdown("## Data Analysis Explorer")
174
  gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
175
 
176
  with gr.Row():
177
  file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
178
- label_dropdown = gr.Dropdown(label="Select Column to Predict", interactive=True)
179
  clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of Clusters")
180
 
181
- # Event handler to update dropdown when file is uploaded
182
  file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
183
-
184
  analyze_btn = gr.Button("Analyze")
185
 
186
  with gr.Tabs():
 
12
  import seaborn as sns
13
  import io
14
 
15
+ # Constants for reproducibility and configuration
16
+ RANDOM_STATE = 42
17
+ MIN_ROWS = 10
18
+ MIN_COLS = 2
19
+ MAX_FEATURES_TO_SHOW = 10
20
+
21
  def update_dropdown(file):
22
+ """Update dropdown choices with column names from the uploaded file."""
23
  if file is None:
24
  return gr.Dropdown.update(choices=[], value=None)
25
  try:
 
30
  else:
31
  return gr.Dropdown.update(choices=[], value=None)
32
  return gr.Dropdown.update(choices=list(df.columns), value=None)
33
+ except Exception as e:
34
  return gr.Dropdown.update(choices=[], value=None)
35
 
36
  def analyze_file(file, label_col, n_clusters):
37
  """Analyze the uploaded file with ML techniques and return results and plots."""
38
+ # Validate file input
39
+ if file is None:
40
+ return ("Please upload a file.", None, None, None, None, None)
41
+
42
+ # Read file based on extension
43
  try:
44
  if file.name.endswith('.csv'):
45
  df = pd.read_csv(file.name)
 
50
  except Exception as e:
51
  return (f"Error reading file: {e}", None, None, None, None, None)
52
 
53
+ # Validate data shape and label column
54
+ if df.empty:
55
+ return ("File is empty.", None, None, None, None, None)
56
  if label_col not in df.columns:
57
+ return (f"Label column '{label_col}' not found.", None, None, None, None, None)
58
+
59
+ # Clean data and check minimum requirements
60
  df = df.dropna()
61
+ if df.shape[0] < MIN_ROWS:
62
+ return (f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.", None, None, None, None, None)
63
+ if df.shape[1] < MIN_COLS:
64
  return ("Need at least one feature and one label column.", None, None, None, None, None)
65
 
66
  # Separate features and target
67
  y = df[label_col]
68
  X = df.drop(columns=[label_col])
69
  X_processed = pd.get_dummies(X) # One-hot encode categorical features
70
+ if X_processed.shape[1] == 0:
71
+ return ("No valid features after preprocessing.", None, None, None, None, None)
72
+
73
+ # Scale features
74
  scaler = StandardScaler()
75
  X_scaled = scaler.fit_transform(X_processed)
76
 
77
  results_text = ""
78
  model_img = None
79
 
80
+ # Prediction: Regression or Classification
81
+ try:
82
+ if pd.api.types.is_numeric_dtype(y):
83
+ # Regression
84
+ X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=RANDOM_STATE)
85
+ model = RandomForestRegressor(random_state=RANDOM_STATE)
86
+ model.fit(X_train, y_train)
87
+ y_pred = model.predict(X_test)
88
+ mse = mean_squared_error(y_test, y_pred)
89
+ r2 = r2_score(y_test, y_pred)
90
+ results_text += (
91
+ "Regression Results (predicting numeric values):\n"
92
+ f"- Mean Squared Error (MSE): {mse:.3f} (lower is better)\n"
93
+ f"- R² Score: {r2:.3f} (0 to 1, higher is better)\n"
94
+ )
95
+ plt.figure(figsize=(8, 6))
96
+ plt.scatter(y_test, y_pred, alpha=0.7)
97
+ plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
98
+ plt.xlabel("True Values")
99
+ plt.ylabel("Predicted Values")
100
+ plt.title("Regression: True vs Predicted")
101
+ buf = io.BytesIO()
102
+ plt.savefig(buf, format="png", bbox_inches="tight")
103
+ plt.close()
104
+ buf.seek(0)
105
+ model_img = buf
106
+ else:
107
+ # Classification
108
+ if len(y.unique()) < 2:
109
+ return ("Label column must have at least 2 unique values for classification.", None, None, None, None, None)
110
+ y_encoded, uniques = pd.factorize(y)
111
+ X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
112
+ model = RandomForestClassifier(random_state=RANDOM_STATE)
113
+ model.fit(X_train, y_train)
114
+ y_pred = model.predict(X_test)
115
+ cm = confusion_matrix(y_test, y_pred)
116
+ cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
117
+ results_text += "Classification Results (predicting categories):\n" + cr + "\n"
118
+ plt.figure(figsize=(8, 6))
119
+ sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=uniques, yticklabels=uniques)
120
+ plt.xlabel("Predicted")
121
+ plt.ylabel("True")
122
+ plt.title("Confusion Matrix")
123
+ buf = io.BytesIO()
124
+ plt.savefig(buf, format="png", bbox_inches="tight")
125
+ plt.close()
126
+ buf.seek(0)
127
+ model_img = buf
128
+ except Exception as e:
129
+ return (f"Error during model training: {e}", None, None, None, None, None)
130
+
131
+ # Feature Importance
132
+ try:
133
+ fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
134
+ plt.figure(figsize=(10, 6))
135
+ sns.barplot(x=fi.values, y=fi.index)
136
+ plt.title("Top 10 Feature Importances")
137
+ plt.xlabel("Importance")
138
+ plt.ylabel("Feature")
139
+ buf = io.BytesIO()
140
+ plt.savefig(buf, format="png", bbox_inches="tight")
141
+ plt.close()
142
+ buf.seek(0)
143
+ fi_img = buf
144
+ except Exception as e:
145
+ fi_img = None
146
+ results_text += f"\nWarning: Could not compute feature importance: {e}"
147
+
148
+ # KMeans Clustering
149
+ try:
150
+ kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
151
+ clusters_kmeans = kmeans.fit_predict(X_scaled)
152
+ pca = PCA(n_components=2, random_state=RANDOM_STATE)
153
+ X_pca = pca.fit_transform(X_scaled)
154
+ explained_var = sum(pca.explained_variance_ratio_)
155
  plt.figure(figsize=(8, 6))
156
+ scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans, cmap="viridis", alpha=0.7)
157
+ plt.xlabel("PCA 1")
158
+ plt.ylabel("PCA 2")
159
+ plt.title(f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)")
160
+ plt.colorbar(scatter, ticks=range(n_clusters))
161
  buf = io.BytesIO()
162
  plt.savefig(buf, format="png", bbox_inches="tight")
163
  plt.close()
164
  buf.seek(0)
165
+ kmeans_img = buf
166
+ except Exception as e:
167
+ kmeans_img = None
168
+ results_text += f"\nWarning: KMeans clustering failed: {e}"
169
+
170
+ # Agglomerative Clustering
171
+ try:
172
+ agg = AgglomerativeClustering(n_clusters=n_clusters)
173
+ clusters_agg = agg.fit_predict(X_scaled)
 
 
174
  plt.figure(figsize=(8, 6))
175
+ scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_agg, cmap="plasma", alpha=0.7)
176
+ plt.xlabel("PCA 1")
177
+ plt.ylabel("PCA 2")
178
+ plt.title(f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)")
179
+ plt.colorbar(scatter, ticks=range(n_clusters))
180
+ buf = io.BytesIO()
181
+ plt.savefig(buf, format="png", bbox_inches="tight")
182
+ plt.close()
183
+ buf.seek(0)
184
+ agg_img = buf
185
+ except Exception as e:
186
+ agg_img = None
187
+ results_text += f"\nWarning: Agglomerative clustering failed: {e}"
188
+
189
+ # Differentiating Features
190
+ try:
191
+ f_scores, _ = f_classif(X_processed, clusters_kmeans)
192
+ f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
193
+ plt.figure(figsize=(10, 6))
194
+ sns.barplot(x=f_series.values, y=f_series.index, palette="mako")
195
+ plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
196
+ plt.xlabel("F-score")
197
+ plt.ylabel("Feature")
198
  buf = io.BytesIO()
199
  plt.savefig(buf, format="png", bbox_inches="tight")
200
  plt.close()
201
  buf.seek(0)
202
+ diff_img = buf
203
+ except Exception as e:
204
+ diff_img = None
205
+ results_text += f"\nWarning: Could not compute differentiating features: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
208
 
209
+ # Gradio Interface
210
  with gr.Blocks() as demo:
211
  gr.Markdown("## Data Analysis Explorer")
212
  gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
213
 
214
  with gr.Row():
215
  file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
216
+ label_dropdown = gr.Dropdown(label="Select Column to Predict", choices=[], interactive=True)
217
  clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of Clusters")
218
 
 
219
  file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
 
220
  analyze_btn = gr.Button("Analyze")
221
 
222
  with gr.Tabs():