Spaces:

jaker86
/

data_science_crash_course

Sleeping

App Files Files Community

jaker86 committed on Feb 25, 2025

Commit

3e930db

verified ·

1 Parent(s): 755fb3a

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -44

app.py CHANGED Viewed

@@ -11,16 +11,15 @@ from sklearn.feature_selection import f_classif
 import matplotlib.pyplot as plt
 import seaborn as sns
 import io
-from PIL import Image  # For converting BytesIO to PIL Image
-# Constants for reproducibility and configuration
 RANDOM_STATE = 42
 MIN_ROWS = 10
 MIN_COLS = 2
 MAX_FEATURES_TO_SHOW = 10
 def update_dropdown(file):
-    """Update dropdown choices with column names from the uploaded file."""
     if file is None:
         return gr.update(choices=[], value=None)
     try:
@@ -35,12 +34,9 @@ def update_dropdown(file):
         return gr.update(choices=[], value=None)
 def analyze_file(file, label_col, n_clusters):
-    """Analyze the uploaded file with ML techniques and return results and plots."""
-    # Validate file input
     if file is None:
         return ("Please upload a file.", None, None, None, None, None)
-    # Read file based on extension
     try:
         if file.name.endswith('.csv'):
             df = pd.read_csv(file.name)
@@ -51,27 +47,23 @@ def analyze_file(file, label_col, n_clusters):
     except Exception as e:
         return (f"Error reading file: {e}", None, None, None, None, None)
-    # Validate data shape and label column
     if df.empty:
         return ("File is empty.", None, None, None, None, None)
     if label_col not in df.columns:
         return (f"Label column '{label_col}' not found.", None, None, None, None, None)
-    # Clean data and check minimum requirements
     df = df.dropna()
     if df.shape[0] < MIN_ROWS:
         return (f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.", None, None, None, None, None)
     if df.shape[1] < MIN_COLS:
         return ("Need at least one feature and one label column.", None, None, None, None, None)
-    # Separate features and target
     y = df[label_col]
     X = df.drop(columns=[label_col])
-    X_processed = pd.get_dummies(X)  # One-hot encode categorical features
     if X_processed.shape[1] == 0:
         return ("No valid features after preprocessing.", None, None, None, None, None)
-    # Scale features
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X_processed)
@@ -82,7 +74,6 @@ def analyze_file(file, label_col, n_clusters):
     agg_img = None
     diff_img = None
-    # Prediction: Regression or Classification
     try:
         if pd.api.types.is_numeric_dtype(y):
             # Regression
@@ -96,29 +87,38 @@ def analyze_file(file, label_col, n_clusters):
                 "Regression Results:\n"
                 f"- MSE: {mse:.3f}\n"
                 f"- R²: {r2:.3f}\n"
             )
-            # 3D Plot with next two most important features
             fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
-            if len(fi) < 3:
-                results_text += "\nNot enough features for a 3D plot with the next two most important features."
-            else:
-                next_two_features = fi.index[1:3]  # Second and third most important features
-                fig = plt.figure(figsize=(10, 8))
-                ax = fig.add_subplot(111, projection='3d')
-                ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c='blue', marker='o', label='True Values')
-                ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_pred, c='red', marker='^', label='Predicted Values')
-                ax.set_xlabel(next_two_features[0])
-                ax.set_ylabel(next_two_features[1])
-                ax.set_zlabel(label_col)
-                ax.set_title("3D Plot: Label vs Next Two Most Important Features")
-                ax.legend()
-                buf = io.BytesIO()
-                plt.savefig(buf, format="png", bbox_inches="tight")
-                plt.close()
-                buf.seek(0)
-                model_img = Image.open(buf)
         else:
-            # Classification
             if len(y.unique()) < 2:
                 return ("Label must have at least 2 unique values.", None, None, None, None, None)
             y_encoded, uniques = pd.factorize(y)
@@ -128,12 +128,11 @@ def analyze_file(file, label_col, n_clusters):
             y_pred = model.predict(X_test)
             cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
             results_text += "Classification Results:\n" + cr + "\n"
-            # 3D Plot with next two most important features
             fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
             if len(fi) < 3:
                 results_text += "\nNot enough features for a 3D plot with the next two most important features."
             else:
-                next_two_features = fi.index[1:3]  # Second and third most important features
                 fig = plt.figure(figsize=(10, 8))
                 ax = fig.add_subplot(111, projection='3d')
                 scatter = ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c=y_test, cmap='viridis', marker='o')
@@ -149,7 +148,6 @@ def analyze_file(file, label_col, n_clusters):
     except Exception as e:
         results_text += f"\nError during model training: {e}"
-    # Feature Importance
     try:
         fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
         plt.figure(figsize=(10, 6))
@@ -161,11 +159,10 @@ def analyze_file(file, label_col, n_clusters):
         plt.savefig(buf, format="png", bbox_inches="tight")
         plt.close()
         buf.seek(0)
-        fi_img = Image.open(buf)  # Convert to PIL Image
     except Exception as e:
         results_text += f"\nWarning: Could not compute feature importance: {e}"
-    # KMeans Clustering
     try:
         kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
         clusters_kmeans = kmeans.fit_predict(X_scaled)
@@ -182,11 +179,10 @@ def analyze_file(file, label_col, n_clusters):
         plt.savefig(buf, format="png", bbox_inches="tight")
         plt.close()
         buf.seek(0)
-        kmeans_img = Image.open(buf)  # Convert to PIL Image
     except Exception as e:
         results_text += f"\nWarning: KMeans clustering failed: {e}"
-    # Agglomerative Clustering
     try:
         agg = AgglomerativeClustering(n_clusters=n_clusters)
         clusters_agg = agg.fit_predict(X_scaled)
@@ -200,11 +196,10 @@ def analyze_file(file, label_col, n_clusters):
         plt.savefig(buf, format="png", bbox_inches="tight")
         plt.close()
         buf.seek(0)
-        agg_img = Image.open(buf)  # Convert to PIL Image
     except Exception as e:
         results_text += f"\nWarning: Agglomerative clustering failed: {e}"
-    # Differentiating Features
     try:
         f_scores, _ = f_classif(X_processed, clusters_kmeans)
         f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
@@ -217,13 +212,12 @@ def analyze_file(file, label_col, n_clusters):
         plt.savefig(buf, format="png", bbox_inches="tight")
         plt.close()
         buf.seek(0)
-        diff_img = Image.open(buf)  # Convert to PIL Image
     except Exception as e:
         results_text += f"\nWarning: Could not compute differentiating features: {e}"
     return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
-# Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("## Data Analysis Explorer")
     gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
@@ -248,7 +242,7 @@ with gr.Blocks() as demo:
         with gr.TabItem("Prediction Plot"):
             gr.Markdown("### Prediction Visualization")
-            gr.Markdown("Regression shows true vs. predicted values. Classification shows a confusion matrix of correct/incorrect predictions.")
             model_img_output = gr.Image(label="Prediction Output")
         with gr.TabItem("Feature Importances"):

 import matplotlib.pyplot as plt
 import seaborn as sns
 import io
+from PIL import Image
+# Constants
 RANDOM_STATE = 42
 MIN_ROWS = 10
 MIN_COLS = 2
 MAX_FEATURES_TO_SHOW = 10
 def update_dropdown(file):
     if file is None:
         return gr.update(choices=[], value=None)
     try:
         return gr.update(choices=[], value=None)
 def analyze_file(file, label_col, n_clusters):
     if file is None:
         return ("Please upload a file.", None, None, None, None, None)
     try:
         if file.name.endswith('.csv'):
             df = pd.read_csv(file.name)
     except Exception as e:
         return (f"Error reading file: {e}", None, None, None, None, None)
     if df.empty:
         return ("File is empty.", None, None, None, None, None)
     if label_col not in df.columns:
         return (f"Label column '{label_col}' not found.", None, None, None, None, None)
     df = df.dropna()
     if df.shape[0] < MIN_ROWS:
         return (f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.", None, None, None, None, None)
     if df.shape[1] < MIN_COLS:
         return ("Need at least one feature and one label column.", None, None, None, None, None)
     y = df[label_col]
     X = df.drop(columns=[label_col])
+    X_processed = pd.get_dummies(X)
     if X_processed.shape[1] == 0:
         return ("No valid features after preprocessing.", None, None, None, None, None)
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X_processed)
     agg_img = None
     diff_img = None
     try:
         if pd.api.types.is_numeric_dtype(y):
             # Regression
                 "Regression Results:\n"
                 f"- MSE: {mse:.3f}\n"
                 f"- R²: {r2:.3f}\n"
+                "\nCheck the 'Feature Importances' tab to see the top features impacting predictions.\n"
             )
+            # 2D Plots: Top 3 features vs predicted and true vs predicted
             fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
+            top_features = fi.head(3).index
+            fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+            axes = axes.flatten()
+            for i, feature in enumerate(top_features):
+                ax = axes[i]
+                ax.scatter(X_test[feature], y_pred, alpha=0.5)
+                ax.set_xlabel(feature)
+                ax.set_ylabel('Predicted SalePrice')
+                ax.set_title(f'{feature} vs Predicted SalePrice')
+            ax = axes[3]
+            ax.scatter(y_test, y_pred, alpha=0.5)
+            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
+            ax.set_xlabel('True SalePrice')
+            ax.set_ylabel('Predicted SalePrice')
+            ax.set_title('True vs Predicted SalePrice')
+            min_val = min(y_test.min(), y_pred.min())
+            max_val = max(y_test.max(), y_pred.max())
+            ax.set_xlim(min_val, max_val)
+            ax.set_ylim(min_val, max_val)
+            ax.legend()
+            plt.tight_layout()
+            buf = io.BytesIO()
+            plt.savefig(buf, format="png", bbox_inches="tight")
+            plt.close()
+            buf.seek(0)
+            model_img = Image.open(buf)
         else:
+            # Classification (unchanged)
             if len(y.unique()) < 2:
                 return ("Label must have at least 2 unique values.", None, None, None, None, None)
             y_encoded, uniques = pd.factorize(y)
             y_pred = model.predict(X_test)
             cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
             results_text += "Classification Results:\n" + cr + "\n"
             fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
             if len(fi) < 3:
                 results_text += "\nNot enough features for a 3D plot with the next two most important features."
             else:
+                next_two_features = fi.index[1:3]
                 fig = plt.figure(figsize=(10, 8))
                 ax = fig.add_subplot(111, projection='3d')
                 scatter = ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c=y_test, cmap='viridis', marker='o')
     except Exception as e:
         results_text += f"\nError during model training: {e}"
     try:
         fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
         plt.figure(figsize=(10, 6))
         plt.savefig(buf, format="png", bbox_inches="tight")
         plt.close()
         buf.seek(0)
+        fi_img = Image.open(buf)
     except Exception as e:
         results_text += f"\nWarning: Could not compute feature importance: {e}"
     try:
         kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
         clusters_kmeans = kmeans.fit_predict(X_scaled)
         plt.savefig(buf, format="png", bbox_inches="tight")
         plt.close()
         buf.seek(0)
+        kmeans_img = Image.open(buf)
     except Exception as e:
         results_text += f"\nWarning: KMeans clustering failed: {e}"
     try:
         agg = AgglomerativeClustering(n_clusters=n_clusters)
         clusters_agg = agg.fit_predict(X_scaled)
         plt.savefig(buf, format="png", bbox_inches="tight")
         plt.close()
         buf.seek(0)
+        agg_img = Image.open(buf)
     except Exception as e:
         results_text += f"\nWarning: Agglomerative clustering failed: {e}"
     try:
         f_scores, _ = f_classif(X_processed, clusters_kmeans)
         f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
         plt.savefig(buf, format="png", bbox_inches="tight")
         plt.close()
         buf.seek(0)
+        diff_img = Image.open(buf)
     except Exception as e:
         results_text += f"\nWarning: Could not compute differentiating features: {e}"
     return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
 with gr.Blocks() as demo:
     gr.Markdown("## Data Analysis Explorer")
     gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
         with gr.TabItem("Prediction Plot"):
             gr.Markdown("### Prediction Visualization")
+            gr.Markdown("For regression, shows scatter plots of the top three features vs. predicted values and a plot of true vs. predicted values. For classification, shows a 3D plot of the label vs. next two features.")
             model_img_output = gr.Image(label="Prediction Output")
         with gr.TabItem("Feature Importances"):