Spaces:

jaker86
/

data_science_crash_course

Sleeping

App Files Files Community

jaker86 committed on Feb 25, 2025

Commit

755fb3a

verified ·

1 Parent(s): d1cd9a7

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -27

app.py CHANGED Viewed

@@ -93,43 +93,59 @@ def analyze_file(file, label_col, n_clusters):
             mse = mean_squared_error(y_test, y_pred)
             r2 = r2_score(y_test, y_pred)
             results_text += (
-                "Regression Results (predicting numeric values):\n"
-                f"- Mean Squared Error (MSE): {mse:.3f} (lower is better)\n"
-                f"- R² Score: {r2:.3f} (0 to 1, higher is better)\n"
             )
-            plt.figure(figsize=(8, 6))
-            plt.scatter(y_test, y_pred, alpha=0.7)
-            plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
-            plt.xlabel("True Values")
-            plt.ylabel("Predicted Values")
-            plt.title("Regression: True vs Predicted")
-            buf = io.BytesIO()
-            plt.savefig(buf, format="png", bbox_inches="tight")
-            plt.close()
-            buf.seek(0)
-            model_img = Image.open(buf)  # Convert to PIL Image
         else:
             # Classification
             if len(y.unique()) < 2:
-                return ("Label column must have at least 2 unique values for classification.", None, None, None, None, None)
             y_encoded, uniques = pd.factorize(y)
             X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
             model = RandomForestClassifier(random_state=RANDOM_STATE)
             model.fit(X_train, y_train)
             y_pred = model.predict(X_test)
-            cm = confusion_matrix(y_test, y_pred)
             cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
-            results_text += "Classification Results (predicting categories):\n" + cr + "\n"
-            plt.figure(figsize=(8, 6))
-            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=uniques, yticklabels=uniques)
-            plt.xlabel("Predicted")
-            plt.ylabel("True")
-            plt.title("Confusion Matrix")
-            buf = io.BytesIO()
-            plt.savefig(buf, format="png", bbox_inches="tight")
-            plt.close()
-            buf.seek(0)
-            model_img = Image.open(buf)  # Convert to PIL Image
     except Exception as e:
         results_text += f"\nError during model training: {e}"

             mse = mean_squared_error(y_test, y_pred)
             r2 = r2_score(y_test, y_pred)
             results_text += (
+                "Regression Results:\n"
+                f"- MSE: {mse:.3f}\n"
+                f"- R²: {r2:.3f}\n"
             )
+            # 3D Plot with next two most important features
+            fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
+            if len(fi) < 3:
+                results_text += "\nNot enough features for a 3D plot with the next two most important features."
+            else:
+                next_two_features = fi.index[1:3]  # Second and third most important features
+                fig = plt.figure(figsize=(10, 8))
+                ax = fig.add_subplot(111, projection='3d')
+                ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c='blue', marker='o', label='True Values')
+                ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_pred, c='red', marker='^', label='Predicted Values')
+                ax.set_xlabel(next_two_features[0])
+                ax.set_ylabel(next_two_features[1])
+                ax.set_zlabel(label_col)
+                ax.set_title("3D Plot: Label vs Next Two Most Important Features")
+                ax.legend()
+                buf = io.BytesIO()
+                plt.savefig(buf, format="png", bbox_inches="tight")
+                plt.close()
+                buf.seek(0)
+                model_img = Image.open(buf)
         else:
             # Classification
             if len(y.unique()) < 2:
+                return ("Label must have at least 2 unique values.", None, None, None, None, None)
             y_encoded, uniques = pd.factorize(y)
             X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
             model = RandomForestClassifier(random_state=RANDOM_STATE)
             model.fit(X_train, y_train)
             y_pred = model.predict(X_test)
             cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
+            results_text += "Classification Results:\n" + cr + "\n"
+            # 3D Plot with next two most important features
+            fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
+            if len(fi) < 3:
+                results_text += "\nNot enough features for a 3D plot with the next two most important features."
+            else:
+                next_two_features = fi.index[1:3]  # Second and third most important features
+                fig = plt.figure(figsize=(10, 8))
+                ax = fig.add_subplot(111, projection='3d')
+                scatter = ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c=y_test, cmap='viridis', marker='o')
+                ax.set_xlabel(next_two_features[0])
+                ax.set_ylabel(next_two_features[1])
+                ax.set_zlabel(label_col + " (encoded)")
+                ax.set_title("3D Plot: Label vs Next Two Most Important Features")
+                buf = io.BytesIO()
+                plt.savefig(buf, format="png", bbox_inches="tight")
+                plt.close()
+                buf.seek(0)
+                model_img = Image.open(buf)
     except Exception as e:
         results_text += f"\nError during model training: {e}"