Spaces:

jaker86
/

data_science_crash_course

Sleeping

App Files Files Community

jaker86 committed on Mar 3, 2025

Commit

2fe6c63

verified ·

1 Parent(s): f0ece10

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -24

app.py CHANGED Viewed

@@ -19,6 +19,9 @@ MIN_ROWS = 10
 MIN_COLS = 2
 MAX_FEATURES_TO_SHOW = 10
 def update_dropdown(file):
     if file is None:
         return gr.update(choices=[], value=None)
@@ -98,14 +101,14 @@ def analyze_file(file, label_col, n_clusters):
                 ax = axes[i]
                 ax.scatter(X_test[feature], y_pred, alpha=0.5)
                 ax.set_xlabel(feature)
-                ax.set_ylabel('Predicted SalePrice')
-                ax.set_title(f'{feature} vs Predicted SalePrice')
             ax = axes[3]
             ax.scatter(y_test, y_pred, alpha=0.5)
             ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
-            ax.set_xlabel('True SalePrice')
-            ax.set_ylabel('Predicted SalePrice')
-            ax.set_title('True vs Predicted SalePrice')
             min_val = min(y_test.min(), y_pred.min())
             max_val = max(y_test.max(), y_pred.max())
             ax.set_xlim(min_val, max_val)
@@ -117,8 +120,9 @@ def analyze_file(file, label_col, n_clusters):
             plt.close()
             buf.seek(0)
             model_img = Image.open(buf)
         else:
-            # Classification (unchanged)
             if len(y.unique()) < 2:
                 return ("Label must have at least 2 unique values.", None, None, None, None, None)
             y_encoded, uniques = pd.factorize(y)
@@ -128,23 +132,19 @@ def analyze_file(file, label_col, n_clusters):
             y_pred = model.predict(X_test)
             cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
             results_text += "Classification Results:\n" + cr + "\n"
-            fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
-            if len(fi) < 3:
-                results_text += "\nNot enough features for a 3D plot with the next two most important features."
-            else:
-                next_two_features = fi.index[1:3]
-                fig = plt.figure(figsize=(10, 8))
-                ax = fig.add_subplot(111, projection='3d')
-                scatter = ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c=y_test, cmap='viridis', marker='o')
-                ax.set_xlabel(next_two_features[0])
-                ax.set_ylabel(next_two_features[1])
-                ax.set_zlabel(label_col + " (encoded)")
-                ax.set_title("3D Plot: Label vs Next Two Most Important Features")
-                buf = io.BytesIO()
-                plt.savefig(buf, format="png", bbox_inches="tight")
-                plt.close()
-                buf.seek(0)
-                model_img = Image.open(buf)
     except Exception as e:
         results_text += f"\nError during model training: {e}"
@@ -218,6 +218,63 @@ def analyze_file(file, label_col, n_clusters):
     return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
 with gr.Blocks() as demo:
     gr.Markdown("## Data Analysis Explorer")
     gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
@@ -242,7 +299,7 @@ with gr.Blocks() as demo:
         with gr.TabItem("Prediction Plot"):
             gr.Markdown("### Prediction Visualization")
-            gr.Markdown("For regression, shows scatter plots of the top three features vs. predicted values and a plot of true vs. predicted values. For classification, shows a 3D plot of the label vs. next two features.")
             model_img_output = gr.Image(label="Prediction Output")
         with gr.TabItem("Feature Importances"):
@@ -265,6 +322,21 @@ with gr.Blocks() as demo:
             gr.Markdown("Shows features that vary most between clusters, helping explain the groupings.")
             diff_output = gr.Image(label="Differentiating Features")
     analyze_btn.click(fn=analyze_file, inputs=[file_input, label_dropdown, clusters_slider],
                       outputs=[results_textbox, model_img_output, fi_output, kmeans_output, agg_output, diff_output])

 MIN_COLS = 2
 MAX_FEATURES_TO_SHOW = 10
+# Global variable to store trained model and data
+global_data = {'model': None, 'scaler': None, 'X_columns': None, 'y_type': None, 'uniques': None}
 def update_dropdown(file):
     if file is None:
         return gr.update(choices=[], value=None)
                 ax = axes[i]
                 ax.scatter(X_test[feature], y_pred, alpha=0.5)
                 ax.set_xlabel(feature)
+                ax.set_ylabel('Predicted Value')
+                ax.set_title(f'{feature} vs Predicted')
             ax = axes[3]
             ax.scatter(y_test, y_pred, alpha=0.5)
             ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
+            ax.set_xlabel('True Value')
+            ax.set_ylabel('Predicted Value')
+            ax.set_title('True vs Predicted')
             min_val = min(y_test.min(), y_pred.min())
             max_val = max(y_test.max(), y_pred.max())
             ax.set_xlim(min_val, max_val)
             plt.close()
             buf.seek(0)
             model_img = Image.open(buf)
+            global_data.update({'model': model, 'scaler': scaler, 'X_columns': X_processed.columns, 'y_type': 'regression', 'uniques': None})
         else:
+            # Classification
             if len(y.unique()) < 2:
                 return ("Label must have at least 2 unique values.", None, None, None, None, None)
             y_encoded, uniques = pd.factorize(y)
             y_pred = model.predict(X_test)
             cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
             results_text += "Classification Results:\n" + cr + "\n"
+            # 2D Confusion Matrix
+            cm = confusion_matrix(y_test, y_pred)
+            plt.figure(figsize=(8, 6))
+            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=[str(u) for u in uniques], yticklabels=[str(u) for u in uniques])
+            plt.xlabel('Predicted')
+            plt.ylabel('True')
+            plt.title('Confusion Matrix')
+            buf = io.BytesIO()
+            plt.savefig(buf, format="png", bbox_inches="tight")
+            plt.close()
+            buf.seek(0)
+            model_img = Image.open(buf)
+            global_data.update({'model': model, 'scaler': scaler, 'X_columns': X_processed.columns, 'y_type': 'classification', 'uniques': uniques})
     except Exception as e:
         results_text += f"\nError during model training: {e}"
     return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
def predict_interactive(**kwargs):
    """Predict a single value or class from user-supplied feature values.

    The model, scaler, training column layout, task type and class labels
    are read from the module-level ``global_data`` dict, which the analysis
    step fills in after training.

    NOTE(review): Gradio event listeners normally hand component values to
    the handler positionally; this handler only accepts keyword arguments,
    so confirm how it is wired before relying on it from a ``click`` event.

    Args:
        **kwargs: Feature values for one row. Presumably keyed by the raw
            feature column names used at training time -- verify against
            the caller.

    Returns:
        str: ``"Predicted class: ..."`` or ``"Predicted value: ..."`` on
        success, a prompt to analyze a file first when no model is stored,
        or an ``"Error in prediction: ..."`` message if any step raises.
    """
    model = global_data['model']
    if model is None:
        return "Please analyze a file first to train the model."
    try:
        # One-row frame from the inputs; non-numeric columns are one-hot
        # encoded by get_dummies.
        encoded = pd.get_dummies(pd.DataFrame([kwargs]))
        # Align to the training layout in one step: columns missing from the
        # input are created and filled with 0, columns not in the training
        # layout are dropped, and the order follows the stored columns.
        # This replaces inserting the missing columns one at a time.
        aligned = encoded.reindex(columns=global_data['X_columns'], fill_value=0)
        scaled = global_data['scaler'].transform(aligned)
        prediction = model.predict(scaled)
        if global_data['y_type'] == 'classification':
            # The model predicts an integer code; map it back to the
            # original label through the stored uniques.
            pred_value = global_data['uniques'][int(prediction[0])]
            return f"Predicted class: {pred_value}"
        return f"Predicted value: {prediction[0]:.3f}"
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error in prediction: {str(e)}. Please ensure all inputs are valid numbers or categories."
def create_interactive_inputs(file, label_col):
    """Build one Gradio input component per feature column of an upload.

    Every column except ``label_col`` gets a ``gr.Number`` (numeric dtype)
    or a ``gr.Textbox`` (anything else). Each label shows up to three
    randomly sampled non-null example values from that column.

    Args:
        file: Uploaded file object exposing a ``.name`` path, or ``None``.
        label_col: Name of the target column to exclude, or ``None``.

    Returns:
        list: The created components. An empty list is returned when either
        argument is ``None``, when the extension is neither ``.csv`` nor
        ``.xlsx``, or when reading/processing the file raises.
    """
    if file is None or label_col is None:
        return []
    try:
        path = file.name
        # Compare the extension case-insensitively so uploads such as
        # "DATA.CSV" are not silently rejected.
        lowered = path.lower()
        if lowered.endswith('.csv'):
            df = pd.read_csv(path)
        elif lowered.endswith('.xlsx'):
            df = pd.read_excel(path)
        else:
            return []
        features = df.drop(columns=[label_col])
        inputs = []
        for col in features.columns:
            series = features[col]
            # Drop nulls once and reuse the result for both the sample size
            # and the sample itself.
            non_null = series.dropna()
            examples = non_null.sample(min(3, len(non_null))).tolist()
            label = f"{col} (e.g., {', '.join(map(str, examples))})"
            if pd.api.types.is_numeric_dtype(series):
                inputs.append(gr.Number(label=label))
            else:
                inputs.append(gr.Textbox(label=label))
        return inputs
    except Exception:
        # Deliberately best-effort: any failure (unreadable file, unknown
        # label column, ...) results in no inputs rather than an error.
        return []
 with gr.Blocks() as demo:
     gr.Markdown("## Data Analysis Explorer")
     gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
         with gr.TabItem("Prediction Plot"):
             gr.Markdown("### Prediction Visualization")
+            gr.Markdown("For regression: scatter plots of top 3 features vs. predicted values and true vs. predicted. For classification: confusion matrix.")
             model_img_output = gr.Image(label="Prediction Output")
         with gr.TabItem("Feature Importances"):
             gr.Markdown("Shows features that vary most between clusters, helping explain the groupings.")
             diff_output = gr.Image(label="Differentiating Features")
+        with gr.TabItem("Interactive"):
+            gr.Markdown("### Interactive Prediction")
+            gr.Markdown("Enter values for each feature to get a prediction based on the trained model.")
+            interactive_inputs = gr.State(value=[])
+            with gr.Column():
+                input_components = gr.DynamicLayout(fn=create_interactive_inputs, inputs=[file_input, label_dropdown], outputs=interactive_inputs)
+                predict_btn = gr.Button("Predict")
+                prediction_output = gr.Textbox(label="Prediction Result")
+            predict_btn.click(
+                fn=predict_interactive,
+                inputs=interactive_inputs,
+                outputs=prediction_output
+            )
     analyze_btn.click(fn=analyze_file, inputs=[file_input, label_dropdown, clusters_slider],
                       outputs=[results_textbox, model_img_output, fi_output, kmeans_output, agg_output, diff_output])