Spaces:

jaker86
/

data_science_crash_course

Sleeping

File size: 17,444 Bytes

13bf251
 
 
 
 
 
6753c42
13bf251
6753c42
 
13bf251
 
 
3e930db
13bf251
3e930db
e791e5b
 
 
 
 
2fe6c63
 
 
eaca5d0
 
5e1bd42
13bf251
eaca5d0
 
 
 
 
5e1bd42
 
e791e5b
5715ac3
5e1bd42
eaca5d0
 
e791e5b
 
 
eaca5d0
 
 
 
 
 
 
13bf251
5715ac3
eaca5d0
 
e791e5b
 
13bf251
e791e5b
 
13bf251
e791e5b
 
 
eaca5d0
 
13bf251
6753c42
3e930db
e791e5b
 
 
6753c42
 
eaca5d0
13bf251
6753c42
864aed7
 
 
 
13bf251
e791e5b
 
 
 
 
 
 
 
 
 
755fb3a
 
 
3e930db
e791e5b
3e930db
755fb3a
3e930db
 
 
 
 
 
 
2fe6c63
 
3e930db
 
 
2fe6c63
 
 
3e930db
 
 
 
 
 
 
 
 
 
 
2fe6c63
e791e5b
2fe6c63
e791e5b
755fb3a
e791e5b
 
 
 
 
 
755fb3a
2fe6c63
 
 
 
 
 
 
 
 
 
 
 
 
e791e5b
864aed7
e791e5b
 
 
 
 
 
 
 
 
 
 
 
3e930db
e791e5b
 
 
 
 
 
 
 
 
eaca5d0
e791e5b
 
 
 
 
13bf251
6753c42
13bf251
 
3e930db
e791e5b
 
 
 
 
 
eaca5d0
e791e5b
 
 
 
 
 
 
 
 
3e930db
e791e5b
 
 
 
 
5b239bf
 
e791e5b
 
5b239bf
e791e5b
 
 
5b239bf
13bf251
6753c42
13bf251
 
3e930db
e791e5b
 
13bf251
6753c42
13bf251
5b239bf
2fe6c63
 
 
 
5b239bf
 
 
 
 
 
 
2fe6c63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5715ac3
2fe6c63
 
 
 
 
 
 
 
5715ac3
 
 
 
 
2fe6c63
 
 
5715ac3
 
 
 
3ee9608
2fe6c63
 
 
6f355e0
2fe6c63
6f355e0
 
5715ac3
3ee9608
5715ac3
 
2fe6c63
 
13bf251
eaca5d0
ee936fb
 
 
 
e791e5b
ee936fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fe6c63
ee936fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fe6c63
1ed57a6
 
 
 
 
 
 
2fe6c63
6f355e0
1ed57a6
6f355e0
1ed57a6
 
 
 
 
 
6f355e0
1ed57a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fe6c63
ee936fb
 
 
5715ac3

import pandas as pd
import numpy as np
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt
import seaborn as sns
import io
from PIL import Image

# Constants
RANDOM_STATE = 42  # Seed passed to train_test_split, the random forests, KMeans and PCA
MIN_ROWS = 10  # Minimum number of rows required after rows with missing values are dropped
MIN_COLS = 2  # Minimum number of columns (at least one feature plus the label column)
MAX_FEATURES_TO_SHOW = 10  # How many top-ranked features the bar charts display

# Global variable to store trained model and data.
# analyze_file() fills this in after a successful fit; predict_interactive() reads it.
#   model     - fitted RandomForestRegressor / RandomForestClassifier (None until trained)
#   scaler    - StandardScaler fitted on the one-hot encoded features
#   X_columns - column index of the one-hot encoded feature matrix
#   y_type    - 'regression' or 'classification'
#   uniques   - class labels from pd.factorize (classification only, otherwise None)
global_data = {'model': None, 'scaler': None, 'X_columns': None, 'y_type': None, 'uniques': None}

def update_dropdown(file):
    """Populate the label dropdown with the column names of the uploaded file.

    Only the header row is parsed (``nrows=0``), so large uploads do not have
    to be read in full just to list their columns. The extension check is
    case-insensitive so that e.g. ``DATA.CSV`` is accepted.

    Args:
        file: Uploaded file object exposing a ``name`` path attribute, or None.

    Returns:
        A ``gr.update`` for the dropdown: the file's columns as choices, or an
        empty choice list when there is no file, the type is unsupported, or
        the header cannot be read.
    """
    if file is None:
        return gr.update(choices=[], value=None)
    try:
        filename = file.name.lower()
        if filename.endswith('.csv'):
            header = pd.read_csv(file.name, nrows=0)
        elif filename.endswith('.xlsx'):
            header = pd.read_excel(file.name, nrows=0)
        else:
            return gr.update(choices=[], value=None)
        return gr.update(choices=list(header.columns), value=None)
    except Exception as e:
        # Best-effort UI callback: log and fall back to an empty dropdown.
        print(f"Error in update_dropdown: {e}")  # Debug logging
        return gr.update(choices=[], value=None)

def _empty_result(message):
    """Return the six-slot ``analyze_file`` result carrying only a status message."""
    return (message, None, None, None, None, None)


def _current_figure_to_image():
    """Render the active Matplotlib figure to a PIL image, always closing the figure."""
    buf = io.BytesIO()
    try:
        plt.savefig(buf, format="png", bbox_inches="tight")
    finally:
        # Close even when saving fails so repeated analyses do not pile up figures.
        plt.close()
    buf.seek(0)
    return Image.open(buf)


def _plot_regression(model, feature_columns, X_test, y_test, y_pred):
    """Plot the top-3 features vs. predictions plus a true-vs-predicted panel."""
    fi = pd.Series(model.feature_importances_, index=feature_columns).sort_values(ascending=False)
    top_features = fi.head(3).index
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.flatten()
    for i, feature in enumerate(top_features):
        ax = axes[i]
        ax.scatter(X_test[feature], y_pred, alpha=0.5)
        ax.set_xlabel(feature)
        ax.set_ylabel('Predicted Value')
        ax.set_title(f'{feature} vs Predicted')
    # Hide panels left unused when the data has fewer than three features.
    for unused in axes[len(top_features):3]:
        unused.set_visible(False)
    ax = axes[3]
    ax.scatter(y_test, y_pred, alpha=0.5)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
    ax.set_xlabel('True Value')
    ax.set_ylabel('Predicted Value')
    ax.set_title('True vs Predicted')
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    ax.set_xlim(min_val, max_val)
    ax.set_ylim(min_val, max_val)
    ax.legend()
    plt.tight_layout()
    return _current_figure_to_image()


def _plot_confusion(cm, class_names):
    """Plot a labelled confusion-matrix heatmap."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    return _current_figure_to_image()


def _plot_clusters(points_2d, labels, n_clusters, cmap, title):
    """Scatter the 2-D PCA projection coloured by cluster assignment."""
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(points_2d[:, 0], points_2d[:, 1], c=labels, cmap=cmap, alpha=0.7)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.title(title)
    plt.colorbar(scatter, ticks=range(n_clusters))
    return _current_figure_to_image()


def analyze_file(file, label_col, n_clusters):
    """Train a random forest on the uploaded table and cluster its features.

    Rows with missing values are dropped, features are one-hot encoded, and a
    regressor (numeric, non-boolean label) or classifier (anything else) is
    fitted on a 70/30 split. KMeans and agglomerative clustering are run on
    the standardised features and shown on a shared 2-D PCA projection. Each
    stage is independent: a failure in one is reported in the text output and
    does not produce misleading errors in the later stages.

    On a successful fit, ``global_data`` is updated with the model, scaler,
    encoded column index, raw feature names (``raw_columns``), label type and
    class labels so that ``predict_interactive`` can reuse them.

    Args:
        file: Uploaded file object exposing a ``name`` path attribute, or None.
        label_col: Name of the column to predict.
        n_clusters: Number of clusters for both clustering algorithms.

    Returns:
        Tuple ``(results_text, model_img, fi_img, kmeans_img, agg_img,
        diff_img)``; every image slot is a PIL image or None.
    """
    if file is None:
        return _empty_result("Please upload a file.")

    try:
        filename = file.name.lower()  # accept upper-case extensions such as .CSV
        if filename.endswith('.csv'):
            df = pd.read_csv(file.name)
        elif filename.endswith('.xlsx'):
            df = pd.read_excel(file.name)
        else:
            return _empty_result("Unsupported file type. Please upload a CSV or XLSX file.")
    except Exception as e:
        print(f"Error reading file: {e}")  # Debug logging
        return _empty_result(f"Error reading file: {e}")

    if df.empty:
        return _empty_result("File is empty.")
    if label_col not in df.columns:
        return _empty_result(f"Label column '{label_col}' not found.")

    df = df.dropna()
    if df.shape[0] < MIN_ROWS:
        return _empty_result(f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.")
    if df.shape[1] < MIN_COLS:
        return _empty_result("Need at least one feature and one label column.")

    y = df[label_col]
    X = df.drop(columns=[label_col])
    X_processed = pd.get_dummies(X)
    if X_processed.shape[1] == 0:
        return _empty_result("No valid features after preprocessing.")

    # Scaling can fail on columns get_dummies leaves untouched (e.g. datetimes);
    # report that instead of letting the exception escape the callback.
    try:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_processed)
    except Exception as e:
        return _empty_result(f"Error preparing features: {e}")

    results_text = ""
    model_img = None
    fi_img = None
    kmeans_img = None
    agg_img = None
    diff_img = None
    model = None  # stays None when training fails so later stages can skip cleanly

    # Boolean labels count as numeric in pandas but are categories, not quantities.
    is_regression = pd.api.types.is_numeric_dtype(y) and not pd.api.types.is_bool_dtype(y)

    try:
        if is_regression:
            # Regression
            X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=RANDOM_STATE)
            regressor = RandomForestRegressor(random_state=RANDOM_STATE)
            regressor.fit(X_train, y_train)
            model = regressor
            # Publish the model right after fitting so a plotting failure
            # below does not block interactive prediction.
            global_data.update({'model': model, 'scaler': scaler, 'X_columns': X_processed.columns,
                                'raw_columns': list(X.columns), 'y_type': 'regression', 'uniques': None})
            y_pred = model.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results_text += (
                "Regression Results:\n"
                f"- MSE: {mse:.3f}\n"
                f"- R²: {r2:.3f}\n"
                "\nCheck the 'Feature Importances' tab to see the top features impacting predictions.\n"
            )
            # 2D Plots: Top 3 features vs predicted and true vs predicted
            model_img = _plot_regression(model, X_processed.columns, X_test, y_test, y_pred)
        else:
            # Classification
            if len(y.unique()) < 2:
                return _empty_result("Label must have at least 2 unique values.")
            y_encoded, uniques = pd.factorize(y)
            X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
            classifier = RandomForestClassifier(random_state=RANDOM_STATE)
            classifier.fit(X_train, y_train)
            model = classifier
            global_data.update({'model': model, 'scaler': scaler, 'X_columns': X_processed.columns,
                                'raw_columns': list(X.columns), 'y_type': 'classification', 'uniques': uniques})
            y_pred = model.predict(X_test)
            class_names = [str(u) for u in uniques]
            # Pass every class code explicitly: the test split may lack a rare
            # class, which would otherwise mismatch target_names / tick labels.
            class_codes = list(range(len(uniques)))
            cr = classification_report(y_test, y_pred, labels=class_codes, target_names=class_names, zero_division=0)
            results_text += "Classification Results:\n" + cr + "\n"
            # 2D Confusion Matrix
            cm = confusion_matrix(y_test, y_pred, labels=class_codes)
            model_img = _plot_confusion(cm, class_names)
    except Exception as e:
        plt.close("all")
        results_text += f"\nError during model training: {e}"

    if model is not None:
        try:
            fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
            plt.figure(figsize=(10, 6))
            sns.barplot(x=fi.values, y=fi.index)
            plt.title("Top 10 Feature Importances")
            plt.xlabel("Importance")
            plt.ylabel("Feature")
            fi_img = _current_figure_to_image()
        except Exception as e:
            plt.close("all")
            results_text += f"\nWarning: Could not compute feature importance: {e}"

    # The PCA projection is shared by both cluster plots, so compute it in its
    # own stage rather than inside the KMeans one.
    X_pca = None
    explained_var = None
    try:
        pca = PCA(n_components=2, random_state=RANDOM_STATE)
        X_pca = pca.fit_transform(X_scaled)
        explained_var = sum(pca.explained_variance_ratio_)
    except Exception as e:
        results_text += f"\nWarning: PCA projection failed: {e}"

    clusters_kmeans = None
    try:
        k = int(n_clusters)  # sliders may deliver a float; sklearn and range() need an int
        kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE)
        clusters_kmeans = kmeans.fit_predict(X_scaled)
        if X_pca is not None:
            kmeans_img = _plot_clusters(
                X_pca, clusters_kmeans, k, "viridis",
                f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)",
            )
    except Exception as e:
        plt.close("all")
        results_text += f"\nWarning: KMeans clustering failed: {e}"

    try:
        k = int(n_clusters)
        agg = AgglomerativeClustering(n_clusters=k)
        clusters_agg = agg.fit_predict(X_scaled)
        if X_pca is not None:
            agg_img = _plot_clusters(
                X_pca, clusters_agg, k, "plasma",
                f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)",
            )
    except Exception as e:
        plt.close("all")
        results_text += f"\nWarning: Agglomerative clustering failed: {e}"

    # Only meaningful when KMeans produced labels; its failure is already reported.
    if clusters_kmeans is not None:
        try:
            f_scores, _ = f_classif(X_processed, clusters_kmeans)
            # Handle potential division by zero or NaN values
            f_scores = np.nan_to_num(f_scores, nan=0.0, posinf=0.0)
            f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
            plot_df = pd.DataFrame({"Feature": f_series.index.astype(str), "F-score": f_series.values})
            plt.figure(figsize=(10, 6))
            sns.barplot(data=plot_df, x="Feature", y="F-score", hue="Feature", legend=False)
            plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
            # Features are on the x-axis and scores on the y-axis.
            plt.xlabel("Feature")
            plt.ylabel("F-score")
            plt.xticks(rotation=45)
            diff_img = _current_figure_to_image()
        except Exception as e:
            plt.close("all")
            results_text += f"\nWarning: Could not compute differentiating features: {e}"

    return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img


def predict_interactive(*args):
    """Predict with the model stored in ``global_data`` from one value per feature.

    Positional values are matched, in order, to the raw feature columns
    recorded at training time (``global_data['raw_columns']``); when that key
    is absent the encoded column names in ``X_columns`` are used instead. The
    row is one-hot encoded and aligned to the training columns, with unseen or
    missing dummy columns filled with 0. The model is fitted on unscaled
    features, so the row is passed to it unscaled.

    Args:
        *args: Feature values in the same order as the feature columns.

    Returns:
        A human-readable prediction string, or an error / guidance message.
    """
    if global_data['model'] is None:
        return "Please analyze a file first to train the model."

    try:
        expected_columns = global_data['X_columns']
        raw_columns = global_data.get('raw_columns')
        if raw_columns is None:
            raw_columns = expected_columns

        # Pair inputs with feature names; zip stops at the shorter sequence.
        row = dict(zip(raw_columns, args)) if raw_columns is not None else {}

        # Create DataFrame from user inputs and one-hot encode categoricals
        input_data = pd.DataFrame([row])
        encoded = pd.get_dummies(input_data)

        # Add missing training columns as 0, drop unknown ones, match training order
        encoded = encoded.reindex(columns=expected_columns, fill_value=0)

        # Predict on unscaled features, matching how the model was trained
        prediction = global_data['model'].predict(encoded)

        if global_data['y_type'] == 'classification':
            pred_value = global_data['uniques'][int(prediction[0])]
            return f"Predicted class: {pred_value}"
        else:
            return f"Predicted value: {prediction[0]:.3f}"
    except Exception as e:
        return f"Error in prediction: {str(e)}. Please ensure all inputs are valid numbers or categories."

def create_interactive_inputs(file, label_col):
    """Build one Gradio input component per feature column of the uploaded file.

    Numeric columns get a ``gr.Number``; all other columns get a
    ``gr.Dropdown`` listing their distinct non-null values. Up to three
    randomly sampled non-null values are shown in each label as examples.

    Args:
        file: Uploaded file object exposing a ``name`` path attribute, or None.
        label_col: Name of the label column, which is excluded from the inputs.

    Returns:
        A list of Gradio components in feature-column order, or an empty list
        when the file/label is missing, unsupported, unreadable, or leaves no
        feature columns.
    """
    if file is None or label_col is None:
        print("No file or label column provided")  # Debug logging
        return []

    try:
        filename = file.name.lower()  # accept upper-case extensions such as .CSV
        if filename.endswith('.csv'):
            df = pd.read_csv(file.name)
        elif filename.endswith('.xlsx'):
            df = pd.read_excel(file.name)
        else:
            print("Unsupported file type")  # Debug logging
            return []

        if df.empty or label_col not in df.columns:
            print(f"Empty DataFrame or invalid label column: {label_col}")  # Debug logging
            return []

        X = df.drop(columns=[label_col])
        if X.empty:
            print("No features available after dropping label column")  # Debug logging
            return []

        components = []
        for col in X.columns:
            non_null = X[col].dropna()
            examples = non_null.sample(min(3, len(non_null))).tolist()
            # Skip the "(e.g., ...)" hint when the column has no non-null values.
            if examples:
                label = f"{col} (e.g., {', '.join(map(str, examples))})"
            else:
                label = f"{col}"
            if pd.api.types.is_numeric_dtype(X[col]):
                components.append(gr.Number(label=label, value=None))
            else:
                unique_values = non_null.unique().tolist()
                components.append(gr.Dropdown(label=label, choices=unique_values, value=None))
        print(f"Generated {len(components)} input components")  # Debug logging
        return components
    except Exception as e:
        # Best-effort UI helper: log and fall back to no inputs.
        print(f"Error in create_interactive_inputs: {e}")  # Debug logging
        return []

# UI layout: upload + label/cluster controls on top, one tab per analysis output.
with gr.Blocks() as demo:
    gr.Markdown("## Data Analysis Explorer")
    gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")

    with gr.Row():
        file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
        label_dropdown = gr.Dropdown(label="Select Column to Predict", choices=[], interactive=True)
        clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of Clusters")

    # Refresh the label choices whenever a new file is uploaded.
    file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
    analyze_btn = gr.Button("Analyze")

    with gr.Tabs():
        with gr.TabItem("Prediction Results"):
            gr.Markdown("### Classification or Regression")
            gr.Markdown("""
            - **Regression**: Predicts numbers (e.g., sales). Uses Random Forest.
            - **Classification**: Predicts categories (e.g., yes/no). Uses Random Forest.
            - Rows with missing values are removed. 70% of data trains the model; 30% tests it.
            """)
            results_textbox = gr.Textbox(label="Performance Metrics", lines=10)

        with gr.TabItem("Prediction Plot"):
            gr.Markdown("### Prediction Visualization")
            gr.Markdown("For regression: scatter plots of top 3 features vs. predicted values and true vs. predicted. For classification: confusion matrix.")
            model_img_output = gr.Image(label="Prediction Output")

        with gr.TabItem("Feature Importances"):
            gr.Markdown("### Top 10 Key Features")
            gr.Markdown("Shows the most important features for predictions. Higher bars mean bigger impact.")
            fi_output = gr.Image(label="Feature Importances")

        with gr.TabItem("KMeans Clustering"):
            gr.Markdown("### KMeans Clustering")
            gr.Markdown("Groups similar data points without using the selected column. Colors show clusters in 2D (PCA projection).")
            kmeans_output = gr.Image(label="KMeans Clusters")

        with gr.TabItem("Agglomerative Clustering"):
            gr.Markdown("### Agglomerative Clustering")
            gr.Markdown("Another way to group data hierarchically. Compare with KMeans to see differences!")
            agg_output = gr.Image(label="Agglomerative Clusters")

        with gr.TabItem("Cluster Differences"):
            gr.Markdown("### Top 10 Cluster-Differentiating Features")
            gr.Markdown("Shows features that vary most between clusters, helping explain the groupings.")
            diff_output = gr.Image(label="Differentiating Features")

        with gr.TabItem("Interactive"):
            gr.Markdown("### Interactive Prediction")
            gr.Markdown("Enter values for each feature to get a prediction based on the trained model.")
            with gr.Column():
                input_components = gr.State(value=[])
                dynamic_inputs = gr.Column(visible=True)
                predict_btn = gr.Button("Predict")
                prediction_output = gr.Textbox(label="Prediction Result")

            def update_inputs(file, label_col):
                """Rebuild the per-feature inputs for the current file and label.

                Returns the component list (stored in the ``input_components``
                state) and an update that keeps ``dynamic_inputs`` visible.
                """
                print(f"Updating inputs with file: {file}, label_col: {label_col}")  # Debug logging
                components = create_interactive_inputs(file, label_col)
                # NOTE(review): the components are only stored in the State; nothing
                # here renders them inside `dynamic_inputs`. The render loop that used
                # to follow this return was unreachable and has been removed. Confirm
                # how the installed Gradio version supports dynamic layouts
                # (newer releases provide `gr.render` for this).
                return components, gr.update(visible=True)

            # Regenerate the inputs when either the file or the chosen label changes.
            file_input.change(
                fn=update_inputs,
                inputs=[file_input, label_dropdown],
                outputs=[input_components, dynamic_inputs]
            )
            label_dropdown.change(
                fn=update_inputs,
                inputs=[file_input, label_dropdown],
                outputs=[input_components, dynamic_inputs]
            )
            # NOTE(review): `input_components` is a single gr.State, so
            # predict_interactive presumably receives the stored list as one
            # argument rather than one value per feature - verify before relying
            # on this tab.
            predict_btn.click(
                fn=predict_interactive,
                inputs=input_components,
                outputs=prediction_output
            )

    analyze_btn.click(fn=analyze_file, inputs=[file_input, label_dropdown, clusters_slider],
                      outputs=[results_textbox, model_img_output, fi_output, kmeans_output, agg_output, diff_output])

demo.launch(debug=True)  # Enable debug mode for more detailed error logging