Spaces:

Shrouk04
/

data_analysis_agent

Sleeping

File size: 5,758 Bytes

f73646a

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
#import utils.decision  


def visualize(df, target, feature_scores, eda_report, decisions):

    print("\n--- SMART VISUALIZATION ENGINE ---")

    
    # select top features
    top_numeric = []
    top_categorical = []


    # num features
    if 'numerical_correlation' in feature_scores:
        top_numeric = feature_scores['numerical_correlation'].head(3).index.tolist()


    elif 'numerical_anova' in feature_scores:
        top_numeric = feature_scores['numerical_anova'].head(3).index.tolist()


    # categorical features
    if 'categorical_anova' in feature_scores:
        cat_cols = feature_scores['categorical_anova'].head(5).index.tolist()
        top_categorical = list(set([col.split('_')[0] for col in cat_cols]))



    elif 'categorical_chi2' in feature_scores:
        cat_cols = feature_scores['categorical_chi2'].head(5).index.tolist()
        top_categorical = list(set([col.split('_')[0] for col in cat_cols]))


    print(f"Top numerical features: {top_numeric}")
    print(f"Top categorical features: {top_categorical}")


    # check

    if not top_numeric and not top_categorical:
        print(" No important features detected. Skipping visualization.")
        return





    print("\n[0] Target Distribution")

    # categorical target
    if df[target].dtype == 'object':

        counts = df[target].value_counts()
        n_classes = len(counts)

        # pie chart ( for small number od categories)
        if n_classes <= 5:
            plt.figure()
            counts.plot(kind='pie', autopct='%1.1f%%')
            plt.title(f"{target} Distribution (Pie Chart)")
            plt.ylabel('')
            plt.show()

        # bar chart 
        plt.figure()
        counts.plot(kind='bar')
        plt.title(f"{target} Distribution (Bar Chart)")
        plt.xlabel(target)
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.show()


    # numerical target
    else:
        plt.figure()
        df[target].hist()
        plt.title(f"{target} Distribution (Histogram)")
        plt.xlabel(target)
        plt.ylabel("Frequency")
        plt.show()




    
    # Single Variable Analysis
    
    print("\n[1] Single Variable Analysis")

    for col in top_numeric:
        if col in df.columns:
            plt.figure()
            df[col].hist()
            plt.title(f"Histogram of {col}")
            plt.xlabel(col)
            plt.ylabel("Frequency")
            plt.show()


    for col in top_categorical:
        if col in df.columns:
            plt.figure()
            df[col].value_counts().head(10).plot(kind='bar')
            plt.title(f"Top Categories of {col}")
            plt.xticks(rotation=45)
            plt.show()


    # relationship with target
    
    print("\n[2] Relationship with Target")

    # case 1 --> target is numerical
    if df[target].dtype in ['int64', 'float64']:

        # numerical vs target --> scatter
        for col in top_numeric:
            if col in df.columns:
                plt.figure()
                plt.scatter(df[col], df[target])
                plt.title(f"{col} vs {target}")
                plt.xlabel(col)
                plt.ylabel(target)
                plt.show()

        # categorical vs target --> boxplot
        for col in top_categorical:
            if col in df.columns:
                plt.figure()
                sns.boxplot(x=df[col], y=df[target])
                plt.title(f"{col} vs {target}")
                plt.xticks(rotation=45)
                plt.show()

    # Case 2--> target is categorical
    else:
        # numerical vs categorical target --> boxplot
        for col in top_numeric:
            if col in df.columns:
                plt.figure()
                sns.boxplot(x=df[target], y=df[col])
                plt.title(f"{col} vs {target}")
                plt.xticks(rotation=45)
                plt.show()


        # categorical vs categorical -> grouped bar
        for col in top_categorical:
            if col in df.columns:
                plt.figure()
                pd.crosstab(df[col], df[target]).plot(kind='bar', stacked=True)
                plt.title(f"{col} vs {target}")
                plt.xticks(rotation=45)
                plt.show()

    
    # correlation heatmap
    
    print("\n[3] Correlation Heatmap")

    #heatmap_cols = top_numeric.copy()

    #if target not in heatmap_cols and target in df.columns:
    #    heatmap_cols.append(target)

    #if len(heatmap_cols) > 1:
    #    plt.figure(figsize=(8, 6))
    #    corr = df[heatmap_cols].corr()
    #    sns.heatmap(corr, annot=True, cmap='coolwarm')
    #    plt.title("Correlation Heatmap (Top Features)")
    #    plt.show()

    # keep only numeric columns
    heatmap_cols = [
        col for col in top_numeric
        if pd.api.types.is_numeric_dtype(df[col])
]

    # add only numeric target
    if pd.api.types.is_numeric_dtype(df[target]):
        if target not in heatmap_cols:
            heatmap_cols.append(target)

    # build heatmap 
    if len(heatmap_cols) > 1:
        plt.figure(figsize=(8, 6))
        corr = df[heatmap_cols].corr(numeric_only=True)
        sns.heatmap(corr, annot=True, cmap='coolwarm')
        plt.title("Correlation Heatmap (Top Features)")
        plt.show()
    else:
        print(" Not enough numeric features for heatmap")


    

    # outlier visualization (for only included )
    
    print("\n[4] Outlier Detection ")

    outliers = eda_report.get('outliers', {})

    for col, count in outliers.items():
        if col in top_numeric and count > 0:
            plt.figure()
            sns.boxplot(x=df[col])
            plt.title(f"Outliers in {col}")
            plt.show()