import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
#import utils.decision


def _top_index(feature_scores, keys, n):
    """Return the first ``n`` index labels of the first of ``keys`` found in ``feature_scores``."""
    for key in keys:
        if key in feature_scores:
            return feature_scores[key].head(n).index.tolist()
    return []


def _resolve_source_columns(encoded_names, columns):
    """Map feature names (possibly ``<column>_<level>`` encoded) back to DataFrame columns."""
    known = set(columns)
    resolved = []
    for name in encoded_names:
        if name in known:
            source = name
        else:
            text = str(name)
            # The longest matching prefix wins so that a column such as
            # "marital_status" is not truncated to "marital".
            prefixes = [
                col for col in columns
                if isinstance(col, str) and text.startswith(f"{col}_")
            ]
            # Fall back to the text before the first underscore when no
            # DataFrame column matches.
            source = max(prefixes, key=len) if prefixes else text.split('_')[0]
        # Order-preserving de-duplication keeps the plot order stable.
        if source not in resolved:
            resolved.append(source)
    return resolved


def _is_numeric_series(series):
    """Return True for numeric, non-boolean data (booleans are plotted as categories)."""
    return (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )


def _plot_target_distribution(df, target, target_is_numeric):
    """Plot the target as a histogram (numeric) or as pie/bar charts (categorical)."""
    if target_is_numeric:
        plt.figure()
        df[target].hist()
        plt.title(f"{target} Distribution (Histogram)")
        plt.xlabel(target)
        plt.ylabel("Frequency")
        plt.show()
        return

    counts = df[target].value_counts()

    # A pie chart is only drawn for a small number of classes.
    if len(counts) <= 5:
        plt.figure()
        counts.plot(kind='pie', autopct='%1.1f%%')
        plt.title(f"{target} Distribution (Pie Chart)")
        plt.ylabel('')
        plt.show()

    plt.figure()
    counts.plot(kind='bar')
    plt.title(f"{target} Distribution (Bar Chart)")
    plt.xlabel(target)
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()


def _plot_single_variables(df, numeric_cols, categorical_cols):
    """Plot a histogram per numeric column and a top-10 bar chart per categorical column."""
    for col in numeric_cols:
        plt.figure()
        df[col].hist()
        plt.title(f"Histogram of {col}")
        plt.xlabel(col)
        plt.ylabel("Frequency")
        plt.show()

    for col in categorical_cols:
        plt.figure()
        df[col].value_counts().head(10).plot(kind='bar')
        plt.title(f"Top Categories of {col}")
        plt.xticks(rotation=45)
        plt.show()


def _plot_target_relationships(df, target, numeric_cols, categorical_cols,
                               target_is_numeric):
    """Plot every selected feature against the target, choosing the chart by target type."""
    if target_is_numeric:
        # numeric feature vs numeric target --> scatter
        for col in numeric_cols:
            plt.figure()
            plt.scatter(df[col], df[target])
            plt.title(f"{col} vs {target}")
            plt.xlabel(col)
            plt.ylabel(target)
            plt.show()

        # categorical feature vs numeric target --> boxplot
        for col in categorical_cols:
            plt.figure()
            sns.boxplot(x=df[col], y=df[target])
            plt.title(f"{col} vs {target}")
            plt.xticks(rotation=45)
            plt.show()
        return

    # numeric feature vs categorical target --> boxplot
    for col in numeric_cols:
        plt.figure()
        sns.boxplot(x=df[target], y=df[col])
        plt.title(f"{col} vs {target}")
        plt.xticks(rotation=45)
        plt.show()

    # categorical feature vs categorical target --> stacked bar
    for col in categorical_cols:
        # DataFrame.plot opens its own figure unless given axes, so pass them
        # explicitly instead of leaving an empty figure behind.
        _, ax = plt.subplots()
        pd.crosstab(df[col], df[target]).plot(kind='bar', stacked=True, ax=ax)
        plt.title(f"{col} vs {target}")
        plt.xticks(rotation=45)
        plt.show()


def _plot_correlation_heatmap(df, target, top_numeric):
    """Draw a correlation heatmap of the numeric top features plus a numeric target."""
    heatmap_cols = [
        col for col in top_numeric
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
    ]

    # The target joins the heatmap only when it is numeric.
    if pd.api.types.is_numeric_dtype(df[target]) and target not in heatmap_cols:
        heatmap_cols.append(target)

    if len(heatmap_cols) > 1:
        plt.figure(figsize=(8, 6))
        corr = df[heatmap_cols].corr(numeric_only=True)
        sns.heatmap(corr, annot=True, cmap='coolwarm')
        plt.title("Correlation Heatmap (Top Features)")
        plt.show()
    else:
        print(" Not enough numeric features for heatmap")


def _plot_outliers(df, eda_report, top_numeric):
    """Draw a boxplot for each top numeric column reported with a positive outlier count."""
    outliers = eda_report.get('outliers', {})
    for col, count in outliers.items():
        if col in top_numeric and col in df.columns and count > 0:
            plt.figure()
            sns.boxplot(x=df[col])
            plt.title(f"Outliers in {col}")
            plt.show()


def visualize(df, target, feature_scores, eda_report, decisions):
    """Print a report header and show plots for the highest-ranked features.

    Sections, in order: target distribution, per-feature distributions,
    feature-vs-target relationships, correlation heatmap, outlier boxplots.
    Every figure is displayed with ``plt.show()``; nothing is returned.

    Args:
        df: DataFrame holding the target and the feature columns.
        target: Label of the target column in ``df``.
        feature_scores: Mapping that may contain 'numerical_correlation' or
            'numerical_anova' (top 3 index labels are used) and
            'categorical_anova' or 'categorical_chi2' (top 5 index labels are
            used). The first key of each pair takes precedence. Values must
            support ``.head(n).index.tolist()``.
        eda_report: Mapping whose optional 'outliers' entry maps column
            labels to outlier counts.
        decisions: Not used by this function; kept for interface
            compatibility.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    print("\n--- SMART VISUALIZATION ENGINE ---")

    # select top features
    top_numeric = _top_index(
        feature_scores, ('numerical_correlation', 'numerical_anova'), 3
    )
    encoded_categorical = _top_index(
        feature_scores, ('categorical_anova', 'categorical_chi2'), 5
    )
    top_categorical = _resolve_source_columns(encoded_categorical, df.columns)

    print(f"Top numerical features: {top_numeric}")
    print(f"Top categorical features: {top_categorical}")

    if not top_numeric and not top_categorical:
        print(" No important features detected. Skipping visualization.")
        return

    # Only columns that actually exist in the DataFrame can be plotted.
    numeric_cols = [col for col in top_numeric if col in df.columns]
    categorical_cols = [col for col in top_categorical if col in df.columns]

    # Decide the target type once so every section treats it the same way.
    target_is_numeric = _is_numeric_series(df[target])

    print("\n[0] Target Distribution")
    _plot_target_distribution(df, target, target_is_numeric)

    print("\n[1] Single Variable Analysis")
    _plot_single_variables(df, numeric_cols, categorical_cols)

    print("\n[2] Relationship with Target")
    _plot_target_relationships(
        df, target, numeric_cols, categorical_cols, target_is_numeric
    )

    print("\n[3] Correlation Heatmap")
    _plot_correlation_heatmap(df, target, top_numeric)

    # outlier visualization (only for the selected numeric features)
    print("\n[4] Outlier Detection ")
    _plot_outliers(df, eda_report, top_numeric)