import warnings
from itertools import combinations

import numpy as np
import pandas as pd

# Thresholds used by the individual checks in run_eda.
_HIGH_CARDINALITY_MIN_UNIQUE = 50  # text columns with more distinct values are flagged
_LOW_UNIQUE_MAX = 10  # numeric columns at or below this are skipped by the outlier check
_IQR_FENCE_FACTOR = 1.5  # fence distance from the quartiles, in IQR units
_STRONG_CORRELATION_MIN = 0.75  # absolute correlation at or above this is "strong"
_DOMINANT_VALUE_SHARE = 0.95  # most frequent value share above this is near-zero variance
_HIGH_SKEW_MIN = 1  # absolute skewness above this is "highly skewed"
_DATE_SAMPLE_SIZE = 20  # non-null values inspected per text column
_DATE_MIN_PARSED_SHARE = 0.5  # share of the sample that must parse as a datetime

# Substrings that make a text value worth trying to parse as a datetime.
_DATE_HINTS = (
    "-", "/", ":",
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
)


def _native(value):
    """Return *value* as a built-in Python scalar when it is a NumPy scalar."""
    return value.item() if isinstance(value, np.generic) else value


def _summarize_numeric(df, num_cols):
    """Return rounded descriptive statistics for each numeric column with data."""
    summary = {}
    for col in num_cols:
        series = df[col].dropna()
        if series.empty:
            # Nothing to describe for an all-null column.
            continue
        stats = {
            "mean": series.mean(),
            "median": series.median(),
            "std": series.std(),
            "min": series.min(),
            "max": series.max(),
            "skewness": series.skew(),
        }
        summary[col] = {
            name: _native(round(value, 4)) for name, value in stats.items()
        }
    return summary


def _count_iqr_outliers(df, num_cols):
    """Count values outside the 1.5 * IQR fences for each eligible numeric column."""
    counts = {}
    for col in num_cols:
        series = df[col]
        # Skip binary / low-cardinality columns.
        if series.nunique() <= _LOW_UNIQUE_MAX:
            continue
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            # Zero-width fences would mark every non-quartile value as an outlier.
            continue
        lower = q1 - _IQR_FENCE_FACTOR * iqr
        upper = q3 + _IQR_FENCE_FACTOR * iqr
        counts[col] = int(((series < lower) | (series > upper)).sum())
    return counts


def _correlations(df, num_cols):
    """Return the rounded correlation matrix and the strongly correlated pairs."""
    if len(num_cols) < 2:
        return {}, []
    corr_matrix = df[num_cols].corr().round(3)
    strong_pairs = []
    for first, second in combinations(corr_matrix.columns.tolist(), 2):
        # The threshold is applied to the rounded coefficient.
        value = corr_matrix.loc[first, second]
        if abs(value) >= _STRONG_CORRELATION_MIN:
            strong_pairs.append(
                {
                    "feature_1": first,
                    "feature_2": second,
                    "correlation": float(value),
                }
            )
    return corr_matrix.to_dict(), strong_pairs


def _parses_as_datetime(text):
    """Return True when pandas can parse *text* as a datetime."""
    try:
        with warnings.catch_warnings():
            # This is only a probe; parser warnings would be noise for callers.
            warnings.simplefilter("ignore")
            parsed = pd.to_datetime(text, errors="coerce")
    except (ValueError, TypeError, OverflowError):
        return False
    return not pd.isna(parsed)


def _find_datetime_candidates(df, text_cols):
    """Return text columns whose sampled values mostly parse as datetimes."""
    candidates = []
    for col in text_cols:
        sample = df[col].dropna().astype(str).head(_DATE_SAMPLE_SIZE)
        if sample.empty:
            continue
        # Cheap pre-filter: at least one value must contain a date-like token.
        has_hint = any(
            hint in value.lower() for value in sample for hint in _DATE_HINTS
        )
        if not has_hint:
            continue
        # A token alone is weak evidence ("market" contains "mar"), so the
        # values must also parse.
        parsed = sum(_parses_as_datetime(value) for value in sample)
        if parsed / len(sample) >= _DATE_MIN_PARSED_SHARE:
            candidates.append(col)
    return candidates


def _find_near_zero_variance(df, num_cols):
    """Return numeric columns dominated by a single value."""
    flagged = []
    for col in num_cols:
        series = df[col]
        if series.nunique() <= 2:
            continue
        # value_counts sorts by frequency, so the first entry is the top share.
        top_share = series.value_counts(normalize=True).iloc[0]
        if top_share > _DOMINANT_VALUE_SHARE:
            flagged.append(col)
    return flagged


def _build_recommendations(report, numeric_summary):
    """Turn the findings in *report* into a list of human-readable suggestions."""
    recommendations = []
    if report["duplicate_rows"] > 0:
        recommendations.append("Remove duplicate rows.")
    if report["constant_columns"]:
        recommendations.append("Drop constant columns.")
    if report["high_cardinality"]:
        recommendations.append("Encode/group high-cardinality columns.")
    # The outlier dict also holds columns with a count of 0, so check the
    # counts rather than whether the dict is empty.
    if any(count > 0 for count in report["outliers"].values()):
        recommendations.append("Consider handling outliers.")
    has_skew = any(
        pd.notna(stats["skewness"]) and abs(stats["skewness"]) > _HIGH_SKEW_MIN
        for stats in numeric_summary.values()
    )
    if has_skew:
        recommendations.append(
            "Some columns are highly skewed. Consider transformation."
        )
    if report["strong_correlations"]:
        recommendations.append("Possible multicollinearity detected.")
    return recommendations


def run_eda(df: pd.DataFrame) -> dict:
    """Build an exploratory-data-analysis report for *df*.

    Args:
        df: The DataFrame to analyse. It is not modified.

    Returns:
        A dict with these keys:

        - ``shape``, ``columns``, ``dtypes``, ``memory_mb``, ``duplicate_rows``:
          basic overview of the frame.
        - ``null_count``, ``null_percent``: missing values per column.
        - ``unique``: distinct values per column, counting NaN as a value.
        - ``constant_columns``: columns with at most one distinct value.
        - ``high_cardinality``: object columns with more than 50 distinct values.
        - ``numeric_summary``: mean, median, std, min, max and skewness per
          numeric column, rounded to 4 decimals.
        - ``outliers``: count of values outside the 1.5 * IQR fences per
          numeric column; columns with 10 or fewer distinct values or a zero
          IQR are omitted.
        - ``correlation_matrix``, ``strong_correlations``: pairwise
          correlations rounded to 3 decimals, and pairs with an absolute
          value of at least 0.75. Both are empty with fewer than two numeric
          columns.
        - ``datetime_candidates``: object columns whose sampled values
          contain date-like tokens and mostly parse as datetimes.
        - ``near_zero_variance``: numeric columns with more than two distinct
          values where one value covers over 95% of the non-null rows.
        - ``recommendations``: suggested follow-up actions.
    """
    report = {}

    # Overview.
    report["shape"] = df.shape
    report["columns"] = df.columns.tolist()
    report["dtypes"] = df.dtypes.astype(str).to_dict()
    report["memory_mb"] = round(
        df.memory_usage(deep=True).sum() / (1024 ** 2), 2
    )
    report["duplicate_rows"] = int(df.duplicated().sum())

    # Missing values.
    null_mask = df.isnull()
    report["null_count"] = null_mask.sum().to_dict()
    report["null_percent"] = (null_mask.mean() * 100).round(2).to_dict()

    # Distinct values, and the columns that never vary.
    unique_counts = df.nunique(dropna=False)
    report["unique"] = unique_counts.to_dict()
    report["constant_columns"] = [
        col for col, distinct in unique_counts.items() if distinct <= 1
    ]

    # High-cardinality text columns.
    # NOTE(review): only object-dtype columns are treated as text here;
    # categorical or dedicated string dtypes are not examined. Confirm that
    # this is intended.
    cat_cols = df.select_dtypes(include="object").columns.tolist()
    report["high_cardinality"] = [
        col
        for col in cat_cols
        if df[col].nunique() > _HIGH_CARDINALITY_MIN_UNIQUE
    ]

    # Numeric checks.
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    numeric_summary = _summarize_numeric(df, num_cols)
    report["numeric_summary"] = numeric_summary
    report["outliers"] = _count_iqr_outliers(df, num_cols)

    correlation_matrix, strong_pairs = _correlations(df, num_cols)
    report["correlation_matrix"] = correlation_matrix
    report["strong_correlations"] = strong_pairs

    report["datetime_candidates"] = _find_datetime_candidates(df, cat_cols)
    report["near_zero_variance"] = _find_near_zero_variance(df, num_cols)

    report["recommendations"] = _build_recommendations(report, numeric_summary)
    return report