Spaces:
Sleeping
Sleeping
import numpy as np
import pandas as pd
def _to_native(value):
    """Return *value* as a built-in Python scalar if it is a NumPy scalar."""
    return value.item() if isinstance(value, np.generic) else value


def run_eda(df: pd.DataFrame) -> dict:
    """Build an exploratory-data-analysis report for *df*.

    Args:
        df: The DataFrame to profile. It is only read, never modified.

    Returns:
        A dict with these keys:

        * ``shape``, ``columns``, ``dtypes``, ``memory_mb``,
          ``duplicate_rows`` -- basic overview of the frame.
        * ``null_count``, ``null_percent`` -- per-column missing values.
        * ``unique`` -- per-column distinct-value count (NaN counted).
        * ``constant_columns`` -- columns with at most one distinct value.
        * ``high_cardinality`` -- object columns with more than 50
          distinct non-null values.
        * ``numeric_summary`` -- mean/median/std/min/max/skewness per
          numeric column, rounded to 4 places (all-null columns skipped).
        * ``outliers`` -- per numeric column, the number of values outside
          the 1.5 * IQR fences (columns with <= 10 distinct values or a
          zero IQR are skipped).
        * ``correlation_matrix``, ``strong_correlations`` -- pairwise
          correlations and the pairs with ``|r| >= 0.75``.
        * ``datetime_candidates`` -- object columns whose first values
          look date-like.
        * ``near_zero_variance`` -- numeric columns where a single value
          makes up more than 95% of the non-null entries.
        * ``recommendations`` -- human-readable suggestions derived from
          the findings above.
    """
    report = {}

    # --- overview ---------------------------------------------------------
    report["shape"] = df.shape
    report["columns"] = df.columns.tolist()
    report["dtypes"] = df.dtypes.astype(str).to_dict()
    report["memory_mb"] = round(
        float(df.memory_usage(deep=True).sum()) / (1024 ** 2), 2
    )
    report["duplicate_rows"] = int(df.duplicated().sum())

    # --- nulls (mask computed once and reused) ----------------------------
    null_mask = df.isnull()
    report["null_count"] = null_mask.sum().to_dict()
    report["null_percent"] = (null_mask.mean() * 100).round(2).to_dict()

    # --- unique values ----------------------------------------------------
    # Distinct counts are computed once here and reused by every later
    # section instead of calling nunique() again per column.
    unique_with_na = df.nunique(dropna=False)
    unique_non_null = df.nunique().to_dict()
    report["unique"] = unique_with_na.to_dict()

    # --- constant columns -------------------------------------------------
    report["constant_columns"] = [
        col for col, n_unique in unique_with_na.items() if n_unique <= 1
    ]

    # --- high-cardinality categoricals ------------------------------------
    cat_cols = df.select_dtypes(include="object").columns.tolist()
    report["high_cardinality"] = [
        col for col in cat_cols if unique_non_null[col] > 50
    ]

    # --- numeric summary --------------------------------------------------
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    numeric_summary = {}
    for col in num_cols:
        series = df[col].dropna()
        if series.empty:
            continue
        stats = {
            "mean": series.mean(),
            "median": series.median(),
            "std": series.std(),
            "min": series.min(),
            "max": series.max(),
            "skewness": series.skew(),
        }
        # Convert NumPy scalars to built-in numbers so the report stays
        # serialisable, matching the int()/float() casts used elsewhere.
        numeric_summary[col] = {
            name: _to_native(round(value, 4)) for name, value in stats.items()
        }
    report["numeric_summary"] = numeric_summary

    # --- outliers (1.5 * IQR rule) ----------------------------------------
    outliers = {}
    for col in num_cols:
        # Skip binary / low-cardinality columns.
        if unique_non_null[col] <= 10:
            continue
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            continue
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        outliers[col] = int(((values < lower) | (values > upper)).sum())
    report["outliers"] = outliers

    # --- correlation analysis ---------------------------------------------
    if len(num_cols) >= 2:
        corr_matrix = df[num_cols].corr().round(3)
        report["correlation_matrix"] = corr_matrix.to_dict()
        strong_pairs = []
        cols = corr_matrix.columns.tolist()
        # Visit each unordered pair once (upper triangle of the matrix).
        for i, c1 in enumerate(cols):
            for c2 in cols[i + 1:]:
                val = corr_matrix.loc[c1, c2]
                # NaN correlations compare False and are skipped.
                if abs(val) >= 0.75:
                    strong_pairs.append(
                        {
                            "feature_1": c1,
                            "feature_2": c2,
                            "correlation": float(val),
                        }
                    )
        report["strong_correlations"] = strong_pairs
    else:
        report["correlation_matrix"] = {}
        report["strong_correlations"] = []

    # --- datetime candidates ----------------------------------------------
    # NOTE: this is a plain substring heuristic, so it over-reports: any
    # text containing "-", "/", ":" or a month abbreviation (e.g. "market"
    # contains "mar") marks the column as a candidate.
    keywords = (
        "-", "/", ":",
        "jan", "feb", "mar", "apr", "may",
        "jun", "jul", "aug", "sep",
        "oct", "nov", "dec",
    )
    date_candidates = []
    for col in cat_cols:
        # Only the first 20 non-null values are inspected.
        sample = df[col].dropna().head(20).astype(str)
        lowered = [text.lower() for text in sample]
        if any(k in text for text in lowered for k in keywords):
            date_candidates.append(col)
    report["datetime_candidates"] = date_candidates

    # --- near-zero variance -----------------------------------------------
    nzv = []
    for col in num_cols:
        if unique_non_null[col] <= 2:
            continue
        top_freq = df[col].value_counts(normalize=True).iloc[0]
        if top_freq > 0.95:
            nzv.append(col)
    report["near_zero_variance"] = nzv

    # --- recommendations --------------------------------------------------
    recommendations = []
    if report["duplicate_rows"] > 0:
        recommendations.append("Remove duplicate rows.")
    if report["constant_columns"]:
        recommendations.append("Drop constant columns.")
    if report["high_cardinality"]:
        recommendations.append("Encode/group high-cardinality columns.")
    # The outliers dict also holds columns whose count is 0, so check the
    # counts themselves rather than whether the dict is non-empty.
    if any(count > 0 for count in outliers.values()):
        recommendations.append("Consider handling outliers.")
    skewed_cols = [
        col for col, vals in numeric_summary.items()
        if abs(vals["skewness"]) > 1
    ]
    if skewed_cols:
        recommendations.append(
            "Some columns are highly skewed. Consider transformation."
        )
    if report["strong_correlations"]:
        recommendations.append("Possible multicollinearity detected.")
    report["recommendations"] = recommendations

    return report