Shrouk04's picture
Upload 36 files
f73646a verified
Raw
History Blame Contribute Delete
4.69 kB
import pandas as pd
import numpy as np
def run_eda(df):
report = {}
# view
report["shape"] = df.shape
report["columns"] = df.columns.tolist()
report["dtypes"] = df.dtypes.astype(str).to_dict()
report["memory_mb"] = round(
df.memory_usage(deep=True).sum() / (1024 ** 2), 2
)
report["duplicate_rows"] = int(df.duplicated().sum())
# nulls
report["null_count"] = df.isnull().sum().to_dict()
report["null_percent"] = (
(df.isnull().mean() * 100).round(2).to_dict()
)
# unique values
report["unique"] = df.nunique(dropna=False).to_dict()
# constant column
report["constant_columns"] = [
col for col in df.columns
if df[col].nunique(dropna=False) <= 1
]
# high cardinality -> cate
cat_cols = df.select_dtypes(include="object").columns.tolist()
report["high_cardinality"] = [
col for col in cat_cols
if df[col].nunique() > 50
]
# numeric columns summary
num_cols = df.select_dtypes(include=np.number).columns.tolist()
numeric_summary = {}
for col in num_cols:
series = df[col].dropna()
if len(series) == 0:
continue
numeric_summary[col] = {
"mean": round(series.mean(), 4),
"median": round(series.median(), 4),
"std": round(series.std(), 4),
"min": round(series.min(), 4),
"max": round(series.max(), 4),
"skewness": round(series.skew(), 4)
}
report["numeric_summary"] = numeric_summary
# outlier
outliers = {}
for col in num_cols:
# skip binary / low unique
if df[col].nunique() <= 10:
continue
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
if IQR == 0:
continue
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
count = int(((df[col] < lower) | (df[col] > upper)).sum())
outliers[col] = count
report["outliers"] = outliers
# correlation analysis
if len(num_cols) >= 2:
corr_matrix = df[num_cols].corr().round(3)
report["correlation_matrix"] = corr_matrix.to_dict()
strong_pairs = []
cols = corr_matrix.columns.tolist()
for i in range(len(cols)):
for j in range(i + 1, len(cols)):
c1 = cols[i]
c2 = cols[j]
val = corr_matrix.loc[c1, c2]
if abs(val) >= 0.75:
strong_pairs.append(
{
"feature_1": c1,
"feature_2": c2,
"correlation": float(val)
}
)
report["strong_correlations"] = strong_pairs
else:
report["correlation_matrix"] = {}
report["strong_correlations"] = []
# datetime
date_candidates = []
for col in df.select_dtypes(include="object").columns:
sample = df[col].dropna().astype(str).head(20)
keywords = [
"-", "/", ":",
"jan", "feb", "mar", "apr", "may",
"jun", "jul", "aug", "sep",
"oct", "nov", "dec"
]
found = any(
any(k in val.lower() for k in keywords)
for val in sample
)
if found:
date_candidates.append(col)
report["datetime_candidates"] = date_candidates
# near zero variance
nzv = []
for col in num_cols:
if df[col].nunique() <= 2:
continue
top_freq = df[col].value_counts(normalize=True).iloc[0]
if top_freq > 0.95:
nzv.append(col)
report["near_zero_variance"] = nzv
# recommenation
recommendations = []
if report["duplicate_rows"] > 0:
recommendations.append("Remove duplicate rows.")
if len(report["constant_columns"]) > 0:
recommendations.append("Drop constant columns.")
if len(report["high_cardinality"]) > 0:
recommendations.append("Encode/group high-cardinality columns.")
if len(report["outliers"]) > 0:
recommendations.append("Consider handling outliers.")
skewed_cols = [
col for col, vals in numeric_summary.items()
if abs(vals["skewness"]) > 1
]
if len(skewed_cols) > 0:
recommendations.append("Some columns are highly skewed. Consider transformation.")
if len(report["strong_correlations"]) > 0:
recommendations.append("Possible multicollinearity detected.")
report["recommendations"] = recommendations
return report