# Shrouk04's picture
# Upload 36 files
# f73646a verified
# Raw
# History Blame Contribute Delete
# 6.39 kB
import pandas as pd
import numpy as np
#from decision import decide_pipeline
def _print_dataset_overview(df):
    """Print shape, missing-value, duplicate and memory figures for *df*.

    Returns:
        The number of duplicated rows as a plain ``int`` so the caller can
        reuse it for the recommendations and the final summary.
    """
    print("\n DATASET OVERVIEW")
    rows, cols = df.shape
    print(f"- Rows: {rows}")
    print(f"- Columns: {cols}")
    total_nulls = int(df.isnull().sum().sum())
    duplicate_rows = int(df.duplicated().sum())
    print(f"- Missing values: {total_nulls}")
    print(f"- Duplicate rows: {duplicate_rows}")
    mem = df.memory_usage(deep=True).sum() / (1024 ** 2)
    print(f"- Memory usage: {mem:.2f} MB")
    return duplicate_rows


def _detect_target_type(y):
    """Return ``"numerical"`` or ``"categorical"`` for the target series *y*.

    A numeric target with more than 10 distinct values is numerical; every
    other target (non-numeric, or numeric with few distinct values) is
    treated as a set of class labels.
    """
    if pd.api.types.is_numeric_dtype(y) and y.nunique() > 10:
        return "numerical"
    return "categorical"


def _print_target_analysis(y, target, target_type):
    """Print the target section: class balance or regression statistics."""
    print("\n TARGET ANALYSIS")
    print(f"- Target column: {target}")

    if target_type != "categorical":
        desc = y.describe()
        print("- Problem Type: Regression")
        print(f"- Mean: {desc['mean']:.3f}")
        print(f"- Std: {desc['std']:.3f}")
        print(f"- Min: {desc['min']:.3f}")
        print(f"- Max: {desc['max']:.3f}")
        return

    counts = y.value_counts()
    # A categorical dtype reports unused categories with a count of 0; keep
    # only observed classes so the max/min ratio never divides by zero.
    counts = counts[counts > 0]
    print("- Problem Type: Classification")
    print(f"- Number of classes: {len(counts)}")

    if counts.empty:
        # All target values are missing: a max/min ratio would be NaN and
        # would otherwise fall through to the "Strong imbalance" branch.
        print("- Class distribution: Undefined (target has no non-missing values)")
        return

    ratio = counts.max() / counts.min()
    # balance detection
    if ratio < 1.5:
        print("- Class distribution: Balanced")
    elif ratio < 3:
        print("- Class distribution: Mild imbalance")
    else:
        print("- Class distribution: Strong imbalance")

    total_rows = len(y)
    for cls, val in counts.items():
        pct = (val / total_rows) * 100
        print(f" • {cls}: {val} ({pct:.1f}%)")


def _resolve_base_column(encoded_name, original_columns):
    """Map an encoded feature name back to the column it most likely came from.

    Resolution order: an exact match against *original_columns*; otherwise
    the longest original column name that is followed by ``"_"`` at the start
    of *encoded_name*; otherwise the text before the first underscore.
    """
    name = str(encoded_name)
    if name in original_columns:
        return name
    prefixes = [col for col in original_columns if name.startswith(f"{col}_")]
    if prefixes:
        return max(prefixes, key=len)
    return name.split("_")[0]


def _print_key_relationships(df, target, feature_scores):
    """Print the strongest feature/target relationships found in *feature_scores*.

    Reads the optional keys ``numerical_correlation``, ``numerical_anova``,
    ``categorical_anova`` and ``categorical_chi2``; each value must support
    ``.head(n).items()``.
    """
    print("\n KEY RELATIONSHIPS")
    found = False
    leakage_suspects = ("duration", "outcome", "result")

    # numerical correlation
    if "numerical_correlation" in feature_scores:
        corr = feature_scores["numerical_correlation"]
        for col, val in corr.head(5).items():
            if pd.isna(val):
                continue
            found = True
            # Strength depends on magnitude: a correlation of -0.9 is as
            # strong as +0.9. The signed value is still what gets printed.
            magnitude = abs(val)
            if magnitude > 0.7:
                strength = "strong"
            elif magnitude > 0.4:
                strength = "moderate"
            else:
                strength = "weak"
            print(f"- {col} has {strength} relationship with {target} (score={val:.2f})")
            # str() so non-string column labels do not raise AttributeError.
            if str(col).lower() in leakage_suspects:
                print(f" '{col}' may cause data leakage")

    # numerical anova
    if "numerical_anova" in feature_scores:
        anova = feature_scores["numerical_anova"]
        for col, val in anova.head(5).items():
            if pd.isna(val) or np.isinf(val):
                continue
            found = True
            print(f"- {col} strongly separates target classes (ANOVA={val:.2f})")

    # categorical scores
    original_columns = [str(col) for col in df.columns]
    # Shared across both score tables so a column is reported only once.
    seen = set()
    for key in ("categorical_anova", "categorical_chi2"):
        if key not in feature_scores:
            continue
        cat_scores = feature_scores[key]
        for col, _ in cat_scores.head(10).items():
            base_col = _resolve_base_column(col, original_columns)
            if base_col in seen:
                continue
            seen.add(base_col)
            found = True
            print(f"- {base_col} has significant impact on {target}")

    if not found:
        print("- No strong relationships detected")


def _print_data_quality(df, target):
    """Print missing-value, outlier and high-cardinality findings for *df*.

    Returns:
        A tuple ``(issues, outlier_cols, high_card)``: whether any issue was
        reported, the columns with IQR outliers, and the categorical columns
        with more than 50 distinct values.
    """
    print("\n data quality")
    issues = False

    # missing columns
    null_pct = df.isnull().mean() * 100
    missing_cols = [col for col, val in null_pct.items() if val > 0]
    if missing_cols:
        issues = True
        print(f"- Columns with missing values: {missing_cols}")

    # outliers (1.5 * IQR rule); "number" also covers int32/float32 columns
    numeric_cols = df.select_dtypes(include="number").columns.tolist()
    outlier_cols = []
    for col in numeric_cols:
        if col == target:
            continue
        series = df[col]
        # Low-cardinality numerics are treated as codes, not measurements.
        if series.nunique() <= 10:
            continue
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            continue
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        if ((series < lower) | (series > upper)).any():
            outlier_cols.append(col)
    if outlier_cols:
        issues = True
        print(f"- Columns containing outliers: {outlier_cols}")

    # high cardinality
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    high_card = [col for col in cat_cols if df[col].nunique() > 50]
    if high_card:
        issues = True
        print(f"- High-cardinality categorical columns: {high_card}")

    if not issues:
        print("- No major data quality issues detected")
    return issues, outlier_cols, high_card


def _print_recommendations(target_type, high_card, outlier_cols, duplicate_rows):
    """Print modelling and preprocessing suggestions based on earlier findings."""
    print("\n RECOMMENDATIONS")
    if target_type == "categorical":
        print("- Suggested models: Logistic Regression, Random Forest, XGBoost")
        print("- Use Stratified Train/Test Split")
    else:
        print("- Suggested models: Linear Regression, Random Forest Regressor, XGBoost Regressor")
        print("- Consider target scaling if highly skewed")
    if high_card:
        print("- Apply encoding to high-cardinality columns")
    if outlier_cols:
        print("- Consider RobustScaler or outlier capping")
    if duplicate_rows > 0:
        print("- Remove duplicate rows before training")


def generate_full_report(df, target, feature_scores, eda_report, decisions, prep_report=None):
    """Print a plain-text analysis report for *df* to standard output.

    The report has six sections: dataset overview, target analysis, key
    relationships, data quality, recommendations and a final summary.

    Args:
        df: The pandas DataFrame to describe.
        target: Label of the target column in *df*.
        feature_scores: Mapping that may contain ``numerical_correlation``,
            ``numerical_anova``, ``categorical_anova`` and
            ``categorical_chi2``. ``None`` is treated as an empty mapping.
        eda_report: Not read by this function; kept for call compatibility.
        decisions: Not read by this function; kept for call compatibility.
        prep_report: Not read by this function; kept for call compatibility.

    Returns:
        None. All output is printed.

    Raises:
        KeyError: If *target* is not a column of *df*.
    """
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in DataFrame")
    scores = {} if feature_scores is None else feature_scores

    print(" DATA ANALYSIS REPORT :--")
    duplicate_rows = _print_dataset_overview(df)

    y = df[target]
    target_type = _detect_target_type(y)
    _print_target_analysis(y, target, target_type)

    _print_key_relationships(df, target, scores)
    issues, outlier_cols, high_card = _print_data_quality(df, target)
    _print_recommendations(target_type, high_card, outlier_cols, duplicate_rows)

    # final summary
    print("\n FINAL SUMMARY")
    if not issues and duplicate_rows == 0:
        print("- Dataset is clean and ready for modeling")
    else:
        print("- Dataset needs preprocessing before final modeling")
    print("- Feature selection identified the most useful predictors")
    print("- Visualization results can support deeper interpretation")