import pandas as pd
import numpy as np
#from decision import decide_pipeline

# Feature names that frequently encode the outcome itself; flagged as a
# possible source of target leakage when they show up among the top scores.
_LEAKAGE_PRONE_NAMES = frozenset({"duration", "outcome", "result"})


def _print_overview(df):
    """Print shape, null, duplicate and memory figures; return the duplicate-row count."""
    print("\n DATASET OVERVIEW")
    rows, cols = df.shape
    print(f"- Rows: {rows}")
    print(f"- Columns: {cols}")
    total_nulls = df.isnull().sum().sum()
    duplicate_rows = df.duplicated().sum()
    print(f"- Missing values: {int(total_nulls)}")
    print(f"- Duplicate rows: {int(duplicate_rows)}")
    mem = df.memory_usage(deep=True).sum() / (1024 ** 2)
    print(f"- Memory usage: {mem:.2f} MB")
    return duplicate_rows


def _infer_target_type(y):
    """Return "numerical" for a numeric target with more than 10 distinct values, else "categorical"."""
    if (
        y.dtype != "object"
        and pd.api.types.is_numeric_dtype(y)
        and y.nunique() > 10
    ):
        return "numerical"
    return "categorical"


def _print_target_analysis(y, target, target_type):
    """Print class balance (classification) or summary statistics (regression) for the target."""
    print("\n TARGET ANALYSIS")
    print(f"- Target column: {target}")

    if target_type != "categorical":
        desc = y.describe()
        print("- Problem Type: Regression")
        print(f"- Mean: {desc['mean']:.3f}")
        print(f"- Std: {desc['std']:.3f}")
        print(f"- Min: {desc['min']:.3f}")
        print(f"- Max: {desc['max']:.3f}")
        return

    counts = y.value_counts()
    print("- Problem Type: Classification")
    print(f"- Number of classes: {len(counts)}")

    if counts.empty:
        # value_counts() drops nulls, so an all-null target has no classes;
        # a max/min ratio would be NaN and be misreported as an imbalance.
        print("- Class distribution: Undetermined (target has no non-null values)")
        return

    smallest = counts.min()
    # A zero count can occur for unused levels of a categorical dtype.
    ratio = counts.max() / smallest if smallest > 0 else float("inf")
    if ratio < 1.5:
        print("- Class distribution: Balanced")
    elif ratio < 3:
        print("- Class distribution: Mild imbalance")
    else:
        print("- Class distribution: Strong imbalance")

    for cls, val in counts.items():
        pct = (val / len(y)) * 100
        print(f" • {cls}: {val} ({pct:.1f}%)")


def _base_column(encoded_name, categorical_names):
    """Map an encoded feature name (e.g. one-hot ``col_level``) back to its source column.

    The longest categorical column name that equals ``encoded_name`` or is
    followed by ``_`` in it wins, so ``marital_status_single`` resolves to
    ``marital_status`` rather than ``marital``. When nothing matches, the text
    before the first underscore is used.
    """
    name = str(encoded_name)
    matches = [
        cand
        for cand in categorical_names
        if name == cand or name.startswith(cand + "_")
    ]
    if matches:
        return max(matches, key=len)
    return name.split("_")[0]


def _print_relationships(df, target, feature_scores):
    """Print the top-scoring features from each score table present in ``feature_scores``."""
    print("\n KEY RELATIONSHIPS")
    found = False

    # numerical correlation
    if "numerical_correlation" in feature_scores:
        corr = feature_scores["numerical_correlation"]
        for col, val in corr.head(5).items():
            if pd.isna(val):
                continue
            found = True
            # Strength depends on magnitude: a correlation of -0.9 is as
            # strong as +0.9, so the sign must not affect the label.
            magnitude = abs(val)
            if magnitude > 0.7:
                strength = "strong"
            elif magnitude > 0.4:
                strength = "moderate"
            else:
                strength = "weak"
            print(f"- {col} has {strength} relationship with {target} (score={val:.2f})")
            if str(col).lower() in _LEAKAGE_PRONE_NAMES:
                print(f" '{col}' may cause data leakage")

    # numerical anova
    if "numerical_anova" in feature_scores:
        anova = feature_scores["numerical_anova"]
        for col, val in anova.head(5).items():
            if pd.isna(val) or np.isinf(val):
                continue
            found = True
            print(f"- {col} strongly separates target classes (ANOVA={val:.2f})")

    # categorical scores
    categorical_names = [
        str(col)
        for col in df.select_dtypes(include=["object", "category"]).columns
    ]
    for key in ["categorical_anova", "categorical_chi2"]:
        if key not in feature_scores:
            continue
        seen = set()  # report each source column once per score table
        for col, _score in feature_scores[key].head(10).items():
            base_col = _base_column(col, categorical_names)
            if base_col in seen:
                continue
            seen.add(base_col)
            found = True
            print(f"- {base_col} has significant impact on {target}")

    if not found:
        print("- No strong relationships detected")


def _is_real_numeric(dtype):
    """Return True for integer/float dtypes of any width, excluding bool and complex."""
    types = pd.api.types
    return (
        types.is_numeric_dtype(dtype)
        and not types.is_bool_dtype(dtype)
        and not types.is_complex_dtype(dtype)
    )


def _print_quality(df, target):
    """Print data-quality findings; return ``(issues, outlier_cols, high_card)``."""
    print("\n data quality")
    issues = False

    # missing columns
    null_pct = df.isnull().mean() * 100
    missing_cols = [col for col, val in null_pct.items() if val > 0]
    if missing_cols:
        issues = True
        print(f"- Columns with missing values: {missing_cols}")

    # outliers (1.5 * IQR rule) on continuous-looking numeric features
    numeric_cols = [
        col for col, dtype in df.dtypes.items() if _is_real_numeric(dtype)
    ]
    outlier_cols = []
    for col in numeric_cols:
        if col == target:
            continue
        if df[col].nunique() <= 10:
            # Low-cardinality numerics behave like categories; skip them.
            continue
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            continue
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        count = ((df[col] < lower) | (df[col] > upper)).sum()
        if count > 0:
            outlier_cols.append(col)
    if outlier_cols:
        issues = True
        print(f"- Columns containing outliers: {outlier_cols}")

    # high cardinality
    cat_cols = df.select_dtypes(include="object").columns.tolist()
    high_card = [col for col in cat_cols if df[col].nunique() > 50]
    if high_card:
        issues = True
        print(f"- High-cardinality categorical columns: {high_card}")

    if not issues:
        print("- No major data quality issues detected")

    return issues, outlier_cols, high_card


def _print_recommendations(target_type, high_card, outlier_cols, duplicate_rows):
    """Print modelling and preprocessing suggestions derived from the earlier findings."""
    print("\n RECOMMENDATIONS")
    if target_type == "categorical":
        print("- Suggested models: Logistic Regression, Random Forest, XGBoost")
        print("- Use Stratified Train/Test Split")
    else:
        print("- Suggested models: Linear Regression, Random Forest Regressor, XGBoost Regressor")
        print("- Consider target scaling if highly skewed")

    if high_card:
        print("- Apply encoding to high-cardinality columns")
    if outlier_cols:
        print("- Consider RobustScaler or outlier capping")
    if duplicate_rows > 0:
        print("- Remove duplicate rows before training")


def _print_summary(issues, duplicate_rows):
    """Print the closing verdict on whether the dataset is ready for modelling."""
    print("\n FINAL SUMMARY")
    if not issues and duplicate_rows == 0:
        print("- Dataset is clean and ready for modeling")
    else:
        print("- Dataset needs preprocessing before final modeling")
    print("- Feature selection identified the most useful predictors")
    print("- Visualization results can support deeper interpretation")


def generate_full_report(df, target, feature_scores, eda_report, decisions, prep_report=None):
    """Print a plain-text analysis report for ``df`` to standard output.

    The report has six sections: dataset overview, target analysis, key
    relationships, data quality, recommendations and a final summary.

    Args:
        df: DataFrame holding the features and the target column.
        target: Label of the target column in ``df``.
        feature_scores: Mapping that may contain the keys
            ``"numerical_correlation"``, ``"numerical_anova"``,
            ``"categorical_anova"`` and ``"categorical_chi2"``; each value is
            read with ``.head(n).items()`` as feature -> score. ``None`` is
            treated as an empty mapping.
        eda_report: Accepted for interface compatibility; not read here.
        decisions: Accepted for interface compatibility; not read here.
        prep_report: Accepted for interface compatibility; not read here.

    Returns:
        None. All output is written with ``print``.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    if feature_scores is None:
        feature_scores = {}

    print(" DATA ANALYSIS REPORT :--")

    duplicate_rows = _print_overview(df)

    y = df[target]
    target_type = _infer_target_type(y)
    _print_target_analysis(y, target, target_type)

    _print_relationships(df, target, feature_scores)

    issues, outlier_cols, high_card = _print_quality(df, target)

    _print_recommendations(target_type, high_card, outlier_cols, duplicate_rows)

    _print_summary(issues, duplicate_rows)