| |
|
|
def _to_float(value):
    """Return ``value`` converted to ``float``, or ``None`` if it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return None


def generate_recommendations(profile: dict, results: dict) -> list:
    """
    Return a prioritised list of actionable data science recommendations.

    Each item is a dict with the keys ``priority``, ``title``, ``detail``
    and ``action``.

    Args:
        profile: Dataset profile. Keys read here: ``column_stats`` (mapping of
            column name to a stats dict with optional ``missing_pct``,
            ``unique`` and ``skew``), ``rows``, ``imbalance``, ``num_cols``
            and ``cat_cols``. Missing keys fall back to neutral defaults.
        results: Model results. Keys read here: ``shap_summary``,
            ``is_classification`` (defaults to ``True``) and ``score``
            (compared against 70 and 85, i.e. treated as a percentage).

    Returns:
        The recommendations sorted by priority (Critical, Important,
        Suggestion, Excellent). Items of equal priority keep the order in
        which they were generated.

    Missing, ``None`` or non-numeric statistics are ignored rather than
    raising, so a partially filled profile still yields recommendations.
    """
    col_stats = profile.get("column_stats") or {}
    if not isinstance(col_stats, dict):
        col_stats = {}
    shap = results.get("shap_summary") or {}
    is_clf = results.get("is_classification", True)

    # An unparsable score is treated as 0, which triggers the low-score advice.
    score = _to_float(results.get("score", 0))
    if score is None:
        score = 0

    # Keep the raw value for display; compare on the coerced float.
    rows = profile.get("rows", 0)
    row_count = _to_float(rows)
    if row_count is None:
        rows, row_count = 0, 0.0

    # Collected separately so the final order stays "all missing-value
    # advice, then constant columns, then skewed columns" within a priority.
    missing_recs = []
    constant_recs = []
    skew_recs = []

    for col, stats in col_stats.items():
        if not isinstance(stats, dict):
            continue

        # repr() of the name yields a valid Python string literal even when
        # the column name contains quotes or backslashes.
        col_ref = repr(str(col))

        mp = stats.get("missing_pct", 0)
        mp_value = _to_float(mp)
        if mp_value is not None:
            if mp_value > 50:
                missing_recs.append({
                    "priority": "🔴 Critical",
                    "title": f"Drop column '{col}'",
                    "detail": f"{mp}% of values are missing — this column adds more noise than signal.",
                    "action": f"df.drop(columns=[{col_ref}], inplace=True)",
                })
            elif mp_value > 20:
                missing_recs.append({
                    "priority": "🟡 Important",
                    "title": f"Review imputation for '{col}'",
                    "detail": f"{mp}% missing. Consider domain-specific fills or median/mode by group.",
                    "action": f"df[{col_ref}].fillna(df[{col_ref}].median(), inplace=True)",
                })

        # A missing "unique" stat defaults to 99, i.e. "not constant".
        unique_count = _to_float(stats.get("unique", 99))
        if unique_count is not None and unique_count <= 1:
            constant_recs.append({
                "priority": "🔴 Critical",
                "title": f"Drop constant column '{col}'",
                "detail": "This column has only one unique value and contributes nothing to the model.",
                "action": f"df.drop(columns=[{col_ref}], inplace=True)",
            })

        skew_val = _to_float(stats.get("skew"))
        if skew_val is not None and abs(skew_val) > 5:
            skew_recs.append({
                "priority": "🟡 Important",
                "title": f"Log-transform skewed feature '{col}'",
                "detail": f"Skewness = {skew_val:.2f}. Heavy skew hurts linear models and slows tree convergence.",
                "action": f"import numpy as np\ndf[{col_ref}] = np.log1p(df[{col_ref}].clip(lower=0))",
            })

    recs = [*missing_recs, *constant_recs, *skew_recs]

    if profile.get("imbalance") == "High ⚠️" and is_clf:
        recs.append({
            "priority": "🔴 Critical",
            "title": "Address class imbalance with SMOTE",
            "detail": "Your target classes are severely skewed. Minority class recall will be poor.",
            "action": "from imblearn.over_sampling import SMOTE\nX_res, y_res = SMOTE().fit_resample(X, y)",
        })

    if row_count < 500:
        recs.append({
            "priority": "🟡 Important",
            "title": "Expand your dataset with Synthetic Data",
            "detail": f"Only {rows} rows detected. Use the Synthetic Generator on the AI Tools page.",
            "action": "Go to 8_AI_Tools → Synthetic Data Generator",
        })

    if score < 70:
        recs.append({
            "priority": "🟡 Important",
            "title": "Try feature engineering",
            "detail": f"Score is {score}%. Consider creating interaction terms or polynomial features.",
            "action": "from sklearn.preprocessing import PolynomialFeatures\npf = PolynomialFeatures(degree=2, interaction_only=True)",
        })

        # Only suggested together with a low score.
        if row_count < 2000:
            recs.append({
                "priority": "🟡 Important",
                "title": "Collect more training data",
                "detail": "Score under 70% with a small dataset usually means the model is data-hungry.",
                "action": "Focus data collection on the features with highest SHAP importance.",
            })

    if isinstance(shap, dict) and shap:
        num_cols = profile.get("num_cols") or []
        cat_cols = profile.get("cat_cols") or []

        # Features that do not appear in the SHAP summary at all.
        unimportant = (set(num_cols) | set(cat_cols)) - set(shap)

        if len(unimportant) >= 3:
            # Sort so the sample is stable across runs; set order is not.
            sample = sorted(str(name) for name in unimportant)[:5]
            recs.append({
                "priority": "🟢 Suggestion",
                "title": f"Consider dropping {len(unimportant)} low-importance features",
                "detail": f"SHAP analysis shows these features contribute minimally: {', '.join(sample)}",
                "action": f"df.drop(columns={sample[:3]}, inplace=True)",
            })

    # Praise is only given when there is nothing else to report.
    if score >= 85 and not recs:
        recs.append({
            "priority": "✅ Excellent",
            "title": "Your pipeline is well-optimised!",
            "detail": f"Score of {score}% with a clean dataset. Consider deploying the model.",
            "action": "Download the export bundle and run the generated API.",
        })

    order = {"🔴 Critical": 0, "🟡 Important": 1, "🟢 Suggestion": 2, "✅ Excellent": 3}
    recs.sort(key=lambda rec: order.get(rec["priority"], 9))

    return recs