Spaces:

abhiraj12
/

Auto_ML

Paused

App Files Files Community

Auto_ML / backend /core /recommendations.py

abhiraj12

Initial commit

2c29579 about 2 months ago

raw

history blame contribute delete

6.97 kB

	# ── Smart Recommendations ─────────────────────────────────────────────────────

	def _to_float(value, default):
	    """Return ``float(value)``, or *default* when *value* is not numeric.

	    Profile/result payloads may hold ``None`` or free-form text where a
	    number is expected; each caller passes the fallback that keeps its
	    rule from firing (or from crashing) in that case.
	    """
	    try:
	        return float(value)
	    except (TypeError, ValueError, OverflowError):
	        return default


	def generate_recommendations(profile: dict, results: dict) -> list:
	    """
	    Return a prioritised list of actionable data science recommendations.

	    Each item is a dict with the keys ``priority``, ``title``, ``detail``
	    and ``action``. Items are ordered Critical -> Important -> Suggestion
	    -> Excellent; within one priority the rule order below is kept.

	    Keys read from *profile*: ``column_stats`` (column -> dict with optional
	    ``missing_pct``, ``unique``, ``skew``), ``rows``, ``imbalance``,
	    ``num_cols`` and ``cat_cols``.
	    Keys read from *results*: ``shap_summary``, ``is_classification``
	    (defaults to True) and ``score``.

	    Missing, ``None`` or non-numeric values never raise: the affected rule
	    is skipped, except that an unreadable ``score`` or ``rows`` is treated
	    as 0, exactly like an absent key.
	    """
	    profile = profile or {}
	    results = results or {}
	    recs = []

	    # Safe defaults: every lookup tolerates an absent or None entry.
	    col_stats = profile.get("column_stats") or {}
	    if not isinstance(col_stats, dict):
	        col_stats = {}
	    shap = results.get("shap_summary") or {}
	    is_clf = results.get("is_classification", True)
	    num_cols = list(profile.get("num_cols") or [])
	    cat_cols = list(profile.get("cat_cols") or [])

	    # Safe score: anything float() rejects counts as 0.
	    score = _to_float(results.get("score", 0), 0)

	    # `rows` keeps the caller's value for display; `row_count` is the
	    # numeric form used in comparisons.
	    rows = profile.get("rows", 0)
	    row_count = _to_float(rows, None)
	    if row_count is None:
	        rows, row_count = 0, 0.0

	    # Only dict-shaped per-column entries can be inspected.
	    per_column = [
	        (col, stats) for col, stats in col_stats.items()
	        if isinstance(stats, dict)
	    ]

	    # ── Missing values ────────────────────────────────────────────────────
	    flagged_for_drop = set()
	    for col, stats in per_column:
	        missing_raw = stats.get("missing_pct", 0)
	        missing = _to_float(missing_raw, 0.0)
	        if missing > 50:
	            flagged_for_drop.add(col)
	            recs.append({
	                "priority": "🔴 Critical",
	                "title": f"Drop column '{col}'",
	                "detail": f"{missing_raw}% of values are missing — this column adds more noise than signal.",
	                # {col!r} yields a valid Python literal even for names that
	                # contain quotes or are not strings.
	                "action": f"df.drop(columns=[{col!r}], inplace=True)",
	            })
	        elif missing > 20:
	            # median() is undefined for categorical data, so fall back to
	            # the most frequent value for columns listed in cat_cols.
	            if col in cat_cols:
	                fill = f"df[{col!r}].mode().iloc[0]"
	            else:
	                fill = f"df[{col!r}].median()"
	            recs.append({
	                "priority": "🟡 Important",
	                "title": f"Review imputation for '{col}'",
	                "detail": f"{missing_raw}% missing. Consider domain-specific fills or median/mode by group.",
	                # Assign the result back: fillna(..., inplace=True) on a
	                # column selection is chained assignment and does not
	                # update df under pandas Copy-on-Write.
	                "action": f"df[{col!r}] = df[{col!r}].fillna({fill})",
	            })

	    # ── Zero variance ─────────────────────────────────────────────────────
	    for col, stats in per_column:
	        if col in flagged_for_drop:
	            # Already recommended for removal above; do not repeat the
	            # same action under a second title.
	            continue
	        if _to_float(stats.get("unique", 99), 99.0) <= 1:
	            recs.append({
	                "priority": "🔴 Critical",
	                "title": f"Drop constant column '{col}'",
	                "detail": "This column has only one unique value and contributes nothing to the model.",
	                "action": f"df.drop(columns=[{col!r}], inplace=True)",
	            })

	    # ── High skew ─────────────────────────────────────────────────────────
	    for col, stats in per_column:
	        skew_val = _to_float(stats.get("skew"), None)
	        if skew_val is None or not abs(skew_val) > 5:
	            continue
	        recs.append({
	            "priority": "🟡 Important",
	            "title": f"Log-transform skewed feature '{col}'",
	            "detail": f"Skewness = {skew_val:.2f}. Heavy skew hurts linear models and slows tree convergence.",
	            "action": f"import numpy as np\ndf[{col!r}] = np.log1p(df[{col!r}].clip(lower=0))",
	        })

	    # ── Class imbalance ───────────────────────────────────────────────────
	    if profile.get("imbalance") == "High ⚠️" and is_clf:
	        recs.append({
	            "priority": "🔴 Critical",
	            "title": "Address class imbalance with SMOTE",
	            "detail": "Your target classes are severely skewed. Minority class recall will be poor.",
	            "action": "from imblearn.over_sampling import SMOTE\nX_res, y_res = SMOTE().fit_resample(X, y)",
	        })

	    # ── Small dataset ─────────────────────────────────────────────────────
	    if row_count < 500:
	        recs.append({
	            "priority": "🟡 Important",
	            "title": "Expand your dataset with Synthetic Data",
	            "detail": f"Only {rows} rows detected. Use the Synthetic Generator on the AI Tools page.",
	            "action": "Go to 8_AI_Tools → Synthetic Data Generator",
	        })

	    # ── Low score suggestions ─────────────────────────────────────────────
	    if score < 70:
	        recs.append({
	            "priority": "🟡 Important",
	            "title": "Try feature engineering",
	            "detail": f"Score is {score}%. Consider creating interaction terms or polynomial features.",
	            "action": "from sklearn.preprocessing import PolynomialFeatures\npf = PolynomialFeatures(degree=2, interaction_only=True)",
	        })

	        if row_count < 2000:
	            recs.append({
	                "priority": "🟡 Important",
	                "title": "Collect more training data",
	                "detail": "Score under 70% with a small dataset usually means the model is data-hungry.",
	                "action": "Focus data collection on the features with highest SHAP importance.",
	            })

	    # ── SHAP-based feature pruning ────────────────────────────────────────
	    if isinstance(shap, dict) and shap:
	        # Features known to the profile but absent from the SHAP summary.
	        unimportant = (set(num_cols) | set(cat_cols)) - set(shap)

	        if len(unimportant) >= 3:
	            # Set iteration order varies between runs; sort (by str, so
	            # mixed label types compare) to keep the advice reproducible.
	            sample = sorted(unimportant, key=str)[:5]
	            recs.append({
	                "priority": "🟢 Suggestion",
	                "title": f"Consider dropping {len(unimportant)} low-importance features",
	                "detail": f"SHAP analysis shows these features contribute minimally: {', '.join(map(str, sample))}",
	                "action": f"df.drop(columns={sample[:3]}, inplace=True)",
	            })

	    # ── Positive confirmation ────────────────────────────────────────────
	    if score >= 85 and not recs:
	        recs.append({
	            "priority": "✅ Excellent",
	            "title": "Your pipeline is well-optimised!",
	            "detail": f"Score of {score}% with a clean dataset. Consider deploying the model.",
	            "action": "Download the export bundle and run the generated API.",
	        })

	    # Sort: Critical first. list.sort is stable, so rule order is preserved
	    # within each priority; unknown priorities sink to the end.
	    order = {"🔴 Critical": 0, "🟡 Important": 1, "🟢 Suggestion": 2, "✅ Excellent": 3}
	    recs.sort(key=lambda r: order.get(r["priority"], 9))

	    return recs