Spaces:
Sleeping
Sleeping
"""
visualize.py — Visual diagnostics and statistics for the House Price Predictor.
Adds a "📊 Analytics" tab to the Gradio UI that shows:
1. Feature Importance — XGBoost gain-based + Lasso coefficient bar charts
2. Prediction Distribution — histogram + box plot of predicted prices
3. Residual Analysis — residual vs predicted scatter + Q-Q plot
4. Training Data Stats — target distribution, correlation bar chart, missing-value summary
5. Model Comparison — CV RMSE bar chart across the three base learners
"""
import base64
import io
import os
import warnings

import joblib
import numpy as np
import pandas as pd
import scipy.stats as stats

import matplotlib

matplotlib.use("Agg")  # non-interactive backend for Gradio (must run before pyplot import)

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

warnings.filterwarnings("ignore")
# ── shared style ──────────────────────────────────────────────────────────────
# Green-toned palette, darkest → lightest; indexed throughout the plot helpers.
PALETTE = ["#2D6A4F", "#40916C", "#74C69D", "#B7E4C7", "#D8F3DC"]
ACCENT = "#1B4332"    # dark green — titles and value annotations
WARN = "#E76F51"      # orange-red — medians, zero-lines, negative values
BG = "#F8F9FA"        # axes background
GRID_CLR = "#DEE2E6"  # grid lines and visible spines
def _style_ax(ax, title="", xlabel="", ylabel=""):
    """Apply the module's shared look (background, grid, spines, text) to *ax*."""
    ax.tick_params(colors="#495057", labelsize=8)
    ax.set_facecolor(BG)
    ax.grid(axis="y", color=GRID_CLR, linewidth=0.7, linestyle="--", zorder=0)
    # Hide the top/right frame; recolor the remaining spines to match the grid.
    ax.spines[["top", "right"]].set_visible(False)
    ax.spines[["left", "bottom"]].set_color(GRID_CLR)
    if title:
        ax.set_title(title, fontsize=12, fontweight="bold", pad=10, color=ACCENT)
    if xlabel:
        ax.set_xlabel(xlabel, fontsize=9, color="#495057")
    if ylabel:
        ax.set_ylabel(ylabel, fontsize=9, color="#495057")
def _fig_to_image(fig):
    """Render *fig* to an in-memory PNG and return it as a PIL Image.

    The figure is closed afterwards so repeated Gradio callbacks don't
    accumulate open matplotlib figures in the long-lived server process.
    """
    from PIL import Image  # local import keeps PIL off the module-load path

    buffer = io.BytesIO()
    fig.savefig(buffer, format="png", dpi=130, bbox_inches="tight",
                facecolor=fig.get_facecolor())
    plt.close(fig)
    buffer.seek(0)
    return Image.open(buffer)
| # ── helpers ─────────────────────────────────────────────────────────────────── | |
def _load_artifacts():
    """Load the trained ensemble, preprocessor and metadata from disk.

    Returns the tuple (model, preprocessor, meta); raises FileNotFoundError
    when any of the three artifacts is missing.
    """
    from config import MODEL_PATH, PREPROCESSOR_PATH, META_PATH

    paths = (MODEL_PATH, PREPROCESSOR_PATH, META_PATH)
    if not all(os.path.exists(p) for p in paths):
        raise FileNotFoundError("No trained model found. Train the model first.")
    model, preprocessor, meta = (joblib.load(p) for p in paths)
    return model, preprocessor, meta
| def _feature_names(preprocessor, meta): | |
| """Reconstruct feature names after ColumnTransformer.""" | |
| num_feats = meta["numerical_features"] | |
| try: | |
| cat_enc = preprocessor.named_transformers_["cat"].named_steps["onehot"] | |
| cat_feats = cat_enc.get_feature_names_out(meta["categorical_features"]).tolist() | |
| except Exception: | |
| cat_feats = [] | |
| return num_feats + cat_feats | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # PLOT 1 — Feature Importance | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def plot_feature_importance():
    """Bar charts of XGBoost gain importance and Lasso coefficient magnitudes.

    Returns (PIL.Image | None, status_message). Any failure (no trained
    model, feature-name mismatch, ...) is reported in the message instead
    of raising, so the Gradio callback never crashes the UI.
    """
    try:
        ensemble, preprocessor, meta = _load_artifacts()
        feature_names = _feature_names(preprocessor, meta)
        n = 20  # top-N features shown per panel

        estimators = dict(ensemble.named_estimators_)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6), facecolor="white")
        fig.suptitle("Feature Importance", fontsize=15, fontweight="bold", color=ACCENT, y=1.01)

        # ── XGBoost gain importance ──
        ax = axes[0]
        xgb_model = estimators.get("xgb")
        if xgb_model is not None:
            raw_imp = xgb_model.feature_importances_
            # Guard against a name/importance length mismatch after encoding.
            n_feat = min(len(raw_imp), len(feature_names))
            imp = pd.Series(raw_imp[:n_feat], index=feature_names[:n_feat])
            top = imp.nlargest(n).sort_values()
            bars = ax.barh(top.index, top.values, color=PALETTE[1], edgecolor="white", height=0.65)
            for bar, val in zip(bars, top.values):
                ax.text(val + top.values.max() * 0.01, bar.get_y() + bar.get_height() / 2,
                        f"{val:.4f}", va="center", fontsize=7, color=ACCENT)
            _style_ax(ax, f"XGBoost — Top {n} Features (Gain)", "Importance", "")
        else:
            ax.text(0.5, 0.5, "XGBoost not available", ha="center", va="center")

        # ── Lasso coefficients ──
        ax = axes[1]
        lasso_model = estimators.get("lasso")
        if lasso_model is not None:
            n_coef = min(len(lasso_model.coef_), len(feature_names))
            # BUGFIX: keep the SIGNED coefficients. The previous version took
            # abs() before choosing colors, so every bar was positive and the
            # WARN color for negative effects could never appear. Rank by
            # magnitude but color by sign (green = positive, orange = negative).
            signed = pd.Series(lasso_model.coef_[:n_coef], index=feature_names[:n_coef])
            top = signed.abs().nlargest(n).sort_values()
            colors = [PALETTE[0] if signed[name] > 0 else WARN for name in top.index]
            bars = ax.barh(top.index, top.values, color=colors, edgecolor="white", height=0.65)
            for bar, val in zip(bars, top.values):
                ax.text(val + top.values.max() * 0.01, bar.get_y() + bar.get_height() / 2,
                        f"{val:.4f}", va="center", fontsize=7, color=ACCENT)
            _style_ax(ax, f"Lasso — Top {n} |Coefficients|", "|Coefficient|", "")
        else:
            ax.text(0.5, 0.5, "Lasso not available", ha="center", va="center")

        fig.tight_layout()
        return _fig_to_image(fig), "✅ Feature importance loaded."
    except Exception as e:
        return None, f"❌ {e}"
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # PLOT 2 — Prediction Distribution (requires test CSV) | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def plot_prediction_distribution(test_file):
    """Histogram + box plot of predicted sale prices for an uploaded test.csv.

    Returns (PIL.Image | None, status_message); errors are reported in the
    message rather than raised.
    """
    try:
        if test_file is None:
            return None, "Please upload a test.csv file."
        ensemble, preprocessor, meta = _load_artifacts()
        from predict import _prepare

        path = getattr(test_file, "name", test_file)
        frame = pd.read_csv(path)
        features = _prepare(frame, meta)
        # Model was trained on log1p(price); invert with expm1.
        preds = np.expm1(ensemble.predict(preprocessor.transform(features)))

        fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(13, 5), facecolor="white")
        fig.suptitle("Predicted Sale Price Distribution", fontsize=15, fontweight="bold", color=ACCENT)

        # Histogram with mean / median reference lines.
        ax_hist.hist(preds, bins=40, color=PALETTE[1], edgecolor="white", alpha=0.85)
        ax_hist.axvline(np.median(preds), color=WARN, linewidth=1.8, linestyle="--",
                        label=f"Median: ${np.median(preds):,.0f}")
        ax_hist.axvline(np.mean(preds), color=ACCENT, linewidth=1.8, linestyle="-",
                        label=f"Mean: ${np.mean(preds):,.0f}")
        ax_hist.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"${x/1e3:.0f}k"))
        ax_hist.legend(fontsize=8)
        _style_ax(ax_hist, "Histogram", "Predicted Price", "Count")

        # Box plot overlaid with jittered raw points.
        ax_box.boxplot(preds, vert=True, patch_artist=True, widths=0.4,
                       boxprops=dict(facecolor=PALETTE[2], color=ACCENT),
                       medianprops=dict(color=WARN, linewidth=2),
                       whiskerprops=dict(color=ACCENT),
                       capprops=dict(color=ACCENT),
                       flierprops=dict(marker="o", color=PALETTE[0], alpha=0.3, markersize=3))
        jitter = np.random.uniform(-0.15, 0.15, size=len(preds))
        ax_box.scatter(1 + jitter, preds, alpha=0.12, s=6, color=PALETTE[0], zorder=3)
        ax_box.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"${y/1e3:.0f}k"))
        _style_ax(ax_box, "Box Plot + Jitter", "", "Predicted Price")
        ax_box.set_xticks([])

        # One-line five-number summary under both panels.
        summary = (f"n={len(preds):,} min=${preds.min():,.0f} "
                   f"Q1=${np.percentile(preds,25):,.0f} median=${np.median(preds):,.0f} "
                   f"Q3=${np.percentile(preds,75):,.0f} max=${preds.max():,.0f}")
        fig.text(0.5, -0.02, summary, ha="center", fontsize=8, color="#6C757D")
        fig.tight_layout()
        return _fig_to_image(fig), f"✅ Predictions generated for {len(preds):,} houses."
    except Exception as e:
        return None, f"❌ {e}"
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # PLOT 3 — Residual Analysis (requires train CSV to compute in-sample) | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def plot_residuals(train_file):
    """In-sample residual diagnostics: scatter, histogram and normal Q-Q plot.

    Requires an uploaded train.csv with a SalePrice column. Returns
    (PIL.Image | None, status_message); errors are reported in the message.
    """
    try:
        if train_file is None:
            return None, "Please upload train.csv to compute residuals."
        ensemble, preprocessor, meta = _load_artifacts()
        from predict import _prepare

        path = getattr(train_file, "name", train_file)
        frame = pd.read_csv(path)
        if "SalePrice" not in frame.columns:
            return None, "train.csv must contain a SalePrice column."
        y_true = frame["SalePrice"].copy()
        frame = frame.drop(columns=["SalePrice"], errors="ignore")
        # Predictions come back in log space; expm1 restores dollars.
        y_pred = np.expm1(ensemble.predict(preprocessor.transform(_prepare(frame, meta))))
        residuals = y_true.values - y_pred

        fig, (ax_scatter, ax_hist, ax_qq) = plt.subplots(1, 3, figsize=(16, 5), facecolor="white")
        fig.suptitle("Residual Analysis (In-Sample)", fontsize=15, fontweight="bold", color=ACCENT)

        # Residuals vs predicted — should hover around zero with no trend.
        ax_scatter.scatter(y_pred, residuals, alpha=0.25, s=12, color=PALETTE[1])
        ax_scatter.axhline(0, color=WARN, linewidth=1.5, linestyle="--")
        ax_scatter.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"${x/1e3:.0f}k"))
        ax_scatter.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"${y/1e3:.0f}k"))
        _style_ax(ax_scatter, "Residuals vs Predicted", "Predicted Price", "Residual")

        # Residual histogram — roughly symmetric about zero if unbiased.
        ax_hist.hist(residuals, bins=50, color=PALETTE[1], edgecolor="white", alpha=0.85)
        ax_hist.axvline(0, color=WARN, linewidth=1.5, linestyle="--")
        ax_hist.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"${x/1e3:.0f}k"))
        _style_ax(ax_hist, "Residual Distribution", "Residual", "Count")

        # Normal Q-Q plot — points on the line mean normally distributed residuals.
        (osm, osr), (slope, intercept, r) = stats.probplot(residuals, dist="norm")
        ax_qq.scatter(osm, osr, alpha=0.3, s=12, color=PALETTE[1])
        endpoints = np.array([osm[0], osm[-1]])
        ax_qq.plot(endpoints, slope * endpoints + intercept, color=WARN, linewidth=1.8)
        _style_ax(ax_qq, f"Q-Q Plot (R²={r**2:.3f})", "Theoretical Quantiles", "Sample Quantiles")

        rmse = np.sqrt(np.mean(residuals ** 2))
        mae = np.mean(np.abs(residuals))
        fig.text(0.5, -0.02,
                 f"In-sample RMSE: ${rmse:,.0f} | MAE: ${mae:,.0f}",
                 ha="center", fontsize=9, color="#6C757D")
        fig.tight_layout()
        return _fig_to_image(fig), f"✅ Residuals computed. RMSE=${rmse:,.0f} MAE=${mae:,.0f}"
    except Exception as e:
        return None, f"❌ {e}"
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # PLOT 4 — Training Data Statistics (requires train CSV) | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def plot_data_stats(train_file):
    """Six-panel overview of the raw training data.

    Panels: SalePrice histogram, log-price histogram, missing-value rates,
    price-by-quality box plots, top |correlations| with SalePrice, and a
    GrLivArea scatter colored by overall quality.

    Returns (PIL.Image | None, status_message); errors are reported in the
    message so the Gradio callback never raises.
    """
    try:
        if train_file is None:
            return None, "Please upload train.csv."
        train_path = train_file.name if hasattr(train_file, "name") else train_file
        df = pd.read_csv(train_path)
        # Explicit guard: a clear message instead of a raw KeyError below.
        if "SalePrice" not in df.columns:
            return None, "train.csv must contain a SalePrice column."

        fig = plt.figure(figsize=(16, 10), facecolor="white")
        fig.suptitle("Training Data Statistics", fontsize=15, fontweight="bold", color=ACCENT, y=1.01)
        gs = gridspec.GridSpec(2, 3, figure=fig, hspace=0.45, wspace=0.35)

        # ── SalePrice distribution ──
        ax = fig.add_subplot(gs[0, 0])
        ax.hist(df["SalePrice"], bins=50, color=PALETTE[1], edgecolor="white", alpha=0.85)
        ax.axvline(df["SalePrice"].median(), color=WARN, linewidth=1.5, linestyle="--",
                   label=f"Median ${df['SalePrice'].median()/1e3:.0f}k")
        ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"${x/1e3:.0f}k"))
        ax.legend(fontsize=7)
        _style_ax(ax, "SalePrice Distribution", "Sale Price", "Count")

        # ── Log SalePrice (the model's actual target scale) ──
        ax = fig.add_subplot(gs[0, 1])
        ax.hist(np.log1p(df["SalePrice"]), bins=50, color=PALETTE[0], edgecolor="white", alpha=0.85)
        _style_ax(ax, "log(SalePrice) Distribution", "log(1 + SalePrice)", "Count")

        # ── Missing values (top 15 columns with any missing data) ──
        ax = fig.add_subplot(gs[0, 2])
        missing = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False).head(15)
        missing = missing[missing > 0]
        if len(missing):
            bars = ax.barh(missing.index[::-1], missing.values[::-1],
                           color=WARN, edgecolor="white", height=0.6)
            for bar, val in zip(bars, missing.values[::-1]):
                ax.text(val + 0.3, bar.get_y() + bar.get_height() / 2,
                        f"{val:.1f}%", va="center", fontsize=7, color=ACCENT)
        _style_ax(ax, "Missing Values (top 15)", "Missing %", "")

        # ── Overall Quality vs Price ──
        ax = fig.add_subplot(gs[1, 0])
        if "OverallQual" in df.columns:
            quality_levels = sorted(df["OverallQual"].unique())
            groups = [df.loc[df["OverallQual"] == q, "SalePrice"].values for q in quality_levels]
            ax.boxplot(groups, labels=quality_levels, patch_artist=True,
                       boxprops=dict(facecolor=PALETTE[2], color=ACCENT),
                       medianprops=dict(color=WARN, linewidth=1.8),
                       whiskerprops=dict(color=ACCENT), capprops=dict(color=ACCENT),
                       flierprops=dict(marker=".", color=PALETTE[0], alpha=0.3, markersize=4))
            ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"${y/1e3:.0f}k"))
        _style_ax(ax, "Price by Overall Quality", "Quality Score", "Sale Price")

        # ── Correlation with SalePrice (top 12 numerics) ──
        ax = fig.add_subplot(gs[1, 1])
        num_df = df.select_dtypes(include=[np.number]).drop(columns=["Id"], errors="ignore")
        # PERF: compute the correlation matrix once (O(cols²·rows)); the
        # previous version recomputed it to recover the signed values.
        price_corr = num_df.corr()["SalePrice"].drop("SalePrice")
        corr = price_corr.abs().sort_values(ascending=False).head(12)
        corr_signed = price_corr.loc[corr.index]
        colors = [PALETTE[0] if v > 0 else WARN for v in corr_signed.values]
        ax.barh(corr.index[::-1], corr.values[::-1], color=colors[::-1], edgecolor="white", height=0.65)
        _style_ax(ax, "Top Correlations with SalePrice", "|Pearson r|", "")

        # ── Scatter GrLivArea vs SalePrice, colored by quality ──
        ax = fig.add_subplot(gs[1, 2])
        if "GrLivArea" in df.columns:
            sc = ax.scatter(df["GrLivArea"], df["SalePrice"],
                            alpha=0.25, s=10, c=df.get("OverallQual", pd.Series(5, index=df.index)),
                            cmap="YlGn", edgecolors="none")
            plt.colorbar(sc, ax=ax, label="Overall Quality", shrink=0.8)
            ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"${y/1e3:.0f}k"))
        _style_ax(ax, "GrLivArea vs SalePrice", "Above-Grade Living Area (sqft)", "Sale Price")

        return _fig_to_image(fig), f"✅ Stats for {len(df):,} training samples loaded."
    except Exception as e:
        return None, f"❌ {e}"
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # PLOT 5 — Model CV Comparison (reads saved meta) | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def plot_model_comparison():
    """Bar chart comparing cross-validation RMSE of the base learners.

    Reads the cv_scores mapping persisted in the model metadata — expected
    shape {model_name: {"rmse": float, "std": float}} (TODO confirm against
    the training script). Returns (PIL.Image | None, status_message).
    """
    try:
        _, _, meta = _load_artifacts()
        cv_scores = meta.get("cv_scores", None)
        # ROBUSTNESS: treat an empty dict like a missing key. Previously an
        # empty-but-present cv_scores passed the `is None` check and then
        # crashed at max(stds) on an empty sequence.
        if not cv_scores:
            return None, ("ℹ️ CV score details not stored in this model version.\n"
                          "Re-train to enable this chart.")
        models = list(cv_scores.keys())
        rmses = [cv_scores[m]["rmse"] for m in models]
        stds = [cv_scores[m].get("std", 0) for m in models]

        fig, ax = plt.subplots(figsize=(7, 4), facecolor="white")
        x = np.arange(len(models))
        bars = ax.bar(x, rmses, yerr=stds, color=PALETTE[:len(models)],
                      edgecolor="white", width=0.45, capsize=6,
                      error_kw=dict(ecolor=ACCENT, elinewidth=1.5))
        label_offset = max(stds) * 0.05  # lift value labels above the error bars
        for bar, val in zip(bars, rmses):
            ax.text(bar.get_x() + bar.get_width() / 2, val + label_offset,
                    f"{val:.4f}", ha="center", va="bottom", fontsize=9, fontweight="bold", color=ACCENT)
        ax.set_xticks(x)
        ax.set_xticklabels(models, fontsize=10)
        _style_ax(ax, "Cross-Validation RMSE (log scale)", "Model", "CV RMSE (log)")
        fig.tight_layout()
        return _fig_to_image(fig), "✅ Model comparison loaded."
    except Exception as e:
        return None, f"❌ {e}"
| # ══════════════════════════════════════════════════════════════════════════════ | |
| # Gradio Tab builder — call this from app.py | |
| # ══════════════════════════════════════════════════════════════════════════════ | |
def build_analytics_tab():
    """
    Build and return the "📊 Analytics" gr.Tab.

    Import and embed it inside the gr.Tabs() block in app.py:

        from visualize import build_analytics_tab
        with gr.Tabs():
            ...existing tabs...
            build_analytics_tab()
    """
    import gradio as gr

    def _chart_tab(title, blurb, fn, image_label, button_label, upload_label=None):
        # One inner tab: optional CSV upload, a trigger button, the rendered
        # chart image, and a status markdown line wired to *fn*.
        with gr.Tab(title):
            gr.Markdown(blurb)
            chart_inputs = []
            if upload_label is not None:
                chart_inputs.append(gr.File(label=upload_label, file_types=[".csv"]))
            button = gr.Button(button_label, variant="primary")
            image = gr.Image(label=image_label, type="pil")
            status = gr.Markdown()
            button.click(fn=fn, inputs=chart_inputs, outputs=[image, status])

    with gr.Tab("📊 Analytics") as tab:
        gr.Markdown(
            "### Visual Diagnostics\n"
            "Explore model internals, data statistics, predictions and residuals.\n"
            "> **Tip:** Train the model first; some charts also need a CSV upload."
        )
        with gr.Tabs():
            _chart_tab("Feature Importance",
                       "XGBoost gain-based importance **and** Lasso |coefficients|.",
                       plot_feature_importance,
                       "Feature Importance", "Load Feature Importance")
            _chart_tab("Prediction Distribution",
                       "Upload **test.csv** to visualise the distribution of predicted prices.",
                       plot_prediction_distribution,
                       "Prediction Distribution", "Generate Distribution",
                       upload_label="Upload test.csv")
            _chart_tab("Residual Analysis",
                       "Upload **train.csv** to compute in-sample residuals.",
                       plot_residuals,
                       "Residual Analysis", "Analyse Residuals",
                       upload_label="Upload train.csv")
            _chart_tab("Data Statistics",
                       "Upload **train.csv** to explore raw data distributions and correlations.",
                       plot_data_stats,
                       "Data Statistics", "Show Data Stats",
                       upload_label="Upload train.csv")
            _chart_tab("Model Comparison",
                       "CV RMSE across base learners.",
                       plot_model_comparison,
                       "Model Comparison", "Load Model Comparison")
    return tab