GBDB02 committed on
Commit
97b9bf6
·
verified ·
1 Parent(s): 9ff9d93

Upload 8 files

Browse files
Files changed (8) hide show
  1. app (1).py +102 -0
  2. config.py +25 -0
  3. packages.txt +1 -0
  4. predict.py +135 -0
  5. preprocessing.py +82 -0
  6. preprocessing.r +184 -0
  7. requirements (1).txt +7 -0
  8. train.py +172 -0
app (1).py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — Gradio UI entry point.
3
+ """
4
+
5
+ import sys
6
+ print("Python started", flush=True)
7
+
8
+ try:
9
+ import gradio as gr
10
+ print("gradio OK", flush=True)
11
+ except Exception as e:
12
+ print(f"FAILED gradio: {e}", flush=True)
13
+ sys.exit(1)
14
+
15
+ try:
16
+ from config import QUICK_FIELDS
17
+ print("config OK", flush=True)
18
+ except Exception as e:
19
+ print(f"FAILED config: {e}", flush=True)
20
+ sys.exit(1)
21
+
22
+ try:
23
+ from train import train_model
24
+ print("train OK", flush=True)
25
+ except Exception as e:
26
+ print(f"FAILED train: {e}", flush=True)
27
+ sys.exit(1)
28
+
29
+ try:
30
+ from predict import predict, predict_single
31
+ print("predict OK", flush=True)
32
+ except Exception as e:
33
+ print(f"FAILED predict: {e}", flush=True)
34
+ sys.exit(1)
35
+
36
+ print("Building UI...", flush=True)
37
+
38
+ import gradio as gr
39
+ from config import QUICK_FIELDS
40
+ from train import train_model
41
+ from predict import predict, predict_single
42
+
43
+ DESCRIPTION = """
44
+ # House Price Predictor
45
+ **Stacking Ensemble: Lasso + Random Forest + XGBoost**
46
+
47
+ *Final Project- Giovanni Battista Del Basso, Francesco Ciccarese, Miguel Domingos, Santiago Genoni*
48
+
49
+ ---
50
+ **How to use:**
51
+ The model uses the Kaggle competition: "Housing Prices Competition for Kaggle Learn Users".
52
+ To use the model, upload the train and test set that you can find in the competition at this link: "https://www.kaggle.com/competitions/home-data-for-ml-course/overview"
53
+ 1. **Train Model** - Upload `train.csv` and click *Train Model*
54
+ 2. **Predict (CSV)** - Upload `test.csv` to download a full Kaggle submission file
55
+ 3. **Single House** - Fill in the key features to get a single price estimate
56
+ """
57
+
58
+ with gr.Blocks(title="House Price Predictor", theme=gr.themes.Soft()) as demo:
59
+
60
+ gr.Markdown(DESCRIPTION)
61
+
62
+ with gr.Tabs():
63
+
64
+ # ── Tab 1: Train ──────────────────────────────────────────────────────
65
+ with gr.Tab("Train Model"):
66
+ gr.Markdown(
67
+ "Upload your **train.csv** (Kaggle House Prices format) and train the stacking ensemble.\n\n"
68
+ ">Please wait after clicking!"
69
+ )
70
+ train_file = gr.File(label="Upload train.csv", file_types=[".csv"])
71
+ train_btn = gr.Button("Train Model", variant="primary")
72
+ train_out = gr.Markdown(value="Results will appear here after training.")
73
+ train_btn.click(fn=train_model, inputs=[train_file], outputs=[train_out])
74
+
75
+ # ── Tab 2: Predict CSV ────────────────────────────────────────────────
76
+ with gr.Tab("Predict (CSV)"):
77
+ gr.Markdown("Upload **test.csv** to generate a submission.csv ready for Kaggle.")
78
+ test_file = gr.File(label="Upload test.csv", file_types=[".csv"])
79
+ pred_btn = gr.Button("Generate Predictions", variant="primary")
80
+ pred_msg = gr.Markdown(value="Predictions will appear here.")
81
+ dl_file = gr.File(label="Download submission.csv")
82
+ pred_btn.click(fn=predict, inputs=[test_file], outputs=[dl_file, pred_msg])
83
+
84
+ # ── Tab 3: Single House ───────────────────────────────────────────────
85
+ with gr.Tab("Single House"):
86
+ gr.Markdown(
87
+ "Fill in the **8 key features** below to get a price estimate.\n\n"
88
+ "> All other house features are automatically filled with the **median values from the training data**."
89
+ )
90
+ inputs = []
91
+ with gr.Row():
92
+ for key, (label, default) in QUICK_FIELDS.items():
93
+ inputs.append(gr.Number(label=label, value=default))
94
+
95
+ single_btn = gr.Button("Estimate Price", variant="primary")
96
+ single_out = gr.Markdown(value="Your estimate will appear here.")
97
+ single_btn.click(fn=predict_single, inputs=inputs, outputs=[single_out])
98
+
99
+ gr.Markdown("---\nBuilt with Scikit-learn, XGBoost and Gradio.")
100
+
101
+ if __name__ == "__main__":
102
+ demo.launch(ssr_mode=False)
config.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ config.py — Shared constants used across all modules.
3
+ """
4
+
5
+ # Paths for saved model artefacts
6
+ MODEL_PATH = "stacking_model.joblib"
7
+ PREPROCESSOR_PATH = "preprocessor.joblib"
8
+ META_PATH = "model_meta.joblib"
9
+
10
+ # Preprocessing settings
11
+ COLUMNS_TO_DROP_BASE = ["MoSold", "YrSold", "Id"]
12
+ SKEW_THRESHOLD = 0.75
13
+
14
+ # The 8 fields shown in the Single House tab
15
+ # { column_name: (label, default_placeholder) }
16
+ QUICK_FIELDS = {
17
+ "GrLivArea": ("Above-Grade Living Area (sqft)", 1500),
18
+ "OverallQual": ("Overall Quality (1-10)", 7),
19
+ "YearBuilt": ("Year Built", 2000),
20
+ "TotalBsmtSF": ("Total Basement SF", 800),
21
+ "GarageArea": ("Garage Area (sqft)", 400),
22
+ "FullBath": ("Full Bathrooms", 2),
23
+ "BedroomAbvGr": ("Bedrooms Above Grade", 3),
24
+ "LotArea": ("Lot Area (sqft)", 8000),
25
+ }
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ r-base
predict.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ predict.py — Prediction logic.
3
+ """
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+ import numpy as np
8
+ import joblib
9
+ import os
10
+ import tempfile
11
+
12
+ from config import MODEL_PATH, PREPROCESSOR_PATH, META_PATH, QUICK_FIELDS, COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
13
+ from preprocessing import engineer_features
14
+
15
+
16
def _load_model():
    """Load the persisted ensemble, preprocessor and metadata.

    Raises FileNotFoundError with a user-facing message when any of the
    three artefacts is missing (i.e. training has not been run yet).
    """
    artefact_paths = (MODEL_PATH, PREPROCESSOR_PATH, META_PATH)
    if not all(os.path.exists(p) for p in artefact_paths):
        raise FileNotFoundError("No trained model found. Please go to the Train Model tab first.")
    return tuple(joblib.load(p) for p in artefact_paths)
22
+
23
+
24
def _prepare(df: pd.DataFrame, meta: dict) -> pd.DataFrame:
    """
    Applies the exact same steps as training:
    1. Drop columns
    2. Engineer features
    3. Fill missing
    4. Log-transform the exact skewed_cols saved during training
    5. Align to selected features

    `meta` is the dict persisted by train.py (selected/numerical/categorical
    feature lists, plus the skewed columns). Returns a frame with exactly
    the training columns, ready for the fitted preprocessor.
    """
    selected_features = meta["selected_features"]
    numerical_features = meta["numerical_features"]
    categorical_features = meta["categorical_features"]
    # Use saved skewed_cols if available, otherwise fall back to recalculating
    skewed_cols = meta.get("skewed_cols", None)

    # Step 1: drop — same base columns as training, plus the train-only
    # "source" marker and the Id key.
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    df = df.drop(columns=list(set(to_drop + ["source", "Id"])), errors="ignore")

    # Step 2: engineer features
    df = engineer_features(df)

    # Step 3: fill missing — categorical NaNs become the "Missing" category;
    # numeric NaNs get the column median (0 for a single-row frame, where a
    # median over one value would just echo the NaN).
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        fill_val = df[col].median() if len(df) > 1 else 0
        df[col] = df[col].fillna(fill_val).astype(float)

    # Step 4: log-transform
    if skewed_cols is not None:
        # Use exact columns from training
        for col in skewed_cols:
            if col in df.columns:
                # clip(lower=0) guards log1p against negative values
                df[col] = np.log1p(df[col].clip(lower=0))
    else:
        # Fallback: recalculate skewness (only works reliably on large datasets)
        num_cols = df.select_dtypes(include=[np.number]).columns
        skewed = df[num_cols].apply(lambda x: x.dropna().skew())
        for col in skewed[abs(skewed) > SKEW_THRESHOLD].index:
            df[col] = np.log1p(df[col].clip(lower=0))

    # Step 5: align to training columns — add any column the model expects
    # but the upload lacks, with a neutral default per feature type.
    for col in selected_features:
        if col not in df.columns:
            df[col] = "Missing" if col in categorical_features else 0.0

    # Reorder to the exact training column order and coerce dtypes so the
    # fitted ColumnTransformer sees the same schema it was fit on.
    X = df[selected_features].copy()
    for col in numerical_features:
        if col in X.columns:
            X[col] = pd.to_numeric(X[col], errors="coerce").fillna(0).astype(float)
    for col in categorical_features:
        if col in X.columns:
            X[col] = X[col].fillna("Missing").astype(str)

    return X
80
+
81
+
82
def predict(test_file, progress=gr.Progress()):
    """Generate a Kaggle submission CSV from an uploaded test file.

    Returns a (file_path, markdown_message) pair; on any failure the file
    slot is None and the message explains what went wrong.
    """
    try:
        if test_file is None:
            return None, "Please upload a test.csv file."

        progress(0.10, desc="Loading model...")
        ensemble, preprocessor, meta = _load_model()

        progress(0.30, desc="Loading test data...")
        csv_path = getattr(test_file, "name", test_file)
        raw = pd.read_csv(csv_path)
        # Keep the Id column for the submission; synthesize one if absent.
        ids = raw["Id"] if "Id" in raw.columns else pd.RangeIndex(len(raw))

        progress(0.55, desc="Preprocessing...")
        features = _prepare(raw, meta)

        progress(0.75, desc="Predicting...")
        # Model was trained on log1p(SalePrice); invert with expm1.
        sale_prices = np.expm1(ensemble.predict(preprocessor.transform(features)))

        submission = pd.DataFrame({"Id": ids, "SalePrice": sale_prices})
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", prefix="submission_")
        submission.to_csv(out_file.name, index=False)

        preview = submission.head(10).to_markdown(index=False)
        progress(1.0, desc="Done!")
        return out_file.name, f"Predictions ready! ({len(submission)} rows)\n\n{preview}\n\nDownload the full file below."

    except FileNotFoundError as e:
        return None, str(e)
    except Exception as e:
        return None, f"Prediction failed: {str(e)}"
113
+
114
+
115
def predict_single(*args):
    """Estimate the price of one house from the 8 Single-House inputs.

    `args` arrive in QUICK_FIELDS order. All remaining model features are
    filled with the raw training medians/modes stored in the metadata.
    Returns a markdown string (estimate or error message).
    """
    try:
        ensemble, preprocessor, meta = _load_model()
        raw_numeric_medians = meta.get("raw_numeric_medians", {})
        raw_cat_modes = meta.get("raw_cat_modes", {})

        # Build full raw row: training medians/modes as base, override with user input
        raw_row = {**raw_numeric_medians, **raw_cat_modes}
        for k, v in zip(list(QUICK_FIELDS.keys()), args):
            # A cleared gr.Number field yields None; the original float(v)
            # raised TypeError there. Skip it so the training-median default
            # from raw_numeric_medians is used instead.
            if v is not None:
                raw_row[k] = float(v)

        X = _prepare(pd.DataFrame([raw_row]), meta)
        pred_log = ensemble.predict(preprocessor.transform(X))[0]
        # Training target was log1p(SalePrice); invert for display.
        pred = np.expm1(pred_log)

        return f"### Estimated Sale Price: ${pred:,.0f}"

    except FileNotFoundError as e:
        return str(e)
    except Exception as e:
        return f"Error: {str(e)}"
preprocessing.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ preprocessing.py — Feature engineering and data preprocessing pipeline.
3
+ """
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from sklearn.preprocessing import RobustScaler, OneHotEncoder
8
+ from sklearn.impute import SimpleImputer
9
+ from sklearn.compose import ColumnTransformer
10
+ from sklearn.pipeline import Pipeline
11
+
12
+ from config import COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
13
+
14
+
15
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of *df* with engineered columns added.

    Adds TotalSF, TotalBath, HasPool, HasGarage, HasBsmt and IsRemodeled.
    Any missing source column falls back to a constant placeholder series
    (0, or NaN for the presence flags), so partial frames are accepted.
    The input frame is not mutated.
    """
    out = df.copy()

    def _col(name, default=0):
        # Column if present, otherwise a constant series aligned to the index.
        return out.get(name, pd.Series(default, index=out.index))

    def _zeroed(name):
        # Numeric column with NaNs counted as 0 in the additive features.
        return _col(name).fillna(0)

    out["TotalSF"] = _zeroed("TotalBsmtSF") + _zeroed("1stFlrSF") + _zeroed("2ndFlrSF")
    out["TotalBath"] = (
        _zeroed("FullBath")
        + 0.5 * _zeroed("HalfBath")
        + _zeroed("BsmtFullBath")
        + 0.5 * _zeroed("BsmtHalfBath")
    )
    out["HasPool"] = (_col("PoolArea") > 0).astype(int)
    # Presence flags: 1 when the source value is non-NaN (NaN default → 0).
    out["HasGarage"] = _col("GarageArea", np.nan).notnull().astype(int)
    out["HasBsmt"] = _col("TotalBsmtSF", np.nan).notnull().astype(int)
    out["IsRemodeled"] = (_col("YearRemodAdd") != _col("YearBuilt")).astype(int)
    return out
36
+
37
+
38
def preprocess_combined(df: pd.DataFrame) -> pd.DataFrame:
    """
    Full preprocessing pipeline for a combined frame.

    Steps: drop base + high-NaN columns, engineer features, coerce
    numeric-looking object columns, log1p-transform skewed numerics,
    fill remaining missing values.

    NOTE(review): here skewness is computed BEFORE missing values are
    filled, while train._full_preprocess fills first — confirm which
    ordering is the intended canonical one.
    """
    # Step 1 — Drop unwanted / high-NaN columns (>50% missing)
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    missing_pct = df.isnull().sum() * 100 / len(df)
    to_drop += missing_pct[missing_pct > 50].index.tolist()
    df = df.drop(columns=list(set(to_drop)), errors="ignore")

    # Step 2 — Feature engineering
    df = engineer_features(df)

    # Step 3 — Fix dtypes: numeric columns must be float, not object.
    # Snapshot the object columns once; the original recomputed
    # select_dtypes inside the loop for every column (accidental O(n^2)).
    object_cols = df.select_dtypes(include=["object"]).columns
    for col in object_cols:
        # Try to convert to numeric; if it fails for most rows keep as string
        converted = pd.to_numeric(df[col], errors="coerce")
        if converted.notna().sum() > 0.5 * len(df):
            df[col] = converted

    # Step 4 — Log-transform skewed numerics (clip guards log1p from negatives)
    num_cols = df.select_dtypes(include=[np.number]).columns
    skewed = df[num_cols].apply(lambda x: x.dropna().skew())
    for feat in skewed[abs(skewed) > SKEW_THRESHOLD].index:
        df[feat] = np.log1p(df[feat].clip(lower=0))

    # Step 5 — Fill missing values: "Missing" category / column median
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median()).astype(float)

    return df
69
+
70
+
71
def build_preprocessor(numerical_features: list, categorical_features: list) -> ColumnTransformer:
    """Assemble the column preprocessor.

    Numerics: median imputation followed by robust scaling.
    Categoricals: dense one-hot encoding, ignoring unseen categories.
    """
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ])
    categorical_pipeline = Pipeline([
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    transformers = [
        ("num", numeric_pipeline, numerical_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
    return ColumnTransformer(transformers)
preprocessing.r ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # preprocessing.R
3
+ # Feature engineering and data preprocessing pipeline
4
+ # R equivalent of preprocessing.py
5
+ # =============================================================================
6
+
7
+ library(dplyr)
8
+ library(tidyr)
9
+
10
+ # ── Constants ─────────────────────────────────────────────────────────────────
11
+
12
+ COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")
13
+ SKEW_THRESHOLD <- 0.75
14
+
15
+ # ── Feature Engineering ───────────────────────────────────────────────────────
16
+
17
# Add engineered columns to `df`, mirroring preprocessing.py::engineer_features.
# NA source values count as 0 in the sums; the Has* flags key off whether the
# source value is present (non-NA). Assumes all referenced columns exist in
# `df` (unlike the Python version, which substitutes defaults) — TODO confirm.
engineer_features <- function(df) {
  df <- df %>%
    mutate(
      # Total square footage: basement + 1st floor + 2nd floor
      TotalSF = replace_na(TotalBsmtSF, 0) +
                replace_na(`1stFlrSF`, 0) +
                replace_na(`2ndFlrSF`, 0),

      # Total bathrooms (half baths count as 0.5)
      TotalBath = replace_na(FullBath, 0) +
                  0.5 * replace_na(HalfBath, 0) +
                  replace_na(BsmtFullBath, 0) +
                  0.5 * replace_na(BsmtHalfBath, 0),

      # Binary flags
      HasPool = as.integer(!is.na(PoolArea) & PoolArea > 0),
      HasGarage = as.integer(!is.na(GarageArea)),
      HasBsmt = as.integer(!is.na(TotalBsmtSF)),
      # Remodelled when the remodel year differs from the build year
      IsRemodeled = as.integer(
        replace_na(YearRemodAdd, 0) != replace_na(YearBuilt, 0)
      )
    )
  return(df)
}
41
+
42
+ # ── Drop High-Missing Columns ─────────────────────────────────────────────────
43
+
44
# Drop columns with more than `threshold` percent missing values, plus the
# always-dropped base columns; only columns actually present are removed.
drop_high_missing <- function(df, threshold = 50) {
  na_pct <- colMeans(is.na(df)) * 100
  drop_set <- names(na_pct[na_pct > threshold])
  drop_set <- union(drop_set, COLUMNS_TO_DROP_BASE)
  drop_set <- intersect(drop_set, names(df))
  keep <- setdiff(names(df), drop_set)
  return(df[, keep, drop = FALSE])
}
52
+
53
+ # ── Skewness Correction ───────────────────────────────────────────────────────
54
+
55
# Return the names of numeric columns whose |skewness| exceeds `threshold`.
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  # Get numeric columns only
  num_cols <- df %>% select(where(is.numeric)) %>% names()

  # Calculate skewness for each numeric column
  skew_values <- sapply(num_cols, function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    if (length(x) < 3) return(0)   # too few points for a meaningful estimate
    n <- length(x)
    m <- mean(x)
    s <- sd(x)
    if (s == 0) return(0)          # constant column: treat as not skewed
    # Moment coefficient of skewness (g1).
    # NOTE(review): pandas Series.skew() applies a bias correction, so this
    # can disagree with preprocessing.py near the threshold — confirm.
    (sum((x - m)^3) / n) / (s^3)
  })

  skewed_cols <- names(skew_values[abs(skew_values) > threshold])
  return(skewed_cols)
}
74
+
75
# Apply log1p to every listed column that exists in `df`; negative values
# are clamped to 0 first so log1p stays defined.
apply_log_transform <- function(df, skewed_cols) {
  present <- intersect(skewed_cols, names(df))
  df[present] <- lapply(df[present], function(x) log1p(pmax(x, 0)))
  return(df)
}
83
+
84
+ # ── Fill Missing Values ───────────────────────────────────────────────────────
85
+
86
# Fill missing values in place: character columns get the literal string
# "Missing"; numeric columns get their own median. Other column types
# (factors, logicals) are left untouched, as in the original.
fill_missing <- function(df) {
  for (nm in names(df)) {
    col <- df[[nm]]
    if (is.character(col)) {
      col[is.na(col)] <- "Missing"
    } else if (is.numeric(col)) {
      col[is.na(col)] <- median(col, na.rm = TRUE)
    }
    df[[nm]] <- col
  }
  return(df)
}
102
+
103
+ # ── Full Preprocessing Pipeline ───────────────────────────────────────────────
104
+ # This is the R equivalent of preprocess_combined() in preprocessing.py
105
+
106
# Full preprocessing pipeline — R mirror of preprocessing.py::preprocess_combined.
# Returns list(data = processed frame, skewed_cols = columns that were
# log1p-transformed) so callers can persist and replay the exact transform.
# NOTE(review): like the Python version, skewness is measured before missing
# values are filled — confirm this matches the training-side ordering.
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  df <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  df <- engineer_features(df)

  cat("Step 3: Identifying skewed columns...\n")
  skewed_cols <- get_skewed_cols(df)
  cat(sprintf("  Found %d skewed columns: %s\n",
              length(skewed_cols),
              paste(head(skewed_cols, 5), collapse = ", ")))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  df <- apply_log_transform(df, skewed_cols)

  cat("Step 5: Filling missing values...\n")
  df <- fill_missing(df)

  cat("Done. Final shape:", nrow(df), "rows x", ncol(df), "cols\n")
  return(list(data = df, skewed_cols = skewed_cols))
}
128
+
129
+ # ── Save Raw Medians and Modes (for Single House prediction defaults) ──────────
130
+
131
# Compute raw per-column statistics used as Single-House prediction defaults:
# medians for numeric columns, the most frequent value for character columns.
get_raw_stats <- function(df) {
  # Numeric medians (NAs excluded)
  num_cols <- df %>% select(where(is.numeric)) %>% names()
  numeric_medians <- sapply(num_cols, function(col) median(df[[col]], na.rm = TRUE))

  # Categorical modes (ties resolved by which.max: first maximum wins)
  cat_cols <- df %>% select(where(is.character)) %>% names()
  cat_modes <- sapply(cat_cols, function(col) {
    tbl <- table(df[[col]])
    names(which.max(tbl))
  })

  return(list(numeric_medians = numeric_medians, cat_modes = cat_modes))
}
145
+
146
+ # ── Main: Run as standalone script ────────────────────────────────────────────
147
+
148
# Standalone CLI entry point: Rscript preprocessing.R <input.csv> <output.csv>.
# Reads a raw CSV, runs the full pipeline, writes the processed CSV plus a
# sibling *_skewed_cols.txt listing the log1p-transformed columns.
if (!interactive()) {
  args <- commandArgs(trailingOnly = TRUE)

  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }

  input_path <- args[1]
  output_path <- args[2]

  cat(sprintf("Reading data from: %s\n", input_path))
  df <- read.csv(input_path, stringsAsFactors = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data)
  if ("SalePrice" %in% names(df)) {
    target <- df$SalePrice
    df <- df %>% select(-SalePrice)
    # NOTE(review): `target` is held in memory but never written out —
    # confirm whether the target should also be saved alongside the features.
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Save skewed columns list next to the output CSV
  skewed_path <- sub("\\.csv$", "_skewed_cols.txt", output_path)
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}
requirements (1).txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
pandas
numpy
scikit-learn
xgboost
joblib
tabulate
# app.py imports gradio; HF Gradio Spaces provide it, but it is needed for local runs
gradio
train.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ train.py — Model training logic.
3
+ """
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+ import numpy as np
8
+ import joblib
9
+
10
+ from sklearn.model_selection import GridSearchCV
11
+ from sklearn.linear_model import Lasso
12
+ from sklearn.ensemble import RandomForestRegressor, StackingRegressor
13
+ from xgboost import XGBRegressor
14
+
15
+ from config import MODEL_PATH, PREPROCESSOR_PATH, META_PATH, COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
16
+ from preprocessing import engineer_features, build_preprocessor
17
+
18
+
19
def _full_preprocess(df: pd.DataFrame):
    """
    Full preprocessing. Returns (X, skewed_cols, selected_features, num_feats, cat_feats).
    Separated so train can save skewed_cols and predict can reuse them.

    NOTE: the step order here (fill missing, THEN measure skew, THEN log1p)
    must stay in sync with predict._prepare, which replays the saved
    skewed_cols on the same fill-then-log sequence.
    """
    # Drop unwanted columns: the base list plus anything >50% missing,
    # plus the train-only "source" marker added by train_model.
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    missing_pct = df.isnull().sum() * 100 / len(df)
    to_drop += missing_pct[missing_pct > 50].index.tolist()
    df = df.drop(columns=list(set(to_drop)), errors="ignore")
    df = df.drop(columns=["source"], errors="ignore")

    # Engineer features
    df = engineer_features(df)

    # Fill missing before skew calculation
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median()).astype(float)

    # Identify skewed numeric columns AFTER engineering and filling
    num_cols = df.select_dtypes(include=[np.number]).columns
    skewed_series = df[num_cols].apply(lambda x: x.skew())
    skewed_cols = skewed_series[abs(skewed_series) > SKEW_THRESHOLD].index.tolist()

    # Apply log transform (clip guards log1p against negative values)
    for col in skewed_cols:
        df[col] = np.log1p(df[col].clip(lower=0))

    # Record the final schema so prediction can reproduce it exactly.
    selected_features = df.columns.tolist()
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = df.select_dtypes(include=["object"]).columns.tolist()

    return df, skewed_cols, selected_features, numerical_features, categorical_features
54
+
55
+
56
def train_model(train_file, progress=gr.Progress()):
    """
    Train the stacking ensemble from an uploaded Kaggle train.csv.

    Tunes Lasso, Random Forest and XGBoost with 3-fold GridSearchCV on the
    log1p-transformed target, stacks them with a Lasso meta-learner, and
    persists the model, the fitted preprocessor and the metadata needed by
    predict.py. Returns a markdown status/results string for the UI.
    """
    try:
        if train_file is None:
            return "Please upload a train.csv file first."

        # Gradio may pass a file object or a plain path string.
        train_path = train_file.name if hasattr(train_file, "name") else train_file

        progress(0.05, desc="Loading data...")
        train_df = pd.read_csv(train_path)

        if "SalePrice" not in train_df.columns:
            return "Error: SalePrice column not found. Make sure you upload the Kaggle train.csv."

        target = train_df["SalePrice"].copy()
        train_df = train_df.drop(columns=["SalePrice", "Id"], errors="ignore")

        # Save RAW medians/modes before any transformation — predict_single
        # uses these as defaults for the fields the user does not supply.
        raw_numeric_medians = train_df.select_dtypes(include=[np.number]).median().to_dict()
        raw_cat_modes = {
            col: train_df[col].mode()[0] if not train_df[col].mode().empty else "Missing"
            for col in train_df.select_dtypes(include=["object"]).columns
        }

        # Marker column; _full_preprocess drops it again.
        train_df["source"] = "train"

        progress(0.12, desc="Preprocessing features...")
        X, skewed_cols, selected_features, numerical_features, categorical_features = _full_preprocess(train_df)

        preprocessor = build_preprocessor(numerical_features, categorical_features)
        X_processed = preprocessor.fit_transform(X)
        # Train on log1p(SalePrice); predictions are inverted with expm1.
        target_log = np.log1p(target)

        # Lasso
        progress(0.22, desc="Tuning Lasso... (1/3)")
        lasso_cv = GridSearchCV(
            Lasso(random_state=42),
            {"alpha": [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1], "max_iter": [10000]},
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        lasso_cv.fit(X_processed, target_log)

        # Random Forest
        progress(0.42, desc="Tuning Random Forest... (2/3)")
        rf_cv = GridSearchCV(
            RandomForestRegressor(random_state=42, n_jobs=-1),
            {
                "n_estimators": [100],
                "max_depth": [10, 20],
                "min_samples_split": [2, 5],
                "min_samples_leaf": [1, 2],
            },
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        rf_cv.fit(X_processed, target_log)

        # XGBoost — n_jobs=1 inside the estimator because GridSearchCV
        # already parallelises across folds/candidates with n_jobs=-1.
        progress(0.65, desc="Tuning XGBoost... (3/3)")
        xgb_cv = GridSearchCV(
            XGBRegressor(
                objective="reg:squarederror", random_state=42,
                tree_method="hist", n_jobs=1, verbosity=0,
            ),
            {
                "n_estimators": [200],
                "max_depth": [5, 7],
                "learning_rate": [0.05, 0.1],
                "subsample": [0.8, 1.0],
            },
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        xgb_cv.fit(X_processed, target_log)

        # Stacking: refit each base model with its tuned params; a small
        # Lasso combines their out-of-fold predictions.
        progress(0.83, desc="Fitting Stacking Ensemble...")
        ensemble = StackingRegressor(
            estimators=[
                ("lasso", Lasso(**lasso_cv.best_params_, random_state=42)),
                ("rf", RandomForestRegressor(**rf_cv.best_params_, random_state=42, n_jobs=-1)),
                ("xgb", XGBRegressor(
                    **xgb_cv.best_params_, random_state=42,
                    tree_method="hist", n_jobs=1, verbosity=0,
                )),
            ],
            final_estimator=Lasso(alpha=0.001, random_state=42, max_iter=10000),
            cv=3, n_jobs=-1,
        )
        ensemble.fit(X_processed, target_log)

        # Persist everything predict.py needs to replay the exact pipeline.
        progress(0.96, desc="Saving model...")
        joblib.dump(ensemble, MODEL_PATH)
        joblib.dump(preprocessor, PREPROCESSOR_PATH)
        joblib.dump({
            "selected_features": selected_features,
            "numerical_features": numerical_features,
            "categorical_features": categorical_features,
            "skewed_cols": skewed_cols,
            "raw_numeric_medians": raw_numeric_medians,
            "raw_cat_modes": raw_cat_modes,
        }, META_PATH)

        # best_score_ is negated MSE on the log target; sqrt gives RMSE(log).
        lasso_rmse = np.sqrt(-lasso_cv.best_score_)
        rf_rmse = np.sqrt(-rf_cv.best_score_)
        xgb_rmse = np.sqrt(-xgb_cv.best_score_)

        progress(1.0, desc="Done!")
        return (
            f"Model trained successfully!\n\n"
            f"| Model | CV RMSE (log) | Best Params |\n"
            f"|---|---|---|\n"
            f"| Lasso | {lasso_rmse:.5f} | {lasso_cv.best_params_} |\n"
            f"| Random Forest | {rf_rmse:.5f} | {rf_cv.best_params_} |\n"
            f"| XGBoost | {xgb_rmse:.5f} | {xgb_cv.best_params_} |\n\n"
            f"Switch to Predict (CSV) or Single House to use it!"
        )

    except Exception as e:
        return f"Training failed: {str(e)}\n\nPlease check your CSV file and try again."
+ return f"Training failed: {str(e)}\n\nPlease check your CSV file and try again."