Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- app (1).py +102 -0
- config.py +25 -0
- packages.txt +1 -0
- predict.py +135 -0
- preprocessing.py +82 -0
- preprocessing.r +184 -0
- requirements (1).txt +7 -0
- train.py +172 -0
app (1).py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
app.py — Gradio UI entry point.

Each project import is wrapped in try/except with an eager, flushed print so
that a broken dependency is visible in the Space's startup log instead of
failing silently.
"""

import sys

print("Python started", flush=True)

try:
    import gradio as gr
    print("gradio OK", flush=True)
except Exception as e:
    print(f"FAILED gradio: {e}", flush=True)
    sys.exit(1)

try:
    from config import QUICK_FIELDS
    print("config OK", flush=True)
except Exception as e:
    print(f"FAILED config: {e}", flush=True)
    sys.exit(1)

try:
    from train import train_model
    print("train OK", flush=True)
except Exception as e:
    print(f"FAILED train: {e}", flush=True)
    sys.exit(1)

try:
    from predict import predict, predict_single
    print("predict OK", flush=True)
except Exception as e:
    print(f"FAILED predict: {e}", flush=True)
    sys.exit(1)

print("Building UI...", flush=True)

# NOTE: the diagnostic try/except blocks above already bound gr, QUICK_FIELDS,
# train_model, predict and predict_single at module level — the duplicate
# re-imports that used to sit here were dead code and have been removed.

DESCRIPTION = """
# House Price Predictor
**Stacking Ensemble: Lasso + Random Forest + XGBoost**

*Final Project- Giovanni Battista Del Basso, Francesco Ciccarese, Miguel Domingos, Santiago Genoni*

---
**How to use:**
The model uses the Kaggle competition: "Housing Prices Competition for Kaggle Learn Users".
To use the model, upload the train and test set that you can find in the competition at this link: "https://www.kaggle.com/competitions/home-data-for-ml-course/overview"
1. **Train Model** - Upload `train.csv` and click *Train Model*
2. **Predict (CSV)** - Upload `test.csv` to download a full Kaggle submission file
3. **Single House** - Fill in the key features to get a single price estimate
"""

with gr.Blocks(title="House Price Predictor", theme=gr.themes.Soft()) as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Tabs():

        # ── Tab 1: Train ──────────────────────────────────────────────────────
        with gr.Tab("Train Model"):
            gr.Markdown(
                "Upload your **train.csv** (Kaggle House Prices format) and train the stacking ensemble.\n\n"
                ">Please wait after clicking!"
            )
            train_file = gr.File(label="Upload train.csv", file_types=[".csv"])
            train_btn = gr.Button("Train Model", variant="primary")
            train_out = gr.Markdown(value="Results will appear here after training.")
            train_btn.click(fn=train_model, inputs=[train_file], outputs=[train_out])

        # ── Tab 2: Predict CSV ────────────────────────────────────────────────
        with gr.Tab("Predict (CSV)"):
            gr.Markdown("Upload **test.csv** to generate a submission.csv ready for Kaggle.")
            test_file = gr.File(label="Upload test.csv", file_types=[".csv"])
            pred_btn = gr.Button("Generate Predictions", variant="primary")
            pred_msg = gr.Markdown(value="Predictions will appear here.")
            dl_file = gr.File(label="Download submission.csv")
            pred_btn.click(fn=predict, inputs=[test_file], outputs=[dl_file, pred_msg])

        # ── Tab 3: Single House ───────────────────────────────────────────────
        with gr.Tab("Single House"):
            gr.Markdown(
                "Fill in the **8 key features** below to get a price estimate.\n\n"
                "> All other house features are automatically filled with the **median values from the training data**."
            )
            # One gr.Number per QUICK_FIELDS entry, in declaration order —
            # predict_single relies on this ordering when zipping its *args.
            inputs = []
            with gr.Row():
                for key, (label, default) in QUICK_FIELDS.items():
                    inputs.append(gr.Number(label=label, value=default))

            single_btn = gr.Button("Estimate Price", variant="primary")
            single_out = gr.Markdown(value="Your estimate will appear here.")
            single_btn.click(fn=predict_single, inputs=inputs, outputs=[single_out])

    gr.Markdown("---\nBuilt with Scikit-learn, XGBoost and Gradio.")

if __name__ == "__main__":
    demo.launch(ssr_mode=False)
|
config.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
config.py — Shared constants used across all modules.
"""

# Filenames under which train.py persists its artefacts and from which
# predict.py reloads them.
MODEL_PATH = "stacking_model.joblib"
PREPROCESSOR_PATH = "preprocessor.joblib"
META_PATH = "model_meta.joblib"

# Columns removed up front, before any feature engineering.
COLUMNS_TO_DROP_BASE = ["MoSold", "YrSold", "Id"]
# Absolute skewness above which a numeric column is log1p-transformed.
SKEW_THRESHOLD = 0.75

# Inputs exposed on the "Single House" tab, keyed by raw dataset column name.
# Each value is a (UI label, default placeholder) pair; the UI builds one
# gr.Number per entry in this declaration order.
QUICK_FIELDS = {
    "GrLivArea": ("Above-Grade Living Area (sqft)", 1500),
    "OverallQual": ("Overall Quality (1-10)", 7),
    "YearBuilt": ("Year Built", 2000),
    "TotalBsmtSF": ("Total Basement SF", 800),
    "GarageArea": ("Garage Area (sqft)", 400),
    "FullBath": ("Full Bathrooms", 2),
    "BedroomAbvGr": ("Bedrooms Above Grade", 3),
    "LotArea": ("Lot Area (sqft)", 8000),
}
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
r-base
|
predict.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
predict.py — Prediction logic.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
import joblib
|
| 9 |
+
import os
|
| 10 |
+
import tempfile
|
| 11 |
+
|
| 12 |
+
from config import MODEL_PATH, PREPROCESSOR_PATH, META_PATH, QUICK_FIELDS, COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
|
| 13 |
+
from preprocessing import engineer_features
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _load_model():
    """Load the persisted ensemble, preprocessor and metadata from disk.

    Returns:
        (ensemble, preprocessor, meta) in that order.

    Raises:
        FileNotFoundError: if any of the three artefact files is missing.
    """
    artefact_paths = (MODEL_PATH, PREPROCESSOR_PATH, META_PATH)
    if any(not os.path.exists(path) for path in artefact_paths):
        raise FileNotFoundError("No trained model found. Please go to the Train Model tab first.")
    return tuple(joblib.load(path) for path in artefact_paths)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _prepare(df: pd.DataFrame, meta: dict) -> pd.DataFrame:
    """
    Re-apply the training-time preprocessing to a raw input frame.

    Applies the exact same steps as training:
      1. Drop columns
      2. Engineer features
      3. Fill missing
      4. Log-transform the exact skewed_cols saved during training
      5. Align to selected features

    Parameters
    ----------
    df : raw frame (Kaggle test.csv layout, or one synthetic row from
        predict_single).
    meta : dict saved by train.py; must contain "selected_features",
        "numerical_features", "categorical_features"; may contain
        "skewed_cols".

    Returns
    -------
    pd.DataFrame with exactly meta["selected_features"] columns, ready for
    the fitted preprocessor.
    """
    selected_features = meta["selected_features"]
    numerical_features = meta["numerical_features"]
    categorical_features = meta["categorical_features"]
    # Use saved skewed_cols if available, otherwise fall back to recalculating
    skewed_cols = meta.get("skewed_cols", None)

    # Step 1: drop — the fixed base columns plus helpers that must never
    # reach the model (the "source" marker and the raw "Id").
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    df = df.drop(columns=list(set(to_drop + ["source", "Id"])), errors="ignore")

    # Step 2: engineer features
    df = engineer_features(df)

    # Step 3: fill missing.
    # NOTE(review): numeric gaps are filled with the *batch* median (or 0 for
    # a single row), not the training median — for small uploads this can
    # drift from training-time imputation; confirm this is intended.
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        fill_val = df[col].median() if len(df) > 1 else 0
        df[col] = df[col].fillna(fill_val).astype(float)

    # Step 4: log-transform (clip(lower=0) keeps log1p away from values < -1)
    if skewed_cols is not None:
        # Use exact columns from training
        for col in skewed_cols:
            if col in df.columns:
                df[col] = np.log1p(df[col].clip(lower=0))
    else:
        # Fallback: recalculate skewness (only works reliably on large datasets)
        num_cols = df.select_dtypes(include=[np.number]).columns
        skewed = df[num_cols].apply(lambda x: x.dropna().skew())
        for col in skewed[abs(skewed) > SKEW_THRESHOLD].index:
            df[col] = np.log1p(df[col].clip(lower=0))

    # Step 5: align to training columns — create any column the model expects
    # but the input lacks, with a neutral filler per column type.
    for col in selected_features:
        if col not in df.columns:
            df[col] = "Missing" if col in categorical_features else 0.0

    X = df[selected_features].copy()
    # Final dtype coercion so the ColumnTransformer sees the same dtypes as
    # it did during fit.
    for col in numerical_features:
        if col in X.columns:
            X[col] = pd.to_numeric(X[col], errors="coerce").fillna(0).astype(float)
    for col in categorical_features:
        if col in X.columns:
            X[col] = X[col].fillna("Missing").astype(str)

    return X
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def predict(test_file, progress=gr.Progress()):
    """
    Generate a Kaggle submission CSV from an uploaded test.csv.

    Parameters
    ----------
    test_file : gradio File value (object with .name) or a plain path string.
    progress : gradio progress tracker (injected by the UI).

    Returns
    -------
    (path_or_None, message) — path to the written submission file (None on
    failure) and a markdown status message for the UI.
    """
    try:
        if test_file is None:
            return None, "Please upload a test.csv file."

        progress(0.10, desc="Loading model...")
        ensemble, preprocessor, meta = _load_model()

        progress(0.30, desc="Loading test data...")
        test_path = test_file.name if hasattr(test_file, "name") else test_file
        test_df = pd.read_csv(test_path)
        # Keep the Id column for the submission; synthesize one if absent.
        test_id = test_df["Id"] if "Id" in test_df.columns else pd.RangeIndex(len(test_df))

        progress(0.55, desc="Preprocessing...")
        X_test = _prepare(test_df, meta)

        progress(0.75, desc="Predicting...")
        # The model was trained on log1p(SalePrice); invert with expm1.
        pred = np.expm1(ensemble.predict(preprocessor.transform(X_test)))

        submission = pd.DataFrame({"Id": test_id, "SalePrice": pred})
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", prefix="submission_")
        # Fix: close our handle before pandas writes — the handle was never
        # closed, leaking a descriptor per call and breaking re-opening the
        # same path on Windows.
        tmp.close()
        submission.to_csv(tmp.name, index=False)

        preview = submission.head(10).to_markdown(index=False)
        progress(1.0, desc="Done!")
        return tmp.name, f"Predictions ready! ({len(submission)} rows)\n\n{preview}\n\nDownload the full file below."

    except FileNotFoundError as e:
        # Raised by _load_model when no trained artefacts exist yet.
        return None, str(e)
    except Exception as e:
        return None, f"Prediction failed: {str(e)}"
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def predict_single(*args):
    """
    Estimate the price of one house from the 8 quick-input fields.

    Parameters
    ----------
    *args : values in the same order as config.QUICK_FIELDS. A None entry
        (a gr.Number the user cleared) keeps the training-set default for
        that feature instead of crashing.

    Returns
    -------
    Markdown string with the formatted estimate, or an error message.
    """
    try:
        ensemble, preprocessor, meta = _load_model()
        raw_numeric_medians = meta.get("raw_numeric_medians", {})
        raw_cat_modes = meta.get("raw_cat_modes", {})

        # Build full raw row: training medians/modes as base, override with user input
        raw_row = {**raw_numeric_medians, **raw_cat_modes}
        for k, v in zip(list(QUICK_FIELDS.keys()), args):
            # Robustness fix: a cleared gr.Number arrives as None — fall back
            # to the training default rather than failing on float(None).
            if v is not None:
                raw_row[k] = float(v)

        X = _prepare(pd.DataFrame([raw_row]), meta)
        pred_log = ensemble.predict(preprocessor.transform(X))[0]
        pred = np.expm1(pred_log)  # undo the log1p target transform

        return f"### Estimated Sale Price: ${pred:,.0f}"

    except FileNotFoundError as e:
        return str(e)
    except Exception as e:
        return f"Error: {str(e)}"
|
preprocessing.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
preprocessing.py — Feature engineering and data preprocessing pipeline.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
from sklearn.preprocessing import RobustScaler, OneHotEncoder
|
| 8 |
+
from sklearn.impute import SimpleImputer
|
| 9 |
+
from sklearn.compose import ColumnTransformer
|
| 10 |
+
from sklearn.pipeline import Pipeline
|
| 11 |
+
|
| 12 |
+
from config import COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add derived house-price features to a copy of *df*.

    New columns: TotalSF, TotalBath, HasPool, HasGarage, HasBsmt,
    IsRemodeled. Source columns that are absent are treated as all-zero
    (or all-NaN for the presence flags), so partial frames are accepted.
    The input frame is not modified.
    """
    out = df.copy()

    def _col(name: str, default) -> pd.Series:
        # Return the column if present, else a constant series on out's index.
        return out[name] if name in out.columns else pd.Series(default, index=out.index)

    # Total square footage: basement + 1st floor + 2nd floor.
    out["TotalSF"] = (
        _col("TotalBsmtSF", 0).fillna(0)
        + _col("1stFlrSF", 0).fillna(0)
        + _col("2ndFlrSF", 0).fillna(0)
    )
    # Total bathrooms; half baths weighted 0.5.
    out["TotalBath"] = (
        _col("FullBath", 0).fillna(0)
        + 0.5 * _col("HalfBath", 0).fillna(0)
        + _col("BsmtFullBath", 0).fillna(0)
        + 0.5 * _col("BsmtHalfBath", 0).fillna(0)
    )
    # Binary presence flags.
    out["HasPool"] = (_col("PoolArea", 0) > 0).astype(int)
    out["HasGarage"] = _col("GarageArea", np.nan).notnull().astype(int)
    out["HasBsmt"] = _col("TotalBsmtSF", np.nan).notnull().astype(int)
    # Remodeled iff the remodel year differs from the build year.
    out["IsRemodeled"] = (
        _col("YearRemodAdd", 0) != _col("YearBuilt", 0)
    ).astype(int)
    return out
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def preprocess_combined(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full offline preprocessing pipeline on a raw frame.

    Steps: drop base/high-NaN columns, engineer features, coerce
    mostly-numeric object columns, log1p-transform skewed numerics, then
    impute remaining gaps. Returns the processed frame.
    """
    # Step 1 — Drop unwanted / high-NaN columns (anything >50% missing)
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    missing_pct = df.isnull().sum() * 100 / len(df)
    to_drop += missing_pct[missing_pct > 50].index.tolist()
    df = df.drop(columns=list(set(to_drop)), errors="ignore")

    # Step 2 — Feature engineering
    df = engineer_features(df)

    # Step 3 — Fix dtypes: numeric columns must be float, not object.
    # Perf fix: the object-column set is computed once instead of calling
    # select_dtypes inside the loop for every column (accidental O(n^2)).
    # Converting one column never changes another's dtype, so behavior is
    # unchanged.
    object_cols = df.select_dtypes(include=["object"]).columns
    for col in object_cols:
        # Try to convert to numeric; keep as string when mostly non-numeric
        converted = pd.to_numeric(df[col], errors="coerce")
        if converted.notna().sum() > 0.5 * len(df):
            df[col] = converted

    # Step 4 — Log-transform skewed numerics (clip avoids log1p of < -1)
    num_cols = df.select_dtypes(include=[np.number]).columns
    skewed = df[num_cols].apply(lambda x: x.dropna().skew())
    for feat in skewed[abs(skewed) > SKEW_THRESHOLD].index:
        df[feat] = np.log1p(df[feat].clip(lower=0))

    # Step 5 — Fill missing values: "Missing" level for strings, median for
    # numerics.
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median()).astype(float)

    return df
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def build_preprocessor(numerical_features: list, categorical_features: list) -> ColumnTransformer:
    """Assemble the column-wise preprocessing transformer.

    Numeric columns: median imputation then robust scaling.
    Categorical columns: dense one-hot encoding that ignores unseen levels
    at transform time.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ]
    categorical_steps = [
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
    transformers = [
        ("num", Pipeline(numeric_steps), numerical_features),
        ("cat", Pipeline(categorical_steps), categorical_features),
    ]
    return ColumnTransformer(transformers)
|
preprocessing.r
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =============================================================================
# preprocessing.R
# Feature engineering and data preprocessing pipeline
# R equivalent of preprocessing.py
# =============================================================================

library(dplyr)
library(tidyr)

# ── Constants ─────────────────────────────────────────────────────────────────

# Columns always dropped up front (mirrors config.py COLUMNS_TO_DROP_BASE).
COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")
# Absolute skewness above which a numeric column is log1p-transformed.
SKEW_THRESHOLD <- 0.75

# ── Feature Engineering ───────────────────────────────────────────────────────

# Add derived columns (TotalSF, TotalBath, binary flags).
# NOTE(review): unlike the Python version, there is no fallback when a source
# column is absent — this assumes the full Kaggle column set is present.
engineer_features <- function(df) {
  df <- df %>%
    mutate(
      # Total square footage: basement + 1st floor + 2nd floor
      TotalSF = replace_na(TotalBsmtSF, 0) +
                replace_na(`1stFlrSF`, 0) +
                replace_na(`2ndFlrSF`, 0),

      # Total bathrooms (half baths count as 0.5)
      TotalBath = replace_na(FullBath, 0) +
                  0.5 * replace_na(HalfBath, 0) +
                  replace_na(BsmtFullBath, 0) +
                  0.5 * replace_na(BsmtHalfBath, 0),

      # Binary flags
      HasPool = as.integer(!is.na(PoolArea) & PoolArea > 0),
      HasGarage = as.integer(!is.na(GarageArea)),
      HasBsmt = as.integer(!is.na(TotalBsmtSF)),
      # Remodeled iff the remodel year differs from the build year
      IsRemodeled = as.integer(
        replace_na(YearRemodAdd, 0) != replace_na(YearBuilt, 0)
      )
    )
  return(df)
}
|
| 41 |
+
|
| 42 |
+
# ── Drop High-Missing Columns ─────────────────────────────────────────────────

# Drop the fixed base columns plus any column whose missing share exceeds
# `threshold` percent. Only columns actually present in `df` are touched.
drop_high_missing <- function(df, threshold = 50) {
  missing_pct <- colMeans(is.na(df)) * 100
  cols_to_drop <- names(missing_pct[missing_pct > threshold])
  cols_to_drop <- union(cols_to_drop, COLUMNS_TO_DROP_BASE)
  cols_to_drop <- intersect(cols_to_drop, names(df))
  df <- df %>% select(-all_of(cols_to_drop))
  return(df)
}

# ── Skewness Correction ───────────────────────────────────────────────────────

# Return the names of numeric columns whose |skewness| exceeds `threshold`.
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  # Get numeric columns only
  num_cols <- df %>% select(where(is.numeric)) %>% names()

  # Calculate skewness for each numeric column
  skew_values <- sapply(num_cols, function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    if (length(x) < 3) return(0)   # too few points for a meaningful estimate
    n <- length(x)
    m <- mean(x)
    s <- sd(x)
    if (s == 0) return(0)          # constant column: define skewness as 0
    # Moment-based (Pearson) skewness.
    # NOTE(review): sd() uses the n-1 denominator, so values differ slightly
    # from pandas' adjusted .skew() used in preprocessing.py — confirm
    # whether exact parity matters.
    (sum((x - m)^3) / n) / (s^3)
  })

  skewed_cols <- names(skew_values[abs(skew_values) > threshold])
  return(skewed_cols)
}

# Apply log1p to each listed column that exists in `df`; pmax clips negative
# values to 0 so log1p never sees inputs below -1.
apply_log_transform <- function(df, skewed_cols) {
  for (col in skewed_cols) {
    if (col %in% names(df)) {
      df[[col]] <- log1p(pmax(df[[col]], 0))
    }
  }
  return(df)
}
|
| 83 |
+
|
| 84 |
+
# ── Fill Missing Values ───────────────────────────────────────────────────────

# Impute NAs in place: character columns get the literal "Missing" level,
# numeric columns get their own median.
fill_missing <- function(df) {
  # Categorical: fill with "Missing"
  cat_cols <- df %>% select(where(is.character)) %>% names()
  for (col in cat_cols) {
    df[[col]][is.na(df[[col]])] <- "Missing"
  }

  # Numeric: fill with median
  num_cols <- df %>% select(where(is.numeric)) %>% names()
  for (col in num_cols) {
    med <- median(df[[col]], na.rm = TRUE)
    df[[col]][is.na(df[[col]])] <- med
  }

  return(df)
}

# ── Full Preprocessing Pipeline ───────────────────────────────────────────────
# This is the R equivalent of preprocess_combined() in preprocessing.py

# Run the whole pipeline. Returns list(data = processed frame,
# skewed_cols = character vector) so callers can re-apply the identical
# log-transform to new data.
# NOTE(review): skewness is computed BEFORE the NA fill here, while
# train.py's _full_preprocess fills first — small numeric differences
# between the two implementations are expected.
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  df <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  df <- engineer_features(df)

  cat("Step 3: Identifying skewed columns...\n")
  skewed_cols <- get_skewed_cols(df)
  cat(sprintf(" Found %d skewed columns: %s\n",
              length(skewed_cols),
              paste(head(skewed_cols, 5), collapse = ", ")))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  df <- apply_log_transform(df, skewed_cols)

  cat("Step 5: Filling missing values...\n")
  df <- fill_missing(df)

  cat("Done. Final shape:", nrow(df), "rows x", ncol(df), "cols\n")
  return(list(data = df, skewed_cols = skewed_cols))
}
|
| 128 |
+
|
| 129 |
+
# ── Save Raw Medians and Modes (for Single House prediction defaults) ──────────

# Compute per-column summary stats on the RAW (untransformed) frame:
# medians for numeric columns, modal values for character columns. These
# serve as default feature values for single-house prediction.
get_raw_stats <- function(df) {
  # Numeric medians
  num_cols <- df %>% select(where(is.numeric)) %>% names()
  numeric_medians <- sapply(num_cols, function(col) median(df[[col]], na.rm = TRUE))

  # Categorical modes
  cat_cols <- df %>% select(where(is.character)) %>% names()
  cat_modes <- sapply(cat_cols, function(col) {
    tbl <- table(df[[col]])
    names(which.max(tbl))   # ties resolve to the first level encountered
  })

  return(list(numeric_medians = numeric_medians, cat_modes = cat_modes))
}
|
| 145 |
+
|
| 146 |
+
# ── Main: Run as standalone script ────────────────────────────────────────────

# CLI entry point: Rscript preprocessing.R <input.csv> <output.csv>
# Writes the processed frame, the skewed-column list, and (for training
# data) the extracted target column.
if (!interactive()) {
  args <- commandArgs(trailingOnly = TRUE)

  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }

  input_path <- args[1]
  output_path <- args[2]

  cat(sprintf("Reading data from: %s\n", input_path))
  df <- read.csv(input_path, stringsAsFactors = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data)
  target <- NULL
  if ("SalePrice" %in% names(df)) {
    target <- df$SalePrice
    df <- df %>% select(-SalePrice)
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Bug fix: `target` used to be extracted but never written anywhere, so
  # the label column was silently lost. Persist it next to the output.
  if (!is.null(target)) {
    target_path <- sub("\\.csv$", "_target.csv", output_path)
    write.csv(data.frame(SalePrice = target), target_path, row.names = FALSE)
    cat(sprintf("Target saved to: %s\n", target_path))
  }

  # Save skewed columns list
  skewed_path <- sub("\\.csv$", "_skewed_cols.txt", output_path)
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}
|
requirements (1).txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
numpy
|
| 3 |
+
scikit-learn
|
| 4 |
+
xgboost
|
| 5 |
+
joblib
|
| 6 |
+
tabulate
|
| 7 |
+
|
train.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
train.py — Model training logic.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
import joblib
|
| 9 |
+
|
| 10 |
+
from sklearn.model_selection import GridSearchCV
|
| 11 |
+
from sklearn.linear_model import Lasso
|
| 12 |
+
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
|
| 13 |
+
from xgboost import XGBRegressor
|
| 14 |
+
|
| 15 |
+
from config import MODEL_PATH, PREPROCESSOR_PATH, META_PATH, COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
|
| 16 |
+
from preprocessing import engineer_features, build_preprocessor
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _full_preprocess(df: pd.DataFrame):
    """
    Full preprocessing. Returns (X, skewed_cols, selected_features, num_feats, cat_feats).
    Separated so train can save skewed_cols and predict can reuse them.
    """
    # Drop unwanted columns: the fixed base list plus anything >50% missing,
    # plus the "source" marker added by train_model.
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    missing_pct = df.isnull().sum() * 100 / len(df)
    to_drop += missing_pct[missing_pct > 50].index.tolist()
    df = df.drop(columns=list(set(to_drop)), errors="ignore")
    df = df.drop(columns=["source"], errors="ignore")

    # Engineer features
    df = engineer_features(df)

    # Fill missing before skew calculation: "Missing" level for strings,
    # per-column median for numerics.
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median()).astype(float)

    # Identify skewed numeric columns AFTER engineering and filling
    num_cols = df.select_dtypes(include=[np.number]).columns
    skewed_series = df[num_cols].apply(lambda x: x.skew())
    skewed_cols = skewed_series[abs(skewed_series) > SKEW_THRESHOLD].index.tolist()

    # Apply log transform (clip(lower=0) keeps log1p away from values < -1)
    for col in skewed_cols:
        df[col] = np.log1p(df[col].clip(lower=0))

    # Every surviving column becomes a model feature; predict.py realigns
    # incoming frames to exactly this list.
    selected_features = df.columns.tolist()
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = df.select_dtypes(include=["object"]).columns.tolist()

    return df, skewed_cols, selected_features, numerical_features, categorical_features
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def train_model(train_file, progress=gr.Progress()):
    """
    Train the stacking ensemble from an uploaded Kaggle train.csv.

    Tunes Lasso, RandomForest and XGBoost with small grids, stacks them
    under a Lasso meta-learner, and persists model, preprocessor and
    metadata to MODEL_PATH / PREPROCESSOR_PATH / META_PATH.

    Parameters
    ----------
    train_file : gradio File value (object with .name) or plain path string.
    progress : gradio progress tracker (injected by the UI).

    Returns
    -------
    Markdown string: a CV-results table on success, an error message on
    failure (all exceptions are caught and reported to the UI).
    """
    try:
        if train_file is None:
            return "Please upload a train.csv file first."

        train_path = train_file.name if hasattr(train_file, "name") else train_file

        progress(0.05, desc="Loading data...")
        train_df = pd.read_csv(train_path)

        if "SalePrice" not in train_df.columns:
            return "Error: SalePrice column not found. Make sure you upload the Kaggle train.csv."

        target = train_df["SalePrice"].copy()
        train_df = train_df.drop(columns=["SalePrice", "Id"], errors="ignore")

        # Save RAW medians/modes before any transformation — used by
        # predict_single as default feature values.
        raw_numeric_medians = train_df.select_dtypes(include=[np.number]).median().to_dict()
        raw_cat_modes = {
            col: train_df[col].mode()[0] if not train_df[col].mode().empty else "Missing"
            for col in train_df.select_dtypes(include=["object"]).columns
        }

        train_df["source"] = "train"

        progress(0.12, desc="Preprocessing features...")
        X, skewed_cols, selected_features, numerical_features, categorical_features = _full_preprocess(train_df)

        preprocessor = build_preprocessor(numerical_features, categorical_features)
        X_processed = preprocessor.fit_transform(X)
        # Train on log1p(SalePrice); predictions are inverted with expm1.
        target_log = np.log1p(target)

        # Lasso
        progress(0.22, desc="Tuning Lasso... (1/3)")
        lasso_cv = GridSearchCV(
            Lasso(random_state=42),
            {"alpha": [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1], "max_iter": [10000]},
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        lasso_cv.fit(X_processed, target_log)

        # Random Forest
        progress(0.42, desc="Tuning Random Forest... (2/3)")
        rf_cv = GridSearchCV(
            RandomForestRegressor(random_state=42, n_jobs=-1),
            {
                "n_estimators": [100],
                "max_depth": [10, 20],
                "min_samples_split": [2, 5],
                "min_samples_leaf": [1, 2],
            },
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        rf_cv.fit(X_processed, target_log)

        # XGBoost — n_jobs=1 on the estimator so parallelism lives in the
        # grid search, not nested inside each fit.
        progress(0.65, desc="Tuning XGBoost... (3/3)")
        xgb_cv = GridSearchCV(
            XGBRegressor(
                objective="reg:squarederror", random_state=42,
                tree_method="hist", n_jobs=1, verbosity=0,
            ),
            {
                "n_estimators": [200],
                "max_depth": [5, 7],
                "learning_rate": [0.05, 0.1],
                "subsample": [0.8, 1.0],
            },
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        xgb_cv.fit(X_processed, target_log)

        # Stacking: re-instantiate each base model with its tuned params;
        # the meta-learner is a lightly regularized Lasso.
        progress(0.83, desc="Fitting Stacking Ensemble...")
        ensemble = StackingRegressor(
            estimators=[
                ("lasso", Lasso(**lasso_cv.best_params_, random_state=42)),
                ("rf", RandomForestRegressor(**rf_cv.best_params_, random_state=42, n_jobs=-1)),
                ("xgb", XGBRegressor(
                    **xgb_cv.best_params_, random_state=42,
                    tree_method="hist", n_jobs=1, verbosity=0,
                )),
            ],
            final_estimator=Lasso(alpha=0.001, random_state=42, max_iter=10000),
            cv=3, n_jobs=-1,
        )
        ensemble.fit(X_processed, target_log)

        progress(0.96, desc="Saving model...")
        joblib.dump(ensemble, MODEL_PATH)
        joblib.dump(preprocessor, PREPROCESSOR_PATH)
        # Metadata lets predict.py reproduce the exact training transforms.
        joblib.dump({
            "selected_features": selected_features,
            "numerical_features": numerical_features,
            "categorical_features": categorical_features,
            "skewed_cols": skewed_cols,
            "raw_numeric_medians": raw_numeric_medians,
            "raw_cat_modes": raw_cat_modes,
        }, META_PATH)

        # GridSearchCV maximizes neg-MSE, so RMSE = sqrt(-best_score_).
        lasso_rmse = np.sqrt(-lasso_cv.best_score_)
        rf_rmse = np.sqrt(-rf_cv.best_score_)
        xgb_rmse = np.sqrt(-xgb_cv.best_score_)

        progress(1.0, desc="Done!")
        return (
            f"Model trained successfully!\n\n"
            f"| Model | CV RMSE (log) | Best Params |\n"
            f"|---|---|---|\n"
            f"| Lasso | {lasso_rmse:.5f} | {lasso_cv.best_params_} |\n"
            f"| Random Forest | {rf_rmse:.5f} | {rf_cv.best_params_} |\n"
            f"| XGBoost | {xgb_rmse:.5f} | {xgb_cv.best_params_} |\n\n"
            f"Switch to Predict (CSV) or Single House to use it!"
        )

    except Exception as e:
        return f"Training failed: {str(e)}\n\nPlease check your CSV file and try again."
|