GBDB02 committed on
Commit
97b9bf6
·
verified ·
1 Parent(s): 9ff9d93

Upload 8 files

Browse files
Files changed (8) hide show
  1. app (1).py +102 -0
  2. config.py +25 -0
  3. packages.txt +1 -0
  4. predict.py +135 -0
  5. preprocessing.py +82 -0
  6. preprocessing.r +184 -0
  7. requirements (1).txt +7 -0
  8. train.py +172 -0
app (1).py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — Gradio UI entry point.
3
+ """
4
+
5
+ import sys
6
+ print("Python started", flush=True)
7
+
8
+ try:
9
+ import gradio as gr
10
+ print("gradio OK", flush=True)
11
+ except Exception as e:
12
+ print(f"FAILED gradio: {e}", flush=True)
13
+ sys.exit(1)
14
+
15
+ try:
16
+ from config import QUICK_FIELDS
17
+ print("config OK", flush=True)
18
+ except Exception as e:
19
+ print(f"FAILED config: {e}", flush=True)
20
+ sys.exit(1)
21
+
22
+ try:
23
+ from train import train_model
24
+ print("train OK", flush=True)
25
+ except Exception as e:
26
+ print(f"FAILED train: {e}", flush=True)
27
+ sys.exit(1)
28
+
29
+ try:
30
+ from predict import predict, predict_single
31
+ print("predict OK", flush=True)
32
+ except Exception as e:
33
+ print(f"FAILED predict: {e}", flush=True)
34
+ sys.exit(1)
35
+
36
+ print("Building UI...", flush=True)
37
+
38
+ import gradio as gr
39
+ from config import QUICK_FIELDS
40
+ from train import train_model
41
+ from predict import predict, predict_single
42
+
43
+ DESCRIPTION = """
44
+ # House Price Predictor
45
+ **Stacking Ensemble: Lasso + Random Forest + XGBoost**
46
+
47
+ *Final Project- Giovanni Battista Del Basso, Francesco Ciccarese, Miguel Domingos, Santiago Genoni*
48
+
49
+ ---
50
+ **How to use:**
51
+ The model uses the Kaggle competition: "Housing Prices Competition for Kaggle Learn Users".
52
+ To use the model, upload the train and test set that you can find in the competition at this link: "https://www.kaggle.com/competitions/home-data-for-ml-course/overview"
53
+ 1. **Train Model** - Upload `train.csv` and click *Train Model*
54
+ 2. **Predict (CSV)** - Upload `test.csv` to download a full Kaggle submission file
55
+ 3. **Single House** - Fill in the key features to get a single price estimate
56
+ """
57
+
58
+ with gr.Blocks(title="House Price Predictor", theme=gr.themes.Soft()) as demo:
59
+
60
+ gr.Markdown(DESCRIPTION)
61
+
62
+ with gr.Tabs():
63
+
64
+ # ── Tab 1: Train ──────────────────────────────────────────────────────
65
+ with gr.Tab("Train Model"):
66
+ gr.Markdown(
67
+ "Upload your **train.csv** (Kaggle House Prices format) and train the stacking ensemble.\n\n"
68
+ ">Please wait after clicking!"
69
+ )
70
+ train_file = gr.File(label="Upload train.csv", file_types=[".csv"])
71
+ train_btn = gr.Button("Train Model", variant="primary")
72
+ train_out = gr.Markdown(value="Results will appear here after training.")
73
+ train_btn.click(fn=train_model, inputs=[train_file], outputs=[train_out])
74
+
75
+ # ── Tab 2: Predict CSV ────────────────────────────────────────────────
76
+ with gr.Tab("Predict (CSV)"):
77
+ gr.Markdown("Upload **test.csv** to generate a submission.csv ready for Kaggle.")
78
+ test_file = gr.File(label="Upload test.csv", file_types=[".csv"])
79
+ pred_btn = gr.Button("Generate Predictions", variant="primary")
80
+ pred_msg = gr.Markdown(value="Predictions will appear here.")
81
+ dl_file = gr.File(label="Download submission.csv")
82
+ pred_btn.click(fn=predict, inputs=[test_file], outputs=[dl_file, pred_msg])
83
+
84
+ # ── Tab 3: Single House ───────────────────────────────────────────────
85
+ with gr.Tab("Single House"):
86
+ gr.Markdown(
87
+ "Fill in the **8 key features** below to get a price estimate.\n\n"
88
+ "> All other house features are automatically filled with the **median values from the training data**."
89
+ )
90
+ inputs = []
91
+ with gr.Row():
92
+ for key, (label, default) in QUICK_FIELDS.items():
93
+ inputs.append(gr.Number(label=label, value=default))
94
+
95
+ single_btn = gr.Button("Estimate Price", variant="primary")
96
+ single_out = gr.Markdown(value="Your estimate will appear here.")
97
+ single_btn.click(fn=predict_single, inputs=inputs, outputs=[single_out])
98
+
99
+ gr.Markdown("---\nBuilt with Scikit-learn, XGBoost and Gradio.")
100
+
101
+ if __name__ == "__main__":
102
+ demo.launch(ssr_mode=False)
config.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ config.py — Shared constants used across all modules.
3
+ """
4
+
5
+ # Paths for saved model artefacts
6
+ MODEL_PATH = "stacking_model.joblib"
7
+ PREPROCESSOR_PATH = "preprocessor.joblib"
8
+ META_PATH = "model_meta.joblib"
9
+
10
+ # Preprocessing settings
11
+ COLUMNS_TO_DROP_BASE = ["MoSold", "YrSold", "Id"]
12
+ SKEW_THRESHOLD = 0.75
13
+
14
+ # The 8 fields shown in the Single House tab
15
+ # { column_name: (label, default_placeholder) }
16
+ QUICK_FIELDS = {
17
+ "GrLivArea": ("Above-Grade Living Area (sqft)", 1500),
18
+ "OverallQual": ("Overall Quality (1-10)", 7),
19
+ "YearBuilt": ("Year Built", 2000),
20
+ "TotalBsmtSF": ("Total Basement SF", 800),
21
+ "GarageArea": ("Garage Area (sqft)", 400),
22
+ "FullBath": ("Full Bathrooms", 2),
23
+ "BedroomAbvGr": ("Bedrooms Above Grade", 3),
24
+ "LotArea": ("Lot Area (sqft)", 8000),
25
+ }
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ r-base
predict.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ predict.py — Prediction logic.
3
+ """
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+ import numpy as np
8
+ import joblib
9
+ import os
10
+ import tempfile
11
+
12
+ from config import MODEL_PATH, PREPROCESSOR_PATH, META_PATH, QUICK_FIELDS, COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
13
+ from preprocessing import engineer_features
14
+
15
+
16
def _load_model():
    """Load the persisted ensemble, preprocessor and metadata.

    Raises FileNotFoundError with a user-facing message when any of the
    three artefacts is missing (i.e. training has not been run yet).
    """
    artefact_paths = (MODEL_PATH, PREPROCESSOR_PATH, META_PATH)
    if not all(os.path.exists(p) for p in artefact_paths):
        raise FileNotFoundError("No trained model found. Please go to the Train Model tab first.")
    return tuple(joblib.load(p) for p in artefact_paths)
22
+
23
+
24
def _prepare(df: pd.DataFrame, meta: dict) -> pd.DataFrame:
    """
    Applies the exact same steps as training:
    1. Drop columns
    2. Engineer features
    3. Fill missing
    4. Log-transform the exact skewed_cols saved during training
    5. Align to selected features

    `meta` is the dict persisted by train.py (selected/numerical/categorical
    feature lists, plus the skewed columns). Returns a frame with exactly
    the training columns, ready for the fitted preprocessor.
    """
    selected_features = meta["selected_features"]
    numerical_features = meta["numerical_features"]
    categorical_features = meta["categorical_features"]
    # Use saved skewed_cols if available, otherwise fall back to recalculating
    skewed_cols = meta.get("skewed_cols", None)

    # Step 1: drop — same base columns as training, plus the train-only
    # "source" marker and the Id key.
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    df = df.drop(columns=list(set(to_drop + ["source", "Id"])), errors="ignore")

    # Step 2: engineer features
    df = engineer_features(df)

    # Step 3: fill missing — categorical NaNs become the "Missing" category;
    # numeric NaNs get the column median (0 for a single-row frame, where a
    # median over one value would just echo the NaN).
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        fill_val = df[col].median() if len(df) > 1 else 0
        df[col] = df[col].fillna(fill_val).astype(float)

    # Step 4: log-transform
    if skewed_cols is not None:
        # Use exact columns from training
        for col in skewed_cols:
            if col in df.columns:
                # clip(lower=0) guards log1p against negative values
                df[col] = np.log1p(df[col].clip(lower=0))
    else:
        # Fallback: recalculate skewness (only works reliably on large datasets)
        num_cols = df.select_dtypes(include=[np.number]).columns
        skewed = df[num_cols].apply(lambda x: x.dropna().skew())
        for col in skewed[abs(skewed) > SKEW_THRESHOLD].index:
            df[col] = np.log1p(df[col].clip(lower=0))

    # Step 5: align to training columns — add any column the model expects
    # but the upload lacks, with a neutral default per feature type.
    for col in selected_features:
        if col not in df.columns:
            df[col] = "Missing" if col in categorical_features else 0.0

    # Reorder to the exact training column order and coerce dtypes so the
    # fitted ColumnTransformer sees the same schema it was fit on.
    X = df[selected_features].copy()
    for col in numerical_features:
        if col in X.columns:
            X[col] = pd.to_numeric(X[col], errors="coerce").fillna(0).astype(float)
    for col in categorical_features:
        if col in X.columns:
            X[col] = X[col].fillna("Missing").astype(str)

    return X
80
+
81
+
82
def predict(test_file, progress=gr.Progress()):
    """Generate a Kaggle submission CSV from an uploaded test file.

    Returns a (file_path, markdown_message) pair; on any failure the file
    slot is None and the message explains what went wrong.
    """
    try:
        if test_file is None:
            return None, "Please upload a test.csv file."

        progress(0.10, desc="Loading model...")
        ensemble, preprocessor, meta = _load_model()

        progress(0.30, desc="Loading test data...")
        csv_path = getattr(test_file, "name", test_file)
        raw = pd.read_csv(csv_path)
        # Keep the Id column for the submission; synthesize one if absent.
        ids = raw["Id"] if "Id" in raw.columns else pd.RangeIndex(len(raw))

        progress(0.55, desc="Preprocessing...")
        features = _prepare(raw, meta)

        progress(0.75, desc="Predicting...")
        # Model was trained on log1p(SalePrice); invert with expm1.
        sale_prices = np.expm1(ensemble.predict(preprocessor.transform(features)))

        submission = pd.DataFrame({"Id": ids, "SalePrice": sale_prices})
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", prefix="submission_")
        submission.to_csv(out_file.name, index=False)

        preview = submission.head(10).to_markdown(index=False)
        progress(1.0, desc="Done!")
        return out_file.name, f"Predictions ready! ({len(submission)} rows)\n\n{preview}\n\nDownload the full file below."

    except FileNotFoundError as e:
        return None, str(e)
    except Exception as e:
        return None, f"Prediction failed: {str(e)}"
113
+
114
+
115
def predict_single(*args):
    """Estimate the price of one house from the 8 Single-House inputs.

    `args` arrive in QUICK_FIELDS order. All remaining model features are
    filled with the raw training medians/modes stored in the metadata.
    Returns a markdown string (estimate or error message).
    """
    try:
        ensemble, preprocessor, meta = _load_model()
        raw_numeric_medians = meta.get("raw_numeric_medians", {})
        raw_cat_modes = meta.get("raw_cat_modes", {})

        # Build full raw row: training medians/modes as base, override with user input
        raw_row = {**raw_numeric_medians, **raw_cat_modes}
        for k, v in zip(list(QUICK_FIELDS.keys()), args):
            # A cleared gr.Number field yields None; the original float(v)
            # raised TypeError there. Skip it so the training-median default
            # from raw_numeric_medians is used instead.
            if v is not None:
                raw_row[k] = float(v)

        X = _prepare(pd.DataFrame([raw_row]), meta)
        pred_log = ensemble.predict(preprocessor.transform(X))[0]
        # Training target was log1p(SalePrice); invert for display.
        pred = np.expm1(pred_log)

        return f"### Estimated Sale Price: ${pred:,.0f}"

    except FileNotFoundError as e:
        return str(e)
    except Exception as e:
        return f"Error: {str(e)}"
preprocessing.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ preprocessing.py — Feature engineering and data preprocessing pipeline.
3
+ """
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from sklearn.preprocessing import RobustScaler, OneHotEncoder
8
+ from sklearn.impute import SimpleImputer
9
+ from sklearn.compose import ColumnTransformer
10
+ from sklearn.pipeline import Pipeline
11
+
12
+ from config import COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
13
+
14
+
15
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of *df* with engineered columns added.

    Adds TotalSF, TotalBath, HasPool, HasGarage, HasBsmt and IsRemodeled.
    Any missing source column falls back to a constant placeholder series
    (0, or NaN for the presence flags), so partial frames are accepted.
    The input frame is not mutated.
    """
    out = df.copy()

    def _col(name, default=0):
        # Column if present, otherwise a constant series aligned to the index.
        return out.get(name, pd.Series(default, index=out.index))

    def _zeroed(name):
        # Numeric column with NaNs counted as 0 in the additive features.
        return _col(name).fillna(0)

    out["TotalSF"] = _zeroed("TotalBsmtSF") + _zeroed("1stFlrSF") + _zeroed("2ndFlrSF")
    out["TotalBath"] = (
        _zeroed("FullBath")
        + 0.5 * _zeroed("HalfBath")
        + _zeroed("BsmtFullBath")
        + 0.5 * _zeroed("BsmtHalfBath")
    )
    out["HasPool"] = (_col("PoolArea") > 0).astype(int)
    # Presence flags: 1 when the source value is non-NaN (NaN default → 0).
    out["HasGarage"] = _col("GarageArea", np.nan).notnull().astype(int)
    out["HasBsmt"] = _col("TotalBsmtSF", np.nan).notnull().astype(int)
    out["IsRemodeled"] = (_col("YearRemodAdd") != _col("YearBuilt")).astype(int)
    return out
36
+
37
+
38
def preprocess_combined(df: pd.DataFrame) -> pd.DataFrame:
    """
    Full preprocessing pipeline for a combined frame.

    Steps: drop base + high-NaN columns, engineer features, coerce
    numeric-looking object columns, log1p-transform skewed numerics,
    fill remaining missing values.

    NOTE(review): here skewness is computed BEFORE missing values are
    filled, while train._full_preprocess fills first — confirm which
    ordering is the intended canonical one.
    """
    # Step 1 — Drop unwanted / high-NaN columns (>50% missing)
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    missing_pct = df.isnull().sum() * 100 / len(df)
    to_drop += missing_pct[missing_pct > 50].index.tolist()
    df = df.drop(columns=list(set(to_drop)), errors="ignore")

    # Step 2 — Feature engineering
    df = engineer_features(df)

    # Step 3 — Fix dtypes: numeric columns must be float, not object.
    # Snapshot the object columns once; the original recomputed
    # select_dtypes inside the loop for every column (accidental O(n^2)).
    object_cols = df.select_dtypes(include=["object"]).columns
    for col in object_cols:
        # Try to convert to numeric; if it fails for most rows keep as string
        converted = pd.to_numeric(df[col], errors="coerce")
        if converted.notna().sum() > 0.5 * len(df):
            df[col] = converted

    # Step 4 — Log-transform skewed numerics (clip guards log1p from negatives)
    num_cols = df.select_dtypes(include=[np.number]).columns
    skewed = df[num_cols].apply(lambda x: x.dropna().skew())
    for feat in skewed[abs(skewed) > SKEW_THRESHOLD].index:
        df[feat] = np.log1p(df[feat].clip(lower=0))

    # Step 5 — Fill missing values: "Missing" category / column median
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median()).astype(float)

    return df
69
+
70
+
71
def build_preprocessor(numerical_features: list, categorical_features: list) -> ColumnTransformer:
    """Assemble the column preprocessor.

    Numerics: median imputation followed by robust scaling.
    Categoricals: dense one-hot encoding, ignoring unseen categories.
    """
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ])
    categorical_pipeline = Pipeline([
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    transformers = [
        ("num", numeric_pipeline, numerical_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
    return ColumnTransformer(transformers)
preprocessing.r ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # preprocessing.R
3
+ # Feature engineering and data preprocessing pipeline
4
+ # R equivalent of preprocessing.py
5
+ # =============================================================================
6
+
7
+ library(dplyr)
8
+ library(tidyr)
9
+
10
+ # ── Constants ─────────────────────────────────────────────────────────────────
11
+
12
+ COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")
13
+ SKEW_THRESHOLD <- 0.75
14
+
15
+ # ── Feature Engineering ───────────────────────────────────────────────────────
16
+
17
# Add engineered columns to `df`, mirroring preprocessing.py::engineer_features.
# NA source values count as 0 in the sums; the Has* flags key off whether the
# source value is present (non-NA). Assumes all referenced columns exist in
# `df` (unlike the Python version, which substitutes defaults) — TODO confirm.
engineer_features <- function(df) {
  df <- df %>%
    mutate(
      # Total square footage: basement + 1st floor + 2nd floor
      TotalSF = replace_na(TotalBsmtSF, 0) +
                replace_na(`1stFlrSF`, 0) +
                replace_na(`2ndFlrSF`, 0),

      # Total bathrooms (half baths count as 0.5)
      TotalBath = replace_na(FullBath, 0) +
                  0.5 * replace_na(HalfBath, 0) +
                  replace_na(BsmtFullBath, 0) +
                  0.5 * replace_na(BsmtHalfBath, 0),

      # Binary flags
      HasPool = as.integer(!is.na(PoolArea) & PoolArea > 0),
      HasGarage = as.integer(!is.na(GarageArea)),
      HasBsmt = as.integer(!is.na(TotalBsmtSF)),
      # Remodelled when the remodel year differs from the build year
      IsRemodeled = as.integer(
        replace_na(YearRemodAdd, 0) != replace_na(YearBuilt, 0)
      )
    )
  return(df)
}
41
+
42
+ # ── Drop High-Missing Columns ─────────────────────────────────────────────────
43
+
44
# Drop columns with more than `threshold` percent missing values, plus the
# always-dropped base columns; only columns actually present are removed.
drop_high_missing <- function(df, threshold = 50) {
  na_pct <- colMeans(is.na(df)) * 100
  drop_set <- names(na_pct[na_pct > threshold])
  drop_set <- union(drop_set, COLUMNS_TO_DROP_BASE)
  drop_set <- intersect(drop_set, names(df))
  keep <- setdiff(names(df), drop_set)
  return(df[, keep, drop = FALSE])
}
52
+
53
+ # ── Skewness Correction ───────────────────────────────────────────────────────
54
+
55
# Return the names of numeric columns whose |skewness| exceeds `threshold`.
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  # Get numeric columns only
  num_cols <- df %>% select(where(is.numeric)) %>% names()

  # Calculate skewness for each numeric column
  skew_values <- sapply(num_cols, function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    if (length(x) < 3) return(0)   # too few points for a meaningful estimate
    n <- length(x)
    m <- mean(x)
    s <- sd(x)
    if (s == 0) return(0)          # constant column: treat as not skewed
    # Moment coefficient of skewness (g1).
    # NOTE(review): pandas Series.skew() applies a bias correction, so this
    # can disagree with preprocessing.py near the threshold — confirm.
    (sum((x - m)^3) / n) / (s^3)
  })

  skewed_cols <- names(skew_values[abs(skew_values) > threshold])
  return(skewed_cols)
}
74
+
75
# Apply log1p to every listed column that exists in `df`; negative values
# are clamped to 0 first so log1p stays defined.
apply_log_transform <- function(df, skewed_cols) {
  present <- intersect(skewed_cols, names(df))
  df[present] <- lapply(df[present], function(x) log1p(pmax(x, 0)))
  return(df)
}
83
+
84
+ # ── Fill Missing Values ───────────────────────────────────────────────────────
85
+
86
# Fill missing values in place: character columns get the literal string
# "Missing"; numeric columns get their own median. Other column types
# (factors, logicals) are left untouched, as in the original.
fill_missing <- function(df) {
  for (nm in names(df)) {
    col <- df[[nm]]
    if (is.character(col)) {
      col[is.na(col)] <- "Missing"
    } else if (is.numeric(col)) {
      col[is.na(col)] <- median(col, na.rm = TRUE)
    }
    df[[nm]] <- col
  }
  return(df)
}
102
+
103
+ # ── Full Preprocessing Pipeline ───────────────────────────────────────────────
104
+ # This is the R equivalent of preprocess_combined() in preprocessing.py
105
+
106
# Full preprocessing pipeline — R mirror of preprocessing.py::preprocess_combined.
# Returns list(data = processed frame, skewed_cols = columns that were
# log1p-transformed) so callers can persist and replay the exact transform.
# NOTE(review): like the Python version, skewness is measured before missing
# values are filled — confirm this matches the training-side ordering.
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  df <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  df <- engineer_features(df)

  cat("Step 3: Identifying skewed columns...\n")
  skewed_cols <- get_skewed_cols(df)
  cat(sprintf("  Found %d skewed columns: %s\n",
              length(skewed_cols),
              paste(head(skewed_cols, 5), collapse = ", ")))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  df <- apply_log_transform(df, skewed_cols)

  cat("Step 5: Filling missing values...\n")
  df <- fill_missing(df)

  cat("Done. Final shape:", nrow(df), "rows x", ncol(df), "cols\n")
  return(list(data = df, skewed_cols = skewed_cols))
}
128
+
129
+ # ── Save Raw Medians and Modes (for Single House prediction defaults) ──────────
130
+
131
# Compute raw per-column statistics used as Single-House prediction defaults:
# medians for numeric columns, the most frequent value for character columns.
get_raw_stats <- function(df) {
  # Numeric medians (NAs excluded)
  num_cols <- df %>% select(where(is.numeric)) %>% names()
  numeric_medians <- sapply(num_cols, function(col) median(df[[col]], na.rm = TRUE))

  # Categorical modes (ties resolved by which.max: first maximum wins)
  cat_cols <- df %>% select(where(is.character)) %>% names()
  cat_modes <- sapply(cat_cols, function(col) {
    tbl <- table(df[[col]])
    names(which.max(tbl))
  })

  return(list(numeric_medians = numeric_medians, cat_modes = cat_modes))
}
145
+
146
+ # ── Main: Run as standalone script ────────────────────────────────────────────
147
+
148
# Standalone CLI entry point: Rscript preprocessing.R <input.csv> <output.csv>.
# Reads a raw CSV, runs the full pipeline, writes the processed CSV plus a
# sibling *_skewed_cols.txt listing the log1p-transformed columns.
if (!interactive()) {
  args <- commandArgs(trailingOnly = TRUE)

  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }

  input_path <- args[1]
  output_path <- args[2]

  cat(sprintf("Reading data from: %s\n", input_path))
  df <- read.csv(input_path, stringsAsFactors = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data)
  if ("SalePrice" %in% names(df)) {
    target <- df$SalePrice
    df <- df %>% select(-SalePrice)
    # NOTE(review): `target` is held in memory but never written out —
    # confirm whether the target should also be saved alongside the features.
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Save skewed columns list next to the output CSV
  skewed_path <- sub("\\.csv$", "_skewed_cols.txt", output_path)
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}
requirements (1).txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
pandas
numpy
scikit-learn
xgboost
joblib
tabulate
# app.py imports gradio; HF Gradio Spaces provide it, but it is needed for local runs
gradio
train.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ train.py — Model training logic.
3
+ """
4
+
5
+ import gradio as gr
6
+ import pandas as pd
7
+ import numpy as np
8
+ import joblib
9
+
10
+ from sklearn.model_selection import GridSearchCV
11
+ from sklearn.linear_model import Lasso
12
+ from sklearn.ensemble import RandomForestRegressor, StackingRegressor
13
+ from xgboost import XGBRegressor
14
+
15
+ from config import MODEL_PATH, PREPROCESSOR_PATH, META_PATH, COLUMNS_TO_DROP_BASE, SKEW_THRESHOLD
16
+ from preprocessing import engineer_features, build_preprocessor
17
+
18
+
19
def _full_preprocess(df: pd.DataFrame):
    """
    Full preprocessing. Returns (X, skewed_cols, selected_features, num_feats, cat_feats).
    Separated so train can save skewed_cols and predict can reuse them.

    NOTE: the step order here (fill missing, THEN measure skew, THEN log1p)
    must stay in sync with predict._prepare, which replays the saved
    skewed_cols on the same fill-then-log sequence.
    """
    # Drop unwanted columns: the base list plus anything >50% missing,
    # plus the train-only "source" marker added by train_model.
    to_drop = [c for c in COLUMNS_TO_DROP_BASE if c in df.columns]
    missing_pct = df.isnull().sum() * 100 / len(df)
    to_drop += missing_pct[missing_pct > 50].index.tolist()
    df = df.drop(columns=list(set(to_drop)), errors="ignore")
    df = df.drop(columns=["source"], errors="ignore")

    # Engineer features
    df = engineer_features(df)

    # Fill missing before skew calculation
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Missing").astype(str)
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median()).astype(float)

    # Identify skewed numeric columns AFTER engineering and filling
    num_cols = df.select_dtypes(include=[np.number]).columns
    skewed_series = df[num_cols].apply(lambda x: x.skew())
    skewed_cols = skewed_series[abs(skewed_series) > SKEW_THRESHOLD].index.tolist()

    # Apply log transform (clip guards log1p against negative values)
    for col in skewed_cols:
        df[col] = np.log1p(df[col].clip(lower=0))

    # Record the final schema so prediction can reproduce it exactly.
    selected_features = df.columns.tolist()
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = df.select_dtypes(include=["object"]).columns.tolist()

    return df, skewed_cols, selected_features, numerical_features, categorical_features
54
+
55
+
56
def train_model(train_file, progress=gr.Progress()):
    """
    Train the stacking ensemble from an uploaded Kaggle train.csv.

    Tunes Lasso, Random Forest and XGBoost with 3-fold GridSearchCV on the
    log1p-transformed target, stacks them with a Lasso meta-learner, and
    persists the model, the fitted preprocessor and the metadata needed by
    predict.py. Returns a markdown status/results string for the UI.
    """
    try:
        if train_file is None:
            return "Please upload a train.csv file first."

        # Gradio may pass a file object or a plain path string.
        train_path = train_file.name if hasattr(train_file, "name") else train_file

        progress(0.05, desc="Loading data...")
        train_df = pd.read_csv(train_path)

        if "SalePrice" not in train_df.columns:
            return "Error: SalePrice column not found. Make sure you upload the Kaggle train.csv."

        target = train_df["SalePrice"].copy()
        train_df = train_df.drop(columns=["SalePrice", "Id"], errors="ignore")

        # Save RAW medians/modes before any transformation — predict_single
        # uses these as defaults for the fields the user does not supply.
        raw_numeric_medians = train_df.select_dtypes(include=[np.number]).median().to_dict()
        raw_cat_modes = {
            col: train_df[col].mode()[0] if not train_df[col].mode().empty else "Missing"
            for col in train_df.select_dtypes(include=["object"]).columns
        }

        # Marker column; _full_preprocess drops it again.
        train_df["source"] = "train"

        progress(0.12, desc="Preprocessing features...")
        X, skewed_cols, selected_features, numerical_features, categorical_features = _full_preprocess(train_df)

        preprocessor = build_preprocessor(numerical_features, categorical_features)
        X_processed = preprocessor.fit_transform(X)
        # Train on log1p(SalePrice); predictions are inverted with expm1.
        target_log = np.log1p(target)

        # Lasso
        progress(0.22, desc="Tuning Lasso... (1/3)")
        lasso_cv = GridSearchCV(
            Lasso(random_state=42),
            {"alpha": [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1], "max_iter": [10000]},
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        lasso_cv.fit(X_processed, target_log)

        # Random Forest
        progress(0.42, desc="Tuning Random Forest... (2/3)")
        rf_cv = GridSearchCV(
            RandomForestRegressor(random_state=42, n_jobs=-1),
            {
                "n_estimators": [100],
                "max_depth": [10, 20],
                "min_samples_split": [2, 5],
                "min_samples_leaf": [1, 2],
            },
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        rf_cv.fit(X_processed, target_log)

        # XGBoost — n_jobs=1 inside the estimator because GridSearchCV
        # already parallelises across folds/candidates with n_jobs=-1.
        progress(0.65, desc="Tuning XGBoost... (3/3)")
        xgb_cv = GridSearchCV(
            XGBRegressor(
                objective="reg:squarederror", random_state=42,
                tree_method="hist", n_jobs=1, verbosity=0,
            ),
            {
                "n_estimators": [200],
                "max_depth": [5, 7],
                "learning_rate": [0.05, 0.1],
                "subsample": [0.8, 1.0],
            },
            cv=3, scoring="neg_mean_squared_error", n_jobs=-1,
        )
        xgb_cv.fit(X_processed, target_log)

        # Stacking: refit each base model with its tuned params; a small
        # Lasso combines their out-of-fold predictions.
        progress(0.83, desc="Fitting Stacking Ensemble...")
        ensemble = StackingRegressor(
            estimators=[
                ("lasso", Lasso(**lasso_cv.best_params_, random_state=42)),
                ("rf", RandomForestRegressor(**rf_cv.best_params_, random_state=42, n_jobs=-1)),
                ("xgb", XGBRegressor(
                    **xgb_cv.best_params_, random_state=42,
                    tree_method="hist", n_jobs=1, verbosity=0,
                )),
            ],
            final_estimator=Lasso(alpha=0.001, random_state=42, max_iter=10000),
            cv=3, n_jobs=-1,
        )
        ensemble.fit(X_processed, target_log)

        # Persist everything predict.py needs to replay the exact pipeline.
        progress(0.96, desc="Saving model...")
        joblib.dump(ensemble, MODEL_PATH)
        joblib.dump(preprocessor, PREPROCESSOR_PATH)
        joblib.dump({
            "selected_features": selected_features,
            "numerical_features": numerical_features,
            "categorical_features": categorical_features,
            "skewed_cols": skewed_cols,
            "raw_numeric_medians": raw_numeric_medians,
            "raw_cat_modes": raw_cat_modes,
        }, META_PATH)

        # best_score_ is negated MSE on the log target; sqrt gives RMSE(log).
        lasso_rmse = np.sqrt(-lasso_cv.best_score_)
        rf_rmse = np.sqrt(-rf_cv.best_score_)
        xgb_rmse = np.sqrt(-xgb_cv.best_score_)

        progress(1.0, desc="Done!")
        return (
            f"Model trained successfully!\n\n"
            f"| Model | CV RMSE (log) | Best Params |\n"
            f"|---|---|---|\n"
            f"| Lasso | {lasso_rmse:.5f} | {lasso_cv.best_params_} |\n"
            f"| Random Forest | {rf_rmse:.5f} | {rf_cv.best_params_} |\n"
            f"| XGBoost | {xgb_rmse:.5f} | {xgb_cv.best_params_} |\n\n"
            f"Switch to Predict (CSV) or Single House to use it!"
        )

    except Exception as e:
        return f"Training failed: {str(e)}\n\nPlease check your CSV file and try again."
+ return f"Training failed: {str(e)}\n\nPlease check your CSV file and try again."