# RX12-Housing-Competition / preprocessing.r
# GBDB02's picture
# Upload 8 files
# 97b9bf6 verified
# =============================================================================
# preprocessing.R
# Feature engineering and data preprocessing pipeline
# R equivalent of preprocessing.py
# =============================================================================
library(dplyr)
library(tidyr)
# ── Constants ─────────────────────────────────────────────────────────────────
# Columns that drop_high_missing() always removes when present, in addition to
# any column whose missing-value percentage exceeds its threshold.
COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")
# Default cutoff for get_skewed_cols(): a numeric column is reported as skewed
# when the absolute value of its skewness is greater than this number.
SKEW_THRESHOLD <- 0.75
# ── Feature Engineering ───────────────────────────────────────────────────────
# Add derived area, bathroom and indicator columns to a housing data frame.
#
# New columns (appended, or overwritten if they already exist):
#   TotalSF     - basement + 1st floor + 2nd floor area, missing parts as 0.
#   TotalBath   - full baths + 0.5 * half baths, above grade and basement,
#                 missing parts as 0.
#   HasPool     - 1L when PoolArea is present and > 0, else 0L.
#   HasGarage   - 1L when GarageArea is present and > 0, else 0L.
#   HasBsmt     - 1L when TotalBsmtSF is present and > 0, else 0L.
#   IsRemodeled - 1L when both years are present and differ, else 0L.
#
# Args:
#   df: data frame holding the columns listed in `required` below. The floor
#       area columns must carry their literal names ("1stFlrSF", "2ndFlrSF"),
#       i.e. the data must not have been read with name mangling.
# Returns: `df` with the six derived columns.
# Raises: an error naming any required column that is absent.
engineer_features <- function(df) {
  required <- c(
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
    "PoolArea", "GarageArea", "YearRemodAdd", "YearBuilt"
  )
  absent <- setdiff(required, names(df))
  if (length(absent) > 0) {
    stop(
      "engineer_features(): missing required column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # NA -> 0 so that one missing component does not turn a whole sum into NA.
  # Assigning 0L keeps integer columns integer and leaves doubles as doubles.
  zero_na <- function(x) {
    x[is.na(x)] <- 0L
    x
  }

  # 1L only for a recorded, strictly positive value. A bare !is.na() test would
  # flag rows whose area is recorded as 0 (no garage / no basement) as present.
  flag_positive <- function(x) {
    as.integer(!is.na(x) & x > 0)
  }

  # Total square footage: basement + 1st floor + 2nd floor
  df[["TotalSF"]] <- zero_na(df[["TotalBsmtSF"]]) +
    zero_na(df[["1stFlrSF"]]) +
    zero_na(df[["2ndFlrSF"]])

  # Total bathrooms (half baths count as 0.5)
  df[["TotalBath"]] <- zero_na(df[["FullBath"]]) +
    0.5 * zero_na(df[["HalfBath"]]) +
    zero_na(df[["BsmtFullBath"]]) +
    0.5 * zero_na(df[["BsmtHalfBath"]])

  # Binary flags
  df[["HasPool"]] <- flag_positive(df[["PoolArea"]])
  df[["HasGarage"]] <- flag_positive(df[["GarageArea"]])
  df[["HasBsmt"]] <- flag_positive(df[["TotalBsmtSF"]])

  # A missing year is "unknown", not evidence of a remodel, so it yields 0L
  # instead of being compared as year 0.
  remod_year <- df[["YearRemodAdd"]]
  built_year <- df[["YearBuilt"]]
  df[["IsRemodeled"]] <- as.integer(
    !is.na(remod_year) & !is.na(built_year) & remod_year != built_year
  )

  df
}
# ── Drop High-Missing Columns ─────────────────────────────────────────────────
# Remove sparsely populated columns plus the always-dropped base columns.
#
# Args:
#   df: data frame to prune.
#   threshold: percentage (0-100); a column is removed when its share of NA
#     values is strictly greater than this.
# Returns: `df` without the removed columns; rows and column order unchanged.
drop_high_missing <- function(df, threshold = 50) {
  pct_na <- colMeans(is.na(df)) * 100
  # which() skips undefined comparisons (e.g. NaN from a zero-row input)
  too_sparse <- names(pct_na)[which(pct_na > threshold)]
  # Names listed here that df does not contain are simply ignored
  unwanted <- c(too_sparse, COLUMNS_TO_DROP_BASE)
  df[!(names(df) %in% unwanted)]
}
# ── Skewness Correction ───────────────────────────────────────────────────────
# Name the numeric columns whose absolute skewness exceeds `threshold`.
#
# Skewness of a column is computed on its non-missing values as
#   (sum((x - mean(x))^3) / n) / sd(x)^3
# i.e. the biased third central moment over the cube of the sample standard
# deviation. Columns with fewer than three non-missing values, or with zero or
# undefined spread, score 0 and are therefore never reported.
#
# Args:
#   df: data frame to scan; non-numeric columns are ignored.
#   threshold: cutoff on |skewness|; defaults to SKEW_THRESHOLD.
# Returns: character vector of column names, character(0) when none qualify
#   (including when df has no numeric columns at all).
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  # Get numeric columns only
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]

  skewness_of <- function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    n <- length(x)
    if (n < 3) {
      return(0)
    }
    s <- sd(x)
    # sd() is NaN when x holds infinite values; treat that like zero spread
    # rather than letting `if` fail on a missing condition.
    if (is.na(s) || s == 0) {
      return(0)
    }
    (sum((x - mean(x))^3) / n) / s^3
  }

  # vapply() keeps this a numeric vector even for zero columns; sapply() would
  # hand back list() there and abs() below would error.
  skew_values <- vapply(num_cols, skewness_of, numeric(1))

  # which() drops undefined comparisons so no NA names leak into the result
  num_cols[which(abs(skew_values) > threshold)]
}
# Replace each listed column with log1p() of its values, after raising
# negative values to 0 so the logarithm is defined.
#
# Args:
#   df: data frame to transform.
#   skewed_cols: names of the columns to transform; names that are not columns
#     of df are skipped.
# Returns: `df` with the listed columns transformed; missing values stay NA.
apply_log_transform <- function(df, skewed_cols) {
  present <- skewed_cols[skewed_cols %in% names(df)]
  for (nm in present) {
    clipped <- pmax(df[[nm]], 0)
    df[[nm]] <- log1p(clipped)
  }
  df
}
# ── Fill Missing Values ───────────────────────────────────────────────────────
# Impute missing values column by column.
#
# Character columns get the literal label "Missing"; numeric columns get the
# median of their non-missing values. Columns of any other type are left as
# they are. A numeric column with no non-missing values stays all NA because
# its median is NA.
#
# Args:
#   df: data frame to impute.
# Returns: `df` with the imputed columns.
fill_missing <- function(df) {
  # Categorical: fill with "Missing"
  text_cols <- names(df)[vapply(df, is.character, logical(1))]
  for (nm in text_cols) {
    values <- df[[nm]]
    values[is.na(values)] <- "Missing"
    df[[nm]] <- values
  }

  # Numeric: fill with median
  numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  for (nm in numeric_cols) {
    values <- df[[nm]]
    fill_value <- median(values, na.rm = TRUE)
    values[is.na(values)] <- fill_value
    df[[nm]] <- values
  }

  df
}
# ── Full Preprocessing Pipeline ───────────────────────────────────────────────
# This is the R equivalent of preprocess_combined() in preprocessing.py
# Run the whole preprocessing sequence on one data frame, printing a progress
# line before each step:
#   1. drop_high_missing()   2. engineer_features()   3. get_skewed_cols()
#   4. apply_log_transform() 5. fill_missing()
# Skewed columns are identified before imputation, on the data as it stands
# after feature engineering.
#
# Args:
#   df: raw data frame.
# Returns: list with `data` (the processed data frame) and `skewed_cols`
#   (names of the columns that were log-transformed).
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  working <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  working <- engineer_features(working)

  cat("Step 3: Identifying skewed columns...\n")
  skewed_cols <- get_skewed_cols(working)
  # Only the first five names are shown to keep the log line short
  preview <- paste(head(skewed_cols, 5), collapse = ", ")
  cat(sprintf(" Found %d skewed columns: %s\n", length(skewed_cols), preview))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  working <- apply_log_transform(working, skewed_cols)

  cat("Step 5: Filling missing values...\n")
  working <- fill_missing(working)

  n_rows <- nrow(working)
  n_cols <- ncol(working)
  cat("Done. Final shape:", n_rows, "rows x", n_cols, "cols\n")

  list(data = working, skewed_cols = skewed_cols)
}
# ── Save Raw Medians and Modes (for Single House prediction defaults) ──────────
# Summarise a data frame as per-column defaults.
#
# Args:
#   df: data frame to summarise.
# Returns: list with
#   numeric_medians - named double vector, the median of each numeric column
#     ignoring NA (NA when a column has no non-missing values);
#   cat_modes - named character vector, the most frequent non-missing value of
#     each character column (first in table() order on ties, NA when a column
#     has no non-missing values).
#   Both are always atomic vectors, possibly of length zero.
get_raw_stats <- function(df) {
  # Numeric medians
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  numeric_medians <- vapply(
    num_cols,
    function(col) as.numeric(median(df[[col]], na.rm = TRUE)),
    numeric(1)
  )

  # Categorical modes
  cat_cols <- names(df)[vapply(df, is.character, logical(1))]
  cat_modes <- vapply(
    cat_cols,
    function(col) {
      counts <- table(df[[col]])
      # An all-NA column yields an empty table; report NA instead of a
      # zero-length value, which would make the overall result a list.
      if (length(counts) == 0L) {
        return(NA_character_)
      }
      names(counts)[which.max(counts)]
    },
    character(1)
  )

  list(numeric_medians = numeric_medians, cat_modes = cat_modes)
}
# ── Main: Run as standalone script ────────────────────────────────────────────
# Command-line entry point:
#   Rscript preprocessing.R <input.csv> <output.csv>
# Reads the input CSV, sets SalePrice aside if present, runs
# preprocess_combined(), writes the processed table to <output.csv> and the
# names of the log-transformed columns to a companion text file.
#
# sys.nframe() == 0L is true only when this file is the top-level script, so
# source()-ing it from another non-interactive process just defines the
# functions instead of parsing that process's arguments or quitting it.
if (!interactive() && sys.nframe() == 0L) {
  args <- commandArgs(trailingOnly = TRUE)
  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }
  input_path <- args[1]
  output_path <- args[2]

  cat(sprintf("Reading data from: %s\n", input_path))
  # check.names = FALSE keeps headers that start with a digit ("1stFlrSF",
  # "2ndFlrSF") verbatim. The default would rename them to "X1stFlrSF" etc.,
  # and engineer_features() looks the columns up by their original names.
  df <- read.csv(input_path, stringsAsFactors = FALSE, check.names = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data). It is only held in memory here;
  # nothing below writes it out.
  if ("SalePrice" %in% names(df)) {
    target <- df[["SalePrice"]]
    df <- df %>% select(-SalePrice)
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Save skewed columns list
  skewed_path <- sub("\\.csv$", "_skewed_cols.txt", output_path)
  if (identical(skewed_path, output_path)) {
    # Output path has no ".csv" suffix, so sub() changed nothing; append the
    # suffix instead of overwriting the processed data just written.
    skewed_path <- paste0(output_path, "_skewed_cols.txt")
  }
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}