# =============================================================================
# preprocessing.R
# Feature engineering and data preprocessing pipeline
# R equivalent of preprocessing.py
# =============================================================================

library(dplyr)
library(tidyr)

# ── Constants ─────────────────────────────────────────────────────────────────

# Columns removed unconditionally by drop_high_missing().
COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")

# Absolute skewness above which a numeric column is log-transformed.
SKEW_THRESHOLD <- 0.75

# ── Feature Engineering ───────────────────────────────────────────────────────

#' Add derived features to a housing data frame.
#'
#' Requires the columns TotalBsmtSF, `1stFlrSF`, `2ndFlrSF`, FullBath,
#' HalfBath, BsmtFullBath, BsmtHalfBath, PoolArea, GarageArea, YearBuilt and
#' YearRemodAdd. Missing values are treated as 0 when summing.
#'
#' @param df A data frame containing the columns listed above.
#' @return `df` with TotalSF, TotalBath, HasPool, HasGarage, HasBsmt and
#'   IsRemodeled appended.
engineer_features <- function(df) {
  df %>%
    mutate(
      # Total square footage: basement + 1st floor + 2nd floor
      TotalSF = replace_na(TotalBsmtSF, 0) +
        replace_na(`1stFlrSF`, 0) +
        replace_na(`2ndFlrSF`, 0),

      # Total bathrooms (half baths count as 0.5)
      TotalBath = replace_na(FullBath, 0) +
        0.5 * replace_na(HalfBath, 0) +
        replace_na(BsmtFullBath, 0) +
        0.5 * replace_na(BsmtHalfBath, 0),

      # Binary flags. An area of 0 means the feature is absent, so every flag
      # requires a strictly positive, non-missing area (as HasPool does).
      HasPool = as.integer(!is.na(PoolArea) & PoolArea > 0),
      HasGarage = as.integer(!is.na(GarageArea) & GarageArea > 0),
      HasBsmt = as.integer(!is.na(TotalBsmtSF) & TotalBsmtSF > 0),
      IsRemodeled = as.integer(
        replace_na(YearRemodAdd, 0) != replace_na(YearBuilt, 0)
      )
    )
}

# ── Drop High-Missing Columns ─────────────────────────────────────────────────

#' Drop columns with too many missing values plus the base drop list.
#'
#' @param df A data frame.
#' @param threshold Percentage (0-100); columns whose share of NA values is
#'   strictly greater than this are dropped.
#' @return `df` without the high-missing columns and without any column named
#'   in COLUMNS_TO_DROP_BASE that is present.
drop_high_missing <- function(df, threshold = 50) {
  missing_pct <- colMeans(is.na(df)) * 100

  # which() skips NA/NaN comparisons (e.g. a zero-row data frame).
  high_missing <- names(df)[which(missing_pct > threshold)]

  cols_to_drop <- intersect(
    union(high_missing, COLUMNS_TO_DROP_BASE),
    names(df)
  )

  df %>% select(-all_of(cols_to_drop))
}

# ── Skewness Correction ───────────────────────────────────────────────────────

#' Names of numeric columns whose absolute skewness exceeds a threshold.
#'
#' Skewness is computed per column on the non-missing values as the third
#' central moment (divided by n) over the cube of `sd()` (which uses the
#' n - 1 denominator). Columns with fewer than 3 non-missing values, or with
#' zero or undefined standard deviation, get skewness 0.
#'
#' @param df A data frame.
#' @param threshold Absolute skewness cut-off.
#' @return Character vector of column names (possibly empty).
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  # Get numeric columns only
  num_cols <- df %>%
    select(where(is.numeric)) %>%
    names()

  # vapply (not sapply) so an empty column set still yields a numeric vector.
  skew_values <- vapply(num_cols, function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    n <- length(x)
    if (n < 3) {
      return(0)
    }
    m <- mean(x)
    s <- sd(x)
    # sd() is NaN when x contains Inf; treat that like a constant column.
    if (is.na(s) || s == 0) {
      return(0)
    }
    (sum((x - m)^3) / n) / (s^3)
  }, numeric(1))

  num_cols[which(abs(skew_values) > threshold)]
}

#' Apply log1p to the given columns, clamping negative values to 0 first.
#'
#' @param df A data frame.
#' @param skewed_cols Character vector of column names; names not present in
#'   `df` are ignored and each column is transformed at most once.
#' @return `df` with the selected columns replaced by `log1p(pmax(x, 0))`.
apply_log_transform <- function(df, skewed_cols) {
  for (col in intersect(skewed_cols, names(df))) {
    df[[col]] <- log1p(pmax(df[[col]], 0))
  }
  df
}

# ── Fill Missing Values ───────────────────────────────────────────────────────

#' Impute missing values.
#'
#' Character columns get the literal "Missing"; numeric columns get their
#' median (computed with NAs removed). Columns of other types are untouched.
#'
#' @param df A data frame.
#' @return `df` with NAs in character and numeric columns filled.
fill_missing <- function(df) {
  # Categorical: fill with "Missing"
  cat_cols <- df %>%
    select(where(is.character)) %>%
    names()
  for (col in cat_cols) {
    df[[col]][is.na(df[[col]])] <- "Missing"
  }

  # Numeric: fill with median. Base assignment is used on purpose: it lets an
  # integer column be promoted to double when the median is fractional.
  num_cols <- df %>%
    select(where(is.numeric)) %>%
    names()
  for (col in num_cols) {
    med <- median(df[[col]], na.rm = TRUE)
    df[[col]][is.na(df[[col]])] <- med
  }

  df
}

# ── Full Preprocessing Pipeline ───────────────────────────────────────────────
# This is the R equivalent of preprocess_combined() in preprocessing.py

#' Run the full preprocessing pipeline, printing progress to stdout.
#'
#' Steps, in order: drop high-missing/flagged columns, engineer features,
#' detect skewed numeric columns, log1p-transform them, fill missing values.
#'
#' @param df Raw feature data frame (see engineer_features() for the columns
#'   that must be present after the drop step).
#' @return A list with `data` (the processed data frame) and `skewed_cols`
#'   (character vector of the columns that were log-transformed).
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  df <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  df <- engineer_features(df)

  cat("Step 3: Identifying skewed columns...\n")
  skewed_cols <- get_skewed_cols(df)
  # Only the first five names are printed to keep the log line short.
  cat(sprintf(
    " Found %d skewed columns: %s\n",
    length(skewed_cols),
    paste(head(skewed_cols, 5), collapse = ", ")
  ))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  df <- apply_log_transform(df, skewed_cols)

  cat("Step 5: Filling missing values...\n")
  df <- fill_missing(df)

  cat("Done. Final shape:", nrow(df), "rows x", ncol(df), "cols\n")

  list(data = df, skewed_cols = skewed_cols)
}

# ── Save Raw Medians and Modes (for Single House prediction defaults) ─────────

#' Per-column medians (numeric) and modes (character) of a data frame.
#'
#' @param df A data frame.
#' @return A list with `numeric_medians` (named numeric vector) and
#'   `cat_modes` (named character vector). A character column with no
#'   non-missing values gets NA as its mode.
get_raw_stats <- function(df) {
  # Numeric medians
  num_cols <- df %>%
    select(where(is.numeric)) %>%
    names()
  numeric_medians <- vapply(
    num_cols,
    function(col) as.numeric(median(df[[col]], na.rm = TRUE)),
    numeric(1)
  )

  # Categorical modes (first most-frequent level in table() order)
  cat_cols <- df %>%
    select(where(is.character)) %>%
    names()
  cat_modes <- vapply(cat_cols, function(col) {
    counts <- table(df[[col]])
    # table() drops NAs, so an all-NA column has no levels to pick from.
    if (length(counts) == 0) {
      return(NA_character_)
    }
    names(counts)[which.max(counts)]
  }, character(1))

  list(numeric_medians = numeric_medians, cat_modes = cat_modes)
}

# ── Main: Run as standalone script ────────────────────────────────────────────

# sys.nframe() == 0L restricts this to direct execution (Rscript file.R ...);
# source()-ing the file from another script only defines the functions.
if (!interactive() && sys.nframe() == 0L) {
  args <- commandArgs(trailingOnly = TRUE)

  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input_csv> <output_csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }

  input_path <- args[1]
  output_path <- args[2]

  cat(sprintf("Reading data from: %s\n", input_path))
  # check.names = FALSE keeps headers such as "1stFlrSF" intact; the default
  # would rename them to "X1stFlrSF" and break engineer_features().
  df <- read.csv(input_path, stringsAsFactors = FALSE, check.names = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data)
  # NOTE(review): `target` is kept in memory only; it is not written anywhere.
  if ("SalePrice" %in% names(df)) {
    target <- df$SalePrice
    df <- df %>% select(-SalePrice)
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Save skewed columns list. If the output path has no ".csv" suffix, append
  # instead of substituting so the list never overwrites the processed data.
  if (grepl("\\.csv$", output_path)) {
    skewed_path <- sub("\\.csv$", "_skewed_cols.txt", output_path)
  } else {
    skewed_path <- paste0(output_path, "_skewed_cols.txt")
  }
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}