Spaces:
Sleeping
Sleeping
| # ============================================================================= | |
| # preprocessing.R | |
| # Feature engineering and data preprocessing pipeline | |
| # R equivalent of preprocessing.py | |
| # ============================================================================= | |
| library(dplyr) | |
| library(tidyr) | |
| # ── Constants ───────────────────────────────────────────────────────────────── | |
| # Columns that drop_high_missing() removes unconditionally, in addition to | |
| # any column whose missing percentage exceeds its threshold. | |
| COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id") | |
| # Default cutoff for get_skewed_cols(): a numeric column is reported as | |
| # skewed when the absolute value of its skewness exceeds this number. | |
| SKEW_THRESHOLD <- 0.75 | |
| # ── Feature Engineering ─────────────────────────────────────────────────────── | |
# Add derived features to a housing data frame.
#
# Args:
#   df: data frame containing TotalBsmtSF, the two floor-area columns,
#       FullBath, HalfBath, BsmtFullBath, BsmtHalfBath, PoolArea, GarageArea,
#       YearBuilt and YearRemodAdd. The floor-area columns are accepted under
#       their raw headers ("1stFlrSF", "2ndFlrSF") or under the syntactic
#       names make.names() derives from them ("X1stFlrSF", "X2ndFlrSF").
#
# Returns:
#   df with TotalSF, TotalBath, HasPool, HasGarage, HasBsmt and IsRemodeled
#   added (or overwritten if a column of that name already exists).
#
# Raises:
#   An error naming the first required column that cannot be found.
engineer_features <- function(df) {
  available <- names(df)

  # Fetch a column by its raw header, falling back to the make.names() form.
  column <- function(name) {
    candidates <- unique(c(name, make.names(name)))
    present <- candidates[candidates %in% available]
    if (length(present) == 0L) {
      stop("engineer_features(): column '", name, "' not found", call. = FALSE)
    }
    df[[present[1L]]]
  }

  # A missing measurement contributes nothing to a sum. The replacement value
  # is typed to match x so that integer columns stay integer.
  zero_if_na <- function(x) {
    x[is.na(x)] <- if (is.integer(x)) 0L else 0
    x
  }

  # 1 where the value is present and strictly positive, otherwise 0.
  is_positive <- function(x) as.integer(!is.na(x) & x > 0)

  # Total square footage: basement + 1st floor + 2nd floor
  total_sf <- zero_if_na(column("TotalBsmtSF")) +
    zero_if_na(column("1stFlrSF")) +
    zero_if_na(column("2ndFlrSF"))

  # Total bathrooms (half baths count as 0.5)
  total_bath <- zero_if_na(column("FullBath")) +
    0.5 * zero_if_na(column("HalfBath")) +
    zero_if_na(column("BsmtFullBath")) +
    0.5 * zero_if_na(column("BsmtHalfBath"))

  # Binary flags. An area of 0 means the feature is absent, so all three
  # flags require a strictly positive area rather than merely a non-missing
  # one.
  has_pool <- is_positive(column("PoolArea"))
  has_garage <- is_positive(column("GarageArea"))
  has_bsmt <- is_positive(column("TotalBsmtSF"))

  # A missing year is compared as 0, so a row with exactly one of the two
  # years missing is flagged as remodeled and a row missing both is not.
  is_remodeled <- as.integer(
    zero_if_na(column("YearRemodAdd")) != zero_if_na(column("YearBuilt"))
  )

  df[["TotalSF"]] <- total_sf
  df[["TotalBath"]] <- total_bath
  df[["HasPool"]] <- has_pool
  df[["HasGarage"]] <- has_garage
  df[["HasBsmt"]] <- has_bsmt
  df[["IsRemodeled"]] <- is_remodeled
  df
}
| # ── Drop High-Missing Columns ───────────────────────────────────────────────── | |
# Remove sparsely populated columns plus the always-dropped base columns.
#
# Args:
#   df:        data frame to prune.
#   threshold: percentage (0-100); a column is removed when strictly more
#              than this share of its values is NA.
#
# Returns:
#   df without the columns that exceed the threshold and without any column
#   listed in COLUMNS_TO_DROP_BASE. Names not present in df are ignored.
drop_high_missing <- function(df, threshold = 50) {
  pct_missing <- 100 * colMeans(is.na(df))
  too_sparse <- names(pct_missing[pct_missing > threshold])
  unwanted <- intersect(union(too_sparse, COLUMNS_TO_DROP_BASE), names(df))
  df[!(names(df) %in% unwanted)]
}
| # ── Skewness Correction ─────────────────────────────────────────────────────── | |
# List the numeric columns whose skewness magnitude exceeds a threshold.
#
# Skewness is computed per column on the non-missing values as
#   (sum((x - mean(x))^3) / n) / sd(x)^3
# i.e. the third central moment (divisor n) over the cube of the sample
# standard deviation (divisor n - 1).
#
# Args:
#   df:        data frame; non-numeric columns are ignored.
#   threshold: a column is reported when abs(skewness) > threshold.
#
# Returns:
#   Character vector of column names; character(0) when nothing qualifies,
#   including when df has no numeric columns.
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  numeric_names <- names(df)[vapply(df, is.numeric, logical(1))]

  skewness_of <- function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    n <- length(x)
    # Too few observations for a meaningful estimate: treat as symmetric.
    if (n < 3L) return(0)
    s <- sd(x)
    # Constant column (s == 0) or a column holding Inf (s not finite): the
    # ratio is undefined, so report no skew instead of failing on NA/NaN.
    if (!is.finite(s) || s == 0) return(0)
    (sum((x - mean(x))^3) / n) / s^3
  }

  # vapply() keeps the result a numeric vector even when numeric_names is
  # empty, where sapply() would hand back a list and break abs().
  skew_values <- vapply(numeric_names, skewness_of, numeric(1))
  names(skew_values)[which(abs(skew_values) > threshold)]
}
# Apply log1p() to the named columns of a data frame.
#
# Args:
#   df:          data frame to transform.
#   skewed_cols: character vector of column names; names that are not
#                columns of df are skipped.
#
# Returns:
#   df with each listed column replaced by log1p(pmax(column, 0)). Negative
#   values are therefore clamped to 0 first and come out as 0; NA stays NA.
apply_log_transform <- function(df, skewed_cols) {
  targets <- skewed_cols[skewed_cols %in% names(df)]
  for (nm in targets) {
    clamped <- pmax(df[[nm]], 0)
    df[[nm]] <- log1p(clamped)
  }
  df
}
| # ── Fill Missing Values ─────────────────────────────────────────────────────── | |
# Replace NA values column by column.
#
# Character columns get the literal "Missing"; numeric columns get their own
# median (computed with na.rm = TRUE). Columns of any other type are left
# untouched, and a numeric column with no non-missing values keeps its NAs
# because its median is itself NA.
#
# Args:
#   df: data frame to fill.
#
# Returns:
#   df with the replacements applied.
fill_missing <- function(df) {
  text_names <- names(df)[vapply(df, is.character, logical(1))]
  for (nm in text_names) {
    values <- df[[nm]]
    values[is.na(values)] <- "Missing"
    df[[nm]] <- values
  }

  numeric_names <- names(df)[vapply(df, is.numeric, logical(1))]
  for (nm in numeric_names) {
    values <- df[[nm]]
    values[is.na(values)] <- median(values, na.rm = TRUE)
    df[[nm]] <- values
  }

  df
}
| # ── Full Preprocessing Pipeline ─────────────────────────────────────────────── | |
| # This is the R equivalent of preprocess_combined() in preprocessing.py | |
# Run every preprocessing stage in sequence, printing progress with cat().
#
# Stage order: drop_high_missing() -> engineer_features() ->
# get_skewed_cols() -> apply_log_transform() -> fill_missing().
#
# Args:
#   df: raw data frame.
#
# Returns:
#   list(data = <processed data frame>,
#        skewed_cols = <names returned by get_skewed_cols()>).
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  reduced <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  featured <- engineer_features(reduced)

  cat("Step 3: Identifying skewed columns...\n")
  skewed <- get_skewed_cols(featured)
  # Only the first five names are shown to keep the log line short.
  preview <- paste(head(skewed, 5), collapse = ", ")
  cat(sprintf(" Found %d skewed columns: %s\n", length(skewed), preview))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  transformed <- apply_log_transform(featured, skewed)

  cat("Step 5: Filling missing values...\n")
  filled <- fill_missing(transformed)

  cat("Done. Final shape:", nrow(filled), "rows x", ncol(filled), "cols\n")
  list(data = filled, skewed_cols = skewed)
}
| # ── Compute Raw Medians and Modes (for Single House prediction defaults) ─────── | |
# Summarise a data frame's columns: median per numeric column and most
# frequent value per character column.
#
# Args:
#   df: data frame to summarise.
#
# Returns:
#   list(numeric_medians = <named double vector>,
#        cat_modes       = <named character vector>).
#   Both elements are always atomic vectors, zero-length when df has no
#   column of the corresponding type. A character column with no non-missing
#   values has mode NA; ties go to the first value in table() order.
get_raw_stats <- function(df) {
  numeric_names <- names(df)[vapply(df, is.numeric, logical(1))]
  text_names <- names(df)[vapply(df, is.character, logical(1))]

  # Numeric medians
  numeric_medians <- vapply(
    numeric_names,
    function(col) as.numeric(median(df[[col]], na.rm = TRUE)),
    numeric(1)
  )

  # Categorical modes
  cat_modes <- vapply(
    text_names,
    function(col) {
      counts <- table(df[[col]])
      # table() drops NA, so an all-NA column yields an empty table and
      # which.max() would return nothing.
      if (length(counts) == 0L) return(NA_character_)
      names(counts)[which.max(counts)]
    },
    character(1)
  )

  list(numeric_medians = numeric_medians, cat_modes = cat_modes)
}
| # ── Main: Run as standalone script ──────────────────────────────────────────── | |
# Command-line entry point; runs only in non-interactive sessions.
# Usage: Rscript preprocessing.R <input.csv> <output.csv>
# Writes the processed data to <output.csv> and the list of skewed column
# names to a companion "_skewed_cols.txt" file.
if (!interactive()) {
  args <- commandArgs(trailingOnly = TRUE)
  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }
  input_path <- args[1]
  output_path <- args[2]

  cat(sprintf("Reading data from: %s\n", input_path))
  # check.names = FALSE keeps headers such as "1stFlrSF" verbatim instead of
  # letting read.csv() rewrite them to "X1stFlrSF"; the feature code refers to
  # these columns by their raw names.
  df <- read.csv(input_path, stringsAsFactors = FALSE, check.names = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data). The values are kept in `target`
  # for the rest of the session; this script does not write them to disk.
  if ("SalePrice" %in% names(df)) {
    target <- df$SalePrice
    df <- df %>% select(-SalePrice)
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Save skewed columns list next to the processed data.
  skewed_path <- sub("\\.csv$", "_skewed_cols.txt", output_path)
  if (identical(skewed_path, output_path)) {
    # output_path has no ".csv" suffix to replace: append the suffix instead,
    # otherwise the list would overwrite the processed data just written.
    skewed_path <- paste0(output_path, "_skewed_cols.txt")
  }
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}