# Spaces: ESCP / RX12-Housing-Competition (Sleeping)
# File size: 6,606 Bytes
# 97b9bf6

# =============================================================================
# preprocessing.R
# Feature engineering and data preprocessing pipeline
# R equivalent of preprocessing.py
# =============================================================================

library(dplyr)
library(tidyr)

# ── Constants ─────────────────────────────────────────────────────────────────

# Columns that drop_high_missing() always removes (when present), in addition
# to the columns it drops for exceeding the missing-value threshold.
COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")
# Default cut-off for get_skewed_cols(): a numeric column is reported as skewed
# when the absolute value of its skewness is strictly greater than this.
SKEW_THRESHOLD <- 0.75

# ── Feature Engineering ───────────────────────────────────────────────────────

# Add derived features to a housing data frame.
#
# Args:
#   df: data frame containing the columns TotalBsmtSF, `1stFlrSF`, `2ndFlrSF`,
#       FullBath, HalfBath, BsmtFullBath, BsmtHalfBath, PoolArea, GarageArea,
#       YearRemodAdd and YearBuilt.
#
# Returns:
#   `df` with these columns added (or overwritten if already present):
#     TotalSF     - basement + 1st floor + 2nd floor area, NA counted as 0
#     TotalBath   - full baths + 0.5 * half baths (above ground and basement)
#     HasPool     - 1L when PoolArea is known and > 0, else 0L
#     HasGarage   - 1L when GarageArea is known and > 0, else 0L
#     HasBsmt     - 1L when TotalBsmtSF is known and > 0, else 0L
#     IsRemodeled - 1L when YearRemodAdd differs from YearBuilt, else 0L
#
# Raises:
#   An error naming the absent columns when any required column is missing.
engineer_features <- function(df) {
  required <- c(
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
    "PoolArea", "GarageArea", "YearRemodAdd", "YearBuilt"
  )
  absent <- setdiff(required, names(df))
  if (length(absent) > 0) {
    stop(
      "engineer_features(): missing required column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # Count a missing measurement as zero; 0L keeps integer columns integer.
  na_to_zero <- function(x) {
    x[is.na(x)] <- 0L
    x
  }

  # A feature is "present" only when its area is recorded AND positive.
  # An area of 0 means the house has no pool / garage / basement, so a bare
  # !is.na() test would wrongly flag it as present.
  has_area <- function(x) {
    as.integer(!is.na(x) & x > 0)
  }

  # Total square footage: basement + 1st floor + 2nd floor
  df[["TotalSF"]] <- na_to_zero(df[["TotalBsmtSF"]]) +
    na_to_zero(df[["1stFlrSF"]]) +
    na_to_zero(df[["2ndFlrSF"]])

  # Total bathrooms (half baths count as 0.5)
  df[["TotalBath"]] <- na_to_zero(df[["FullBath"]]) +
    0.5 * na_to_zero(df[["HalfBath"]]) +
    na_to_zero(df[["BsmtFullBath"]]) +
    0.5 * na_to_zero(df[["BsmtHalfBath"]])

  # Binary flags
  df[["HasPool"]] <- has_area(df[["PoolArea"]])
  df[["HasGarage"]] <- has_area(df[["GarageArea"]])
  df[["HasBsmt"]] <- has_area(df[["TotalBsmtSF"]])
  df[["IsRemodeled"]] <- as.integer(
    na_to_zero(df[["YearRemodAdd"]]) != na_to_zero(df[["YearBuilt"]])
  )

  df
}

# ── Drop High-Missing Columns ─────────────────────────────────────────────────

# Remove sparse columns plus the always-dropped base columns.
#
# Args:
#   df:        data frame to prune.
#   threshold: percentage (0-100); a column is dropped when its share of NA
#              values is strictly greater than this.
#
# Returns:
#   `df` without the columns whose NA percentage exceeds `threshold` and
#   without any column listed in COLUMNS_TO_DROP_BASE that is present.
drop_high_missing <- function(df, threshold = 50) {
  # Percentage of NA entries per column
  na_share <- 100 * colMeans(is.na(df))
  too_sparse <- names(na_share[na_share > threshold])

  # Merge with the fixed drop list, then keep only names that exist in df
  doomed <- intersect(union(too_sparse, COLUMNS_TO_DROP_BASE), names(df))

  select(df, -all_of(doomed))
}

# ── Skewness Correction ───────────────────────────────────────────────────────

# Names of the numeric columns whose skewness magnitude exceeds a threshold.
#
# Skewness is computed on the non-NA values as the third central moment
# (divided by n) over the cube of the sample standard deviation (sd(), which
# divides by n - 1). Columns with fewer than 3 non-NA values, or with a zero or
# non-finite standard deviation, are treated as having skewness 0.
#
# Args:
#   df:        data frame to inspect.
#   threshold: a column is reported when abs(skewness) > threshold.
#
# Returns:
#   Character vector of column names; character(0) when none qualify,
#   including when `df` has no numeric columns.
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  # Get numeric columns only
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]

  skewness_of <- function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    n <- length(x)
    if (n < 3) {
      return(0)
    }
    s <- sd(x)
    # sd() is NaN/NA when x contains non-finite values; treat as not skewed
    # instead of letting `if` fail on a missing condition.
    if (!is.finite(s) || s == 0) {
      return(0)
    }
    (sum((x - mean(x))^3) / n) / s^3
  }

  # vapply() always yields a numeric vector, even for zero numeric columns,
  # where sapply() would return list() and break abs().
  skew_values <- vapply(num_cols, skewness_of, numeric(1))

  # which() discards NA comparisons so no NA names reach the caller.
  num_cols[which(abs(skew_values) > threshold)]
}

# Replace each listed column with log1p of its values clamped at zero.
#
# Args:
#   df:          data frame to transform.
#   skewed_cols: character vector of column names; names that are not columns
#                of `df` are skipped.
#
# Returns:
#   `df` with every listed, existing column replaced by log1p(pmax(x, 0)).
apply_log_transform <- function(df, skewed_cols) {
  # Keep only the requested names that are real columns of df
  present <- skewed_cols[skewed_cols %in% names(df)]

  for (nm in present) {
    # Negative values are raised to 0 before the transform
    clamped <- pmax(df[[nm]], 0)
    df[[nm]] <- log1p(clamped)
  }

  df
}

# ── Fill Missing Values ───────────────────────────────────────────────────────

# Impute missing values column by column.
#
# Character columns get the literal "Missing" in place of NA; numeric columns
# get the column median (computed with NA values removed).
#
# Args:
#   df: data frame to impute.
#
# Returns:
#   `df` with NA entries of character and numeric columns replaced.
fill_missing <- function(df) {
  text_cols <- names(df)[vapply(df, is.character, logical(1))]
  number_cols <- names(df)[vapply(df, is.numeric, logical(1))]

  # Categorical: fill with "Missing"
  for (nm in text_cols) {
    values <- df[[nm]]
    values[is.na(values)] <- "Missing"
    df[[nm]] <- values
  }

  # Numeric: fill with median
  for (nm in number_cols) {
    values <- df[[nm]]
    centre <- median(values, na.rm = TRUE)
    values[is.na(values)] <- centre
    df[[nm]] <- values
  }

  df
}

# ── Full Preprocessing Pipeline ───────────────────────────────────────────────
# This is the R equivalent of preprocess_combined() in preprocessing.py

# Run every preprocessing step in order, printing progress to stdout.
#
# Steps: drop_high_missing() -> engineer_features() -> get_skewed_cols() ->
# apply_log_transform() -> fill_missing().
#
# Args:
#   df: raw data frame.
#
# Returns:
#   A list with
#     data        - the processed data frame
#     skewed_cols - names of the columns that were log-transformed
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  pruned <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  enriched <- engineer_features(pruned)

  cat("Step 3: Identifying skewed columns...\n")
  skewed <- get_skewed_cols(enriched)
  # Only the first five names are shown in the progress line
  preview <- paste(head(skewed, 5), collapse = ", ")
  cat(sprintf("  Found %d skewed columns: %s\n", length(skewed), preview))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  transformed <- apply_log_transform(enriched, skewed)

  cat("Step 5: Filling missing values...\n")
  filled <- fill_missing(transformed)

  cat("Done. Final shape:", nrow(filled), "rows x", ncol(filled), "cols\n")
  list(data = filled, skewed_cols = skewed)
}

# ── Save Raw Medians and Modes (for Single House prediction defaults) ──────────

# Per-column defaults: medians of numeric columns, modes of character columns.
#
# Args:
#   df: data frame to summarise.
#
# Returns:
#   A list with
#     numeric_medians - named numeric vector, one median (NA removed) per
#                       numeric column
#     cat_modes       - named character vector, the most frequent non-NA value
#                       per character column (first in table() order on ties);
#                       NA_character_ when a column has no non-NA values
#   Both vectors are zero-length when `df` has no column of that kind.
get_raw_stats <- function(df) {
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  cat_cols <- names(df)[vapply(df, is.character, logical(1))]

  # Numeric medians. vapply() keeps the result a numeric vector even when
  # there are no numeric columns (sapply() would return list()).
  numeric_medians <- vapply(
    num_cols,
    function(col) as.numeric(median(df[[col]], na.rm = TRUE)),
    numeric(1)
  )

  # Categorical modes
  cat_modes <- vapply(
    cat_cols,
    function(col) {
      counts <- table(df[[col]])
      # table() drops NA, so an all-NA column has no levels to pick from
      if (length(counts) == 0) {
        return(NA_character_)
      }
      names(counts)[which.max(counts)]
    },
    character(1)
  )

  list(numeric_medians = numeric_medians, cat_modes = cat_modes)
}

# ── Main: Run as standalone script ────────────────────────────────────────────

# Command-line entry point:
#   Rscript preprocessing.R <input.csv> <output.csv>
#
# Reads the input CSV, removes SalePrice if present, runs
# preprocess_combined(), writes the processed data to <output.csv> and the
# names of the log-transformed columns to a sibling "*_skewed_cols.txt" file.
#
# sys.nframe() == 0L restricts this block to direct execution: when the file
# is source()d from another non-interactive script the functions above are
# defined but this block (and its quit()) does not run.
if (!interactive() && sys.nframe() == 0L) {
  args <- commandArgs(trailingOnly = TRUE)

  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }

  input_path  <- args[1]
  output_path <- args[2]

  cat(sprintf("Reading data from: %s\n", input_path))
  # check.names = FALSE keeps headers such as `1stFlrSF` and `2ndFlrSF` as
  # written; the default would rename them to X1stFlrSF / X2ndFlrSF and
  # engineer_features() would not find them.
  df <- read.csv(input_path, stringsAsFactors = FALSE, check.names = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data)
  if ("SalePrice" %in% names(df)) {
    # NOTE(review): `target` is kept in memory only; it is not written to any
    # output file by this script.
    target <- df[["SalePrice"]]
    df <- df %>% select(-SalePrice)
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result       <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols  <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Save skewed columns list. Strip whatever extension the output has and
  # append the suffix, so the list can never land on output_path itself
  # (a plain sub("\\.csv$", ...) is a no-op for non-".csv" paths and would
  # overwrite the processed data just written).
  skewed_path <- paste0(
    tools::file_path_sans_ext(output_path),
    "_skewed_cols.txt"
  )
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}