Spaces:

ESCP
/

RX12-Housing-Competition

Sleeping

App Files Files Community

RX12-Housing-Competition / preprocessing.r

GBDB02

Upload 8 files

97b9bf6 verified about 1 month ago

raw

history blame contribute delete

6.61 kB

	# =============================================================================
	# preprocessing.R
	# Feature engineering and data preprocessing pipeline
	# R equivalent of preprocessing.py
	# =============================================================================

	library(dplyr)
	library(tidyr)

	# ── Constants ─────────────────────────────────────────────────────────────────

	# Columns that drop_high_missing() removes whenever they are present,
	# in addition to any column exceeding its missing-value threshold.
	COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")
	# Default cutoff for get_skewed_cols(): a numeric column is reported as
	# skewed when the absolute value of its skewness exceeds this number.
	SKEW_THRESHOLD <- 0.75

	# ── Feature Engineering ───────────────────────────────────────────────────────

	# Add derived features to a housing data frame.
	#
	# Args:
	#   df: data frame containing TotalBsmtSF, the first- and second-floor area
	#       columns, FullBath, HalfBath, BsmtFullBath, BsmtHalfBath, PoolArea,
	#       GarageArea, YearRemodAdd and YearBuilt. The floor-area columns may
	#       be named `1stFlrSF`/`2ndFlrSF` or, as produced by read.csv() with
	#       its default check.names = TRUE, `X1stFlrSF`/`X2ndFlrSF`.
	#
	# Returns:
	#   `df` with six added (or overwritten) columns:
	#     TotalSF     basement + first floor + second floor area (NA counts as 0)
	#     TotalBath   full baths + 0.5 * half baths, above ground and basement
	#     HasPool     1L when PoolArea is recorded and > 0, else 0L
	#     HasGarage   1L when GarageArea is recorded and > 0, else 0L
	#     HasBsmt     1L when TotalBsmtSF is recorded and > 0, else 0L
	#     IsRemodeled 1L when YearRemodAdd differs from YearBuilt, else 0L
	#
	# Stops with an error when a required column is absent.
	engineer_features <- function(df) {
	  # Fetch a column by the first of the candidate names present in `df`.
	  column <- function(...) {
	    candidates <- c(...)
	    found <- candidates[candidates %in% names(df)]
	    if (length(found) == 0L) {
	      stop(
	        "engineer_features(): required column '", candidates[1L],
	        "' not found",
	        call. = FALSE
	      )
	    }
	    df[[found[1L]]]
	  }

	  # Replace NA by zero; 0L keeps integer columns integer.
	  zero_if_na <- function(x) {
	    x[is.na(x)] <- 0L
	    x
	  }

	  # A feature counts as present only when its area is recorded AND positive:
	  # an area of 0 means "none", so a bare !is.na() test would flag it wrongly.
	  is_present <- function(area) as.integer(!is.na(area) & area > 0)

	  # Read every input before writing any output column.
	  bsmt_sf <- column("TotalBsmtSF")
	  first_sf <- column("1stFlrSF", "X1stFlrSF")
	  second_sf <- column("2ndFlrSF", "X2ndFlrSF")
	  full_bath <- column("FullBath")
	  half_bath <- column("HalfBath")
	  bsmt_full_bath <- column("BsmtFullBath")
	  bsmt_half_bath <- column("BsmtHalfBath")
	  pool_area <- column("PoolArea")
	  garage_area <- column("GarageArea")
	  year_remod <- column("YearRemodAdd")
	  year_built <- column("YearBuilt")

	  # Total square footage: basement + 1st floor + 2nd floor
	  df[["TotalSF"]] <- zero_if_na(bsmt_sf) +
	    zero_if_na(first_sf) +
	    zero_if_na(second_sf)

	  # Total bathrooms (half baths count as 0.5)
	  df[["TotalBath"]] <- zero_if_na(full_bath) +
	    0.5 * zero_if_na(half_bath) +
	    zero_if_na(bsmt_full_bath) +
	    0.5 * zero_if_na(bsmt_half_bath)

	  # Binary flags
	  df[["HasPool"]] <- is_present(pool_area)
	  df[["HasGarage"]] <- is_present(garage_area)
	  df[["HasBsmt"]] <- is_present(bsmt_sf)
	  df[["IsRemodeled"]] <- as.integer(
	    zero_if_na(year_remod) != zero_if_na(year_built)
	  )

	  df
	}

	# ── Drop High-Missing Columns ─────────────────────────────────────────────────

	# Remove sparse columns and the always-dropped base columns.
	#
	# Args:
	#   df:        data frame to prune.
	#   threshold: percentage (0-100) of missing values above which a column
	#              is removed.
	#
	# Returns:
	#   `df` without the columns whose NA percentage exceeds `threshold` and
	#   without any column listed in COLUMNS_TO_DROP_BASE. Names that are not
	#   present in `df` are ignored.
	drop_high_missing <- function(df, threshold = 50) {
	  pct_missing <- 100 * colMeans(is.na(df))
	  too_sparse <- names(pct_missing)[pct_missing > threshold]

	  # Only names that actually exist in df are eligible for removal.
	  doomed <- intersect(union(too_sparse, COLUMNS_TO_DROP_BASE), names(df))

	  df[, !(names(df) %in% doomed), drop = FALSE]
	}

	# ── Skewness Correction ───────────────────────────────────────────────────────

	# Identify numeric columns whose distribution is strongly skewed.
	#
	# Skewness is computed per column, after dropping NAs, as the third central
	# moment (divided by n) over the cube of the sample standard deviation
	# (sd(), which divides by n - 1). Columns with fewer than three non-missing
	# values, or with a zero or undefined standard deviation, get skewness 0.
	#
	# Args:
	#   df:        data frame to inspect; non-numeric columns are ignored.
	#   threshold: a column is reported when abs(skewness) > threshold.
	#
	# Returns:
	#   Character vector of skewed column names, in column order;
	#   character(0) when there are none (including when `df` has no numeric
	#   columns).
	get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
	  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]

	  skewness_of <- function(col) {
	    x <- df[[col]]
	    x <- x[!is.na(x)]
	    n <- length(x)
	    if (n < 3L) {
	      return(0)
	    }
	    s <- sd(x)
	    # sd() is NaN when the column holds non-finite values such as Inf.
	    if (is.na(s) || s == 0) {
	      return(0)
	    }
	    (sum((x - mean(x))^3) / n) / s^3
	  }

	  # vapply (unlike sapply) yields numeric(0) for an empty column set, so
	  # abs() below stays well-defined.
	  skew_values <- vapply(num_cols, skewness_of, numeric(1))

	  num_cols[abs(skew_values) > threshold]
	}

	# Apply log1p to the named columns of a data frame.
	#
	# Args:
	#   df:          data frame to transform.
	#   skewed_cols: character vector of column names; names that are not
	#                columns of `df` are skipped.
	#
	# Returns:
	#   `df` where each listed column is replaced by log1p(pmax(column, 0)),
	#   i.e. negative values are clipped to zero before the transform.
	apply_log_transform <- function(df, skewed_cols) {
	  present <- skewed_cols[skewed_cols %in% names(df)]
	  for (nm in present) {
	    clipped <- pmax(df[[nm]], 0)
	    df[[nm]] <- log1p(clipped)
	  }
	  df
	}

	# ── Fill Missing Values ───────────────────────────────────────────────────────

	# Impute missing values column by column.
	#
	# Character columns get the literal string "Missing" in place of NA;
	# numeric columns get the column median (computed ignoring NAs). Columns of
	# any other type are left as they are.
	#
	# Args:
	#   df: data frame to impute.
	#
	# Returns:
	#   `df` with the NAs of its character and numeric columns filled.
	fill_missing <- function(df) {
	  is_text <- vapply(df, is.character, logical(1))
	  is_num <- vapply(df, is.numeric, logical(1))

	  # Categorical: fill with "Missing"
	  for (nm in names(df)[is_text]) {
	    gaps <- is.na(df[[nm]])
	    df[[nm]][gaps] <- "Missing"
	  }

	  # Numeric: fill with median
	  for (nm in names(df)[is_num]) {
	    gaps <- is.na(df[[nm]])
	    df[[nm]][gaps] <- median(df[[nm]], na.rm = TRUE)
	  }

	  df
	}

	# ── Full Preprocessing Pipeline ───────────────────────────────────────────────
	# This is the R equivalent of preprocess_combined() in preprocessing.py

	# Run the whole preprocessing pipeline on one data frame, printing a
	# progress line before each step.
	#
	# Steps, in order: drop_high_missing() -> engineer_features() ->
	# get_skewed_cols() -> apply_log_transform() -> fill_missing().
	#
	# Args:
	#   df: raw feature data frame.
	#
	# Returns:
	#   A list with two elements:
	#     data        the fully processed data frame
	#     skewed_cols names of the columns that were log1p-transformed
	preprocess_combined <- function(df) {
	  cat("Step 1: Dropping high-missing and flagged columns...\n")
	  reduced <- drop_high_missing(df)

	  cat("Step 2: Engineering new features...\n")
	  enriched <- engineer_features(reduced)

	  cat("Step 3: Identifying skewed columns...\n")
	  skewed_cols <- get_skewed_cols(enriched)
	  # Only the first five names are shown to keep the log line short.
	  preview <- paste(head(skewed_cols, 5), collapse = ", ")
	  cat(sprintf(" Found %d skewed columns: %s\n", length(skewed_cols), preview))

	  cat("Step 4: Applying log1p transform to skewed columns...\n")
	  transformed <- apply_log_transform(enriched, skewed_cols)

	  cat("Step 5: Filling missing values...\n")
	  filled <- fill_missing(transformed)

	  cat("Done. Final shape:", nrow(filled), "rows x", ncol(filled), "cols\n")
	  list(data = filled, skewed_cols = skewed_cols)
	}

	# ── Save Raw Medians and Modes (for Single House prediction defaults) ──────────

	# Compute per-column defaults from raw data: the median of every numeric
	# column and the most frequent value of every character column.
	#
	# Args:
	#   df: data frame to summarise.
	#
	# Returns:
	#   A list with two elements:
	#     numeric_medians named numeric vector, one median (NAs ignored) per
	#                     numeric column
	#     cat_modes       named character vector, one mode per character
	#                     column; ties resolve to the first value in table()
	#                     order, and a column with no non-missing value
	#                     yields NA
	#   Both are zero-length vectors when `df` has no column of that type.
	get_raw_stats <- function(df) {
	  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
	  cat_cols <- names(df)[vapply(df, is.character, logical(1))]

	  # Most frequent non-missing value (table() drops NAs by default).
	  mode_of <- function(values) {
	    counts <- table(values)
	    if (length(counts) == 0L) {
	      return(NA_character_)
	    }
	    names(counts)[which.max(counts)]
	  }

	  # vapply keeps the result types fixed; sapply would fall back to a list
	  # for an empty column set or for a column that has no mode.
	  numeric_medians <- vapply(
	    num_cols,
	    function(col) as.numeric(median(df[[col]], na.rm = TRUE)),
	    numeric(1)
	  )
	  cat_modes <- vapply(
	    cat_cols,
	    function(col) mode_of(df[[col]]),
	    character(1)
	  )

	  list(numeric_medians = numeric_medians, cat_modes = cat_modes)
	}

	# ── Main: Run as standalone script ────────────────────────────────────────────

	# Command-line entry point:
	#   Rscript preprocessing.R <input.csv> <output.csv>
	#
	# Reads the input CSV, removes SalePrice when present, runs
	# preprocess_combined(), writes the processed data to <output.csv> and the
	# list of log-transformed columns to a companion "_skewed_cols.txt" file.
	#
	# The sys.nframe() check restricts this to top-level execution: when the
	# file is source()d from another non-interactive process the functions are
	# defined but the CLI (and its quit()) does not run.
	if (!interactive() && sys.nframe() == 0L) {
	  args <- commandArgs(trailingOnly = TRUE)

	  if (length(args) < 2) {
	    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
	    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
	    quit(status = 1)
	  }

	  input_path <- args[1]
	  output_path <- args[2]

	  cat(sprintf("Reading data from: %s\n", input_path))
	  # check.names = FALSE keeps headers such as "1stFlrSF" verbatim; the
	  # default would rewrite them to "X1stFlrSF" and break the backticked
	  # column references used during feature engineering.
	  df <- read.csv(input_path, stringsAsFactors = FALSE, check.names = FALSE)
	  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

	  # Remove target if present (training data)
	  if ("SalePrice" %in% names(df)) {
	    # NOTE(review): the target is only held in memory here; nothing below
	    # writes it to disk, so the processed CSV carries features only.
	    target <- df[["SalePrice"]]
	    df[["SalePrice"]] <- NULL
	    cat("Target variable (SalePrice) stored separately.\n")
	  }

	  # Run full preprocessing
	  result <- preprocess_combined(df)
	  df_processed <- result$data
	  skewed_cols <- result$skewed_cols

	  # Save processed data
	  write.csv(df_processed, output_path, row.names = FALSE)
	  cat(sprintf("Processed data saved to: %s\n", output_path))

	  # Save skewed columns list. When the output path has no ".csv" suffix,
	  # sub() would return it unchanged and the list would overwrite the data
	  # file just written, so the suffix is appended instead.
	  skewed_path <- if (grepl("\\.csv$", output_path)) {
	    sub("\\.csv$", "_skewed_cols.txt", output_path)
	  } else {
	    paste0(output_path, "_skewed_cols.txt")
	  }
	  writeLines(skewed_cols, skewed_path)
	  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
	}