Spaces:
Sleeping
Sleeping
File size: 6,606 Bytes
# =============================================================================
# preprocessing.R
# Feature engineering and data preprocessing pipeline
# R equivalent of preprocessing.py
# =============================================================================
library(dplyr)
library(tidyr)
# ── Constants ─────────────────────────────────────────────────────────────────
# Columns that drop_high_missing() removes unconditionally (when present), in
# addition to any column whose missing-value percentage exceeds its threshold.
COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")
# Default cutoff for get_skewed_cols(): a numeric column is reported as skewed
# when the absolute value of its skewness is strictly greater than this value.
SKEW_THRESHOLD <- 0.75
# ── Feature Engineering ───────────────────────────────────────────────────────
# Add engineered features to a housing data frame.
#
# Appends (or overwrites, if already present) these columns:
#   TotalSF     - TotalBsmtSF + 1stFlrSF + 2ndFlrSF, with NA treated as 0
#   TotalBath   - FullBath + BsmtFullBath + 0.5 * (HalfBath + BsmtHalfBath),
#                 with NA treated as 0
#   HasPool     - 1L when PoolArea is known and > 0, else 0L
#   HasGarage   - 1L when GarageArea is known and > 0, else 0L
#   HasBsmt     - 1L when TotalBsmtSF is known and > 0, else 0L
#   IsRemodeled - 1L when YearRemodAdd differs from YearBuilt (NA treated as 0)
#
# Args:
#   df: data frame containing every column listed in `required` below.
#
# Returns:
#   `df` with the six feature columns added.
#
# Raises:
#   An error naming the missing column(s) if any required input is absent.
engineer_features <- function(df) {
  required <- c(
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
    "PoolArea", "GarageArea", "YearRemodAdd", "YearBuilt"
  )
  missing_cols <- setdiff(required, names(df))
  if (length(missing_cols) > 0) {
    stop(
      "engineer_features(): missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # 0L keeps integer columns integer and leaves double columns double.
  na_to_zero <- function(x) {
    x[is.na(x)] <- 0L
    x
  }
  # A feature is "present" only when its area is known AND strictly positive.
  # Testing !is.na() alone would flag an area of 0 as present.
  has_positive <- function(x) as.integer(!is.na(x) & x > 0)

  # Total square footage: basement + 1st floor + 2nd floor
  df[["TotalSF"]] <- na_to_zero(df[["TotalBsmtSF"]]) +
    na_to_zero(df[["1stFlrSF"]]) +
    na_to_zero(df[["2ndFlrSF"]])

  # Total bathrooms (half baths count as 0.5)
  df[["TotalBath"]] <- na_to_zero(df[["FullBath"]]) +
    0.5 * na_to_zero(df[["HalfBath"]]) +
    na_to_zero(df[["BsmtFullBath"]]) +
    0.5 * na_to_zero(df[["BsmtHalfBath"]])

  # Binary flags
  df[["HasPool"]] <- has_positive(df[["PoolArea"]])
  df[["HasGarage"]] <- has_positive(df[["GarageArea"]])
  df[["HasBsmt"]] <- has_positive(df[["TotalBsmtSF"]])
  df[["IsRemodeled"]] <- as.integer(
    na_to_zero(df[["YearRemodAdd"]]) != na_to_zero(df[["YearBuilt"]])
  )

  df
}
# ── Drop High-Missing Columns ─────────────────────────────────────────────────
# Remove sparse columns and the always-dropped base columns.
#
# Args:
#   df:        data frame to prune.
#   threshold: percentage (0-100); columns whose share of NA values is
#              strictly greater than this are removed.
#
# Returns:
#   `df` without the sparse columns and without any COLUMNS_TO_DROP_BASE
#   columns that were present.
drop_high_missing <- function(df, threshold = 50) {
  pct_na <- 100 * colMeans(is.na(df))
  too_sparse <- names(pct_na[pct_na > threshold])
  # Restrict to columns that actually exist so all_of() never fails.
  doomed <- intersect(union(too_sparse, COLUMNS_TO_DROP_BASE), names(df))
  select(df, -all_of(doomed))
}
# ── Skewness Correction ───────────────────────────────────────────────────────
# Names of numeric columns whose absolute skewness exceeds `threshold`.
#
# Skewness is computed on the non-NA values of each column as
#   (sum((x - mean(x))^3) / n) / sd(x)^3
# i.e. the population third central moment divided by the cube of the
# sample standard deviation. Columns with fewer than 3 non-NA values, zero
# spread, or a non-finite standard deviation are treated as unskewed (0).
#
# Args:
#   df:        data frame to inspect; non-numeric columns are ignored.
#   threshold: cutoff compared (strictly) against abs(skewness).
#
# Returns:
#   Character vector of column names, in column order; character(0) when
#   nothing qualifies (including when `df` has no numeric columns).
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]

  skewness_of <- function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    n <- length(x)
    if (n < 3) {
      return(0)
    }
    s <- sd(x)
    # sd() is NaN when x contains Inf; guard before the == comparison.
    if (!is.finite(s) || s == 0) {
      return(0)
    }
    (sum((x - mean(x))^3) / n) / s^3
  }

  # vapply keeps the result numeric even when num_cols is empty, where
  # sapply would return list() and break abs().
  skew_values <- vapply(num_cols, skewness_of, numeric(1))
  as.character(num_cols[which(abs(skew_values) > threshold)])
}
# Replace each listed column with log1p of its values, after clamping
# negatives to zero.
#
# Args:
#   df:          data frame to transform.
#   skewed_cols: column names to transform; names absent from `df` are skipped.
#
# Returns:
#   `df` with the listed columns transformed; other columns are untouched.
apply_log_transform <- function(df, skewed_cols) {
  present <- skewed_cols[skewed_cols %in% names(df)]
  for (nm in present) {
    clamped <- pmax(df[[nm]], 0)
    df[[nm]] <- log1p(clamped)
  }
  df
}
# ── Fill Missing Values ───────────────────────────────────────────────────────
# Impute missing values column by column.
#
# Character columns get the literal "Missing" in place of NA; numeric
# columns get the median of their non-NA values.
#
# Args:
#   df: data frame to impute.
#
# Returns:
#   `df` with NA values replaced in character and numeric columns.
fill_missing <- function(df) {
  # Text columns: sentinel label
  text_cols <- names(select(df, where(is.character)))
  for (nm in text_cols) {
    gaps <- is.na(df[[nm]])
    df[[nm]][gaps] <- "Missing"
  }

  # Numeric columns: per-column median
  numeric_cols <- names(select(df, where(is.numeric)))
  for (nm in numeric_cols) {
    fill_value <- median(df[[nm]], na.rm = TRUE)
    gaps <- is.na(df[[nm]])
    df[[nm]][gaps] <- fill_value
  }

  df
}
# ── Full Preprocessing Pipeline ───────────────────────────────────────────────
# This is the R equivalent of preprocess_combined() in preprocessing.py
# Run the five preprocessing stages in order, printing progress to stdout:
# column dropping, feature engineering, skew detection, log1p transform,
# and missing-value imputation.
#
# Args:
#   df: raw feature data frame.
#
# Returns:
#   list(data = <processed data frame>,
#        skewed_cols = <names of the columns that were log-transformed>)
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  pruned <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  enriched <- engineer_features(pruned)

  cat("Step 3: Identifying skewed columns...\n")
  skewed_cols <- get_skewed_cols(enriched)
  # Only the first five names are shown to keep the log line short.
  preview <- paste(head(skewed_cols, 5), collapse = ", ")
  cat(sprintf(" Found %d skewed columns: %s\n", length(skewed_cols), preview))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  transformed <- apply_log_transform(enriched, skewed_cols)

  cat("Step 5: Filling missing values...\n")
  completed <- fill_missing(transformed)

  cat("Done. Final shape:", nrow(completed), "rows x", ncol(completed), "cols\n")
  list(data = completed, skewed_cols = skewed_cols)
}
# ── Raw Medians and Modes (defaults for Single House prediction) ──────────────
# Compute per-column summary values from a data frame.
#
# Args:
#   df: data frame to summarise.
#
# Returns:
#   list(
#     numeric_medians = named numeric vector, median of the non-NA values of
#                       each numeric column (NA when a column is all NA),
#     cat_modes       = named character vector, most frequent non-NA value of
#                       each character column (first in table() order on
#                       ties; NA when a column has no non-NA values)
#   )
#   Both vectors have length 0 when there are no columns of that kind.
get_raw_stats <- function(df) {
  # Numeric medians
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  numeric_medians <- vapply(
    num_cols,
    function(col) as.numeric(median(df[[col]], na.rm = TRUE)),
    numeric(1)
  )

  # Categorical modes
  cat_cols <- names(df)[vapply(df, is.character, logical(1))]
  cat_modes <- vapply(
    cat_cols,
    function(col) {
      counts <- table(df[[col]])
      # table() drops NA, so an all-NA column has no levels to pick from.
      if (length(counts) == 0L) {
        return(NA_character_)
      }
      names(counts)[which.max(counts)]
    },
    character(1)
  )

  list(numeric_medians = numeric_medians, cat_modes = cat_modes)
}
# ── Main: Run as standalone script ────────────────────────────────────────────
# Command-line entry point.
#
# Runs only when this file is executed directly (e.g. `Rscript preprocessing.R
# in.csv out.csv`). sys.nframe() is 0 at the top level of a script but > 0
# inside source(), so sourcing this file from another non-interactive script
# loads the functions without triggering the argument check and quit().
if (!interactive() && sys.nframe() == 0L) {
  args <- commandArgs(trailingOnly = TRUE)
  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }
  input_path <- args[1]
  output_path <- args[2]

  if (!file.exists(input_path)) {
    cat(sprintf("Input file not found: %s\n", input_path))
    quit(status = 1)
  }

  cat(sprintf("Reading data from: %s\n", input_path))
  # check.names = FALSE keeps headers such as `1stFlrSF` / `2ndFlrSF` intact;
  # the default would rename them to X1stFlrSF / X2ndFlrSF, which
  # engineer_features() cannot find.
  df <- read.csv(input_path, stringsAsFactors = FALSE, check.names = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data). It is held in `target` only;
  # nothing below writes it to disk.
  if ("SalePrice" %in% names(df)) {
    target <- df$SalePrice
    df <- df %>% select(-SalePrice)
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Save skewed columns list next to the output file. Stripping whatever
  # extension is present (rather than substituting only a literal ".csv")
  # guarantees this path differs from output_path, so the processed data is
  # never overwritten.
  skewed_path <- paste0(
    tools::file_path_sans_ext(output_path),
    "_skewed_cols.txt"
  )
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}