File size: 6,606 Bytes
97b9bf6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# =============================================================================
# preprocessing.R
# Feature engineering and data preprocessing pipeline
# R equivalent of preprocessing.py
# =============================================================================

library(dplyr)
library(tidyr)

# ── Constants ─────────────────────────────────────────────────────────────────

# Columns that drop_high_missing() removes unconditionally (when present), in
# addition to any column whose share of missing values exceeds its threshold.
COLUMNS_TO_DROP_BASE <- c("MoSold", "YrSold", "Id")
# Default cut-off used by get_skewed_cols(): a numeric column is reported as
# skewed when the absolute value of its skewness is strictly above this value.
SKEW_THRESHOLD <- 0.75

# ── Feature Engineering ───────────────────────────────────────────────────────

# Add the engineered feature columns to a housing data frame.
#
# df : data frame containing TotalBsmtSF, 1stFlrSF, 2ndFlrSF, FullBath,
#      HalfBath, BsmtFullBath, BsmtHalfBath, PoolArea, GarageArea, YearBuilt
#      and YearRemodAdd. The two floor-area columns are also accepted under
#      the names read.csv() produces by default (X1stFlrSF, X2ndFlrSF).
#
# Returns df with these columns added (or overwritten if already present):
#   TotalSF     basement + 1st floor + 2nd floor area, NA counted as 0
#   TotalBath   full baths + 0.5 * half baths (above ground and basement),
#               NA counted as 0
#   HasPool, HasGarage, HasBsmt
#               1L when the corresponding area is known and > 0, else 0L
#   IsRemodeled 1L when YearRemodAdd differs from YearBuilt (NA counted as 0)
#
# Stops with an error naming the first required column that is absent.
engineer_features <- function(df) {
  # Fetch a column, trying each alias in order.
  fetch <- function(...) {
    aliases <- c(...)
    found <- aliases[aliases %in% names(df)]
    if (length(found) == 0L) {
      stop("engineer_features(): required column '", aliases[1],
           "' not found", call. = FALSE)
    }
    df[[found[1]]]
  }

  # NA -> 0. Assigning 0L keeps integer columns integer and doubles double.
  na_to_zero <- function(x) {
    x[is.na(x)] <- 0L
    x
  }

  # An area of 0 means "absent", so a flag needs a known, positive area.
  # (Testing only !is.na() would mark zero-area garages/basements as present.)
  has_area <- function(x) as.integer(!is.na(x) & x > 0)

  # Read every input before writing anything back to df.
  bsmt_sf      <- fetch("TotalBsmtSF")
  first_flr    <- fetch("1stFlrSF", "X1stFlrSF")
  second_flr   <- fetch("2ndFlrSF", "X2ndFlrSF")
  full_bath    <- fetch("FullBath")
  half_bath    <- fetch("HalfBath")
  bsmt_full    <- fetch("BsmtFullBath")
  bsmt_half    <- fetch("BsmtHalfBath")
  pool_area    <- fetch("PoolArea")
  garage_area  <- fetch("GarageArea")
  year_built   <- fetch("YearBuilt")
  year_remod   <- fetch("YearRemodAdd")

  # Total square footage: basement + 1st floor + 2nd floor
  df$TotalSF <- na_to_zero(bsmt_sf) +
    na_to_zero(first_flr) +
    na_to_zero(second_flr)

  # Total bathrooms (half baths count as 0.5)
  df$TotalBath <- na_to_zero(full_bath) +
    0.5 * na_to_zero(half_bath) +
    na_to_zero(bsmt_full) +
    0.5 * na_to_zero(bsmt_half)

  # Binary flags
  df$HasPool     <- has_area(pool_area)
  df$HasGarage   <- has_area(garage_area)
  df$HasBsmt     <- has_area(bsmt_sf)
  df$IsRemodeled <- as.integer(na_to_zero(year_remod) != na_to_zero(year_built))

  df
}

# ── Drop High-Missing Columns ─────────────────────────────────────────────────

# Remove sparsely populated columns together with the always-dropped ones.
#
# df        : data frame to prune.
# threshold : percentage on a 0-100 scale; a column is removed when its share
#             of NA values is strictly greater than this.
#
# Returns df without those columns and without any COLUMNS_TO_DROP_BASE column
# that is present in df.
drop_high_missing <- function(df, threshold = 50) {
  na_share <- 100 * colMeans(is.na(df))
  too_sparse <- names(na_share[na_share > threshold])

  # Only names that actually exist in df may be handed to all_of().
  unwanted <- intersect(union(too_sparse, COLUMNS_TO_DROP_BASE), names(df))

  select(df, -all_of(unwanted))
}

# ── Skewness Correction ───────────────────────────────────────────────────────

# Names of the numeric columns whose skewness exceeds a threshold.
#
# The skewness of a column is computed on its non-NA values as the mean cubed
# deviation from the mean (denominator n) divided by the cube of sd()
# (denominator n - 1). A column scores 0, and is therefore never reported,
# when it has fewer than 3 non-NA values, zero spread, or an undefined spread
# (for example because it contains infinite values).
#
# df        : data frame.
# threshold : a column is reported when abs(skewness) is strictly greater
#             than this value. Defaults to SKEW_THRESHOLD.
#
# Returns a character vector of column names in column order; character(0)
# when nothing qualifies, including when df has no numeric columns.
get_skewed_cols <- function(df, threshold = SKEW_THRESHOLD) {
  # Numeric columns only
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]

  column_skewness <- function(col) {
    x <- df[[col]]
    x <- x[!is.na(x)]
    n <- length(x)
    if (n < 3) return(0)
    s <- sd(x)
    # is.na() guards the NaN that sd() yields for infinite values; without it
    # the comparison below would abort with "missing value where TRUE/FALSE".
    if (is.na(s) || s == 0) return(0)
    (sum((x - mean(x))^3) / n) / s^3
  }

  # vapply() rather than sapply(): with zero numeric columns sapply() returns
  # list(), on which abs() fails.
  skew_values <- vapply(num_cols, column_skewness, numeric(1))

  # which() drops any NA comparison instead of yielding an NA column name.
  num_cols[which(abs(skew_values) > threshold)]
}

# Apply log1p() to the listed columns of a data frame.
#
# df          : data frame.
# skewed_cols : character vector of column names; names that are not columns
#               of df are skipped.
#
# Values below 0 are raised to 0 before the transform, so they map to 0.
# NA values stay NA. Returns the modified data frame.
apply_log_transform <- function(df, skewed_cols) {
  present <- skewed_cols[skewed_cols %in% names(df)]
  for (nm in present) {
    clipped <- pmax(df[[nm]], 0)
    df[[nm]] <- log1p(clipped)
  }
  df
}

# ── Fill Missing Values ───────────────────────────────────────────────────────

# Replace NA values column by column.
#
# Character columns get the literal "Missing"; numeric columns get the median
# of their non-NA values. Columns of any other type are left untouched.
#
# df : data frame.
# Returns the data frame with the replacements applied.
fill_missing <- function(df) {
  for (nm in names(df)) {
    values <- df[[nm]]
    gaps <- is.na(values)

    if (is.character(values)) {
      values[gaps] <- "Missing"
    } else if (is.numeric(values)) {
      values[gaps] <- median(values, na.rm = TRUE)
    } else {
      next
    }

    df[[nm]] <- values
  }
  df
}

# ── Full Preprocessing Pipeline ───────────────────────────────────────────────
# This is the R equivalent of preprocess_combined() in preprocessing.py

# Run the five preprocessing steps in order, printing progress to stdout:
# drop columns, engineer features, detect skewed columns, log1p-transform
# them, then fill missing values.
#
# df : raw data frame.
# Returns a list with
#   data        the processed data frame
#   skewed_cols the column names that were log1p-transformed
preprocess_combined <- function(df) {
  cat("Step 1: Dropping high-missing and flagged columns...\n")
  pruned <- drop_high_missing(df)

  cat("Step 2: Engineering new features...\n")
  enriched <- engineer_features(pruned)

  cat("Step 3: Identifying skewed columns...\n")
  skewed_cols <- get_skewed_cols(enriched)
  # Only the first five names are shown in the progress line.
  preview <- paste(head(skewed_cols, 5), collapse = ", ")
  cat(sprintf("  Found %d skewed columns: %s\n", length(skewed_cols), preview))

  cat("Step 4: Applying log1p transform to skewed columns...\n")
  transformed <- apply_log_transform(enriched, skewed_cols)

  cat("Step 5: Filling missing values...\n")
  completed <- fill_missing(transformed)

  cat("Done. Final shape:", nrow(completed), "rows x", ncol(completed), "cols\n")
  list(data = completed, skewed_cols = skewed_cols)
}

# ── Raw Medians and Modes (defaults for single-house prediction) ──────────────

# Per-column summary values of a data frame.
#
# df : data frame.
#
# Returns a list with
#   numeric_medians  named double vector: median of the non-NA values of each
#                    numeric column (NA when a column has none)
#   cat_modes        named character vector: most frequent non-NA value of
#                    each character column; ties go to the value that sorts
#                    first; NA_character_ when a column has no non-NA value
# Both vectors are empty, but still typed, when df has no matching columns.
get_raw_stats <- function(df) {
  num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  cat_cols <- names(df)[vapply(df, is.character, logical(1))]

  # vapply() fixes the result type; sapply() would return list() for zero
  # columns and a list when any column yields no mode.
  numeric_medians <- vapply(
    num_cols,
    # as.numeric(): median() of an integer column can itself be an integer.
    function(col) as.numeric(median(df[[col]], na.rm = TRUE)),
    numeric(1)
  )

  cat_modes <- vapply(
    cat_cols,
    function(col) {
      counts <- table(df[[col]])  # table() ignores NA by default
      if (length(counts) == 0L) return(NA_character_)
      names(counts)[which.max(counts)]
    },
    character(1)
  )

  list(numeric_medians = numeric_medians, cat_modes = cat_modes)
}

# ── Main: Run as standalone script ────────────────────────────────────────────

# Command-line entry point:
#   Rscript preprocessing.R <input.csv> <output.csv>
# Reads the input CSV, sets SalePrice aside if present, runs
# preprocess_combined(), writes the processed table to <output.csv>, and writes
# the list of log-transformed column names to a companion text file.
if (!interactive()) {
  args <- commandArgs(trailingOnly = TRUE)

  if (length(args) < 2) {
    cat("Usage: Rscript preprocessing.R <input.csv> <output.csv>\n")
    cat("Example: Rscript preprocessing.R data/train.csv data/train_processed.csv\n")
    quit(status = 1)
  }

  input_path  <- args[1]
  output_path <- args[2]

  cat(sprintf("Reading data from: %s\n", input_path))
  # check.names = FALSE keeps headers such as "1stFlrSF" and "2ndFlrSF" as
  # written. The default would rename them to "X1stFlrSF"/"X2ndFlrSF", which
  # engineer_features() does not look for.
  df <- read.csv(input_path, stringsAsFactors = FALSE, check.names = FALSE)
  cat(sprintf("Loaded %d rows x %d columns\n", nrow(df), ncol(df)))

  # Remove target if present (training data)
  if ("SalePrice" %in% names(df)) {
    target <- df$SalePrice
    df <- df %>% select(-SalePrice)
    cat("Target variable (SalePrice) stored separately.\n")
  }

  # Run full preprocessing
  result       <- preprocess_combined(df)
  df_processed <- result$data
  skewed_cols  <- result$skewed_cols

  # Save processed data
  write.csv(df_processed, output_path, row.names = FALSE)
  cat(sprintf("Processed data saved to: %s\n", output_path))

  # Save skewed columns list next to the output file. When the output path
  # does not end in ".csv" the suffix is appended instead of substituted;
  # otherwise the path would be unchanged and this write would overwrite the
  # processed data just saved.
  skewed_suffix <- "_skewed_cols.txt"
  if (grepl("\\.csv$", output_path)) {
    skewed_path <- sub("\\.csv$", skewed_suffix, output_path)
  } else {
    skewed_path <- paste0(output_path, skewed_suffix)
  }
  writeLines(skewed_cols, skewed_path)
  cat(sprintf("Skewed columns list saved to: %s\n", skewed_path))
}