"""
build_features.py – Load raw Quick Draw .npy files, split into train/test,
and extract HOG features for the classical ML pipeline.
Saved artefacts (under data/processed/):
X_train_raw.npy, y_train.npy -> pixel arrays for deep model
X_test_raw.npy, y_test.npy -> pixel arrays for evaluation
X_train_hog.npy -> HOG feature matrix for Random Forest
X_test_hog.npy
Usage:
python scripts/build_features.py
"""
import sys
from pathlib import Path
import numpy as np
from skimage.feature import hog
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import (
CLASSES,
RAW_DIR,
PROCESSED_DIR,
TRAIN_SAMPLES_PER_CLASS,
TEST_SAMPLES_PER_CLASS,
IMG_SIZE,
HOG_ORIENTATIONS,
HOG_PIXELS_PER_CELL,
HOG_CELLS_PER_BLOCK,
)
def load_class_data(cls: str, n_train: int, n_test: int) -> tuple[np.ndarray, np.ndarray]:
    """Load and slice pixel data for a single class.

    The Quick Draw .npy files contain rows of 784-element uint8 vectors
    (28×28 flattened, pixel values 0–255, white stroke on black background).

    Args:
        cls: Class name.
        n_train: Number of training samples to keep.
        n_test: Number of test samples to keep.

    Returns:
        Tuple of (train_pixels, test_pixels) each shaped (n, 784).

    Raises:
        FileNotFoundError: If the raw .npy file for *cls* is missing.
        ValueError: If the file holds fewer than n_train + n_test rows.
    """
    path = RAW_DIR / f"{cls}.npy"
    if not path.exists():
        raise FileNotFoundError(
            f"Missing {path}. Run scripts/make_dataset.py first."
        )
    data = np.load(path, mmap_mode="r")  # memory mapped for large files
    n_total = n_train + n_test
    # Fail loudly instead of silently returning undersized splits when the
    # raw file does not contain enough samples.
    if len(data) < n_total:
        raise ValueError(
            f"{path} has only {len(data)} samples; "
            f"need {n_total} ({n_train} train + {n_test} test)."
        )
    # Shuffle deterministically so splits are reproducible
    rng = np.random.default_rng(seed=42)
    indices = rng.permutation(len(data))[:n_total]
    data = data[indices]  # fancy indexing materializes a copy from the mmap
    return data[:n_train], data[n_train:n_total]
def extract_hog_features(pixel_matrix: np.ndarray) -> np.ndarray:
    """Compute HOG descriptors for a batch of flat pixel vectors.

    Args:
        pixel_matrix: Array of shape (N, 784), dtype uint8.

    Returns:
        Feature matrix of shape (N, D) where D is the HOG descriptor length
        determined by the HOG_* settings in config.
    """
    # Comprehension instead of append-in-loop (ruff PERF401); each flat row
    # is reshaped to the square image skimage.feature.hog expects.
    descriptors = [
        hog(
            row.reshape(IMG_SIZE, IMG_SIZE),
            orientations=HOG_ORIENTATIONS,
            pixels_per_cell=HOG_PIXELS_PER_CELL,
            cells_per_block=HOG_CELLS_PER_BLOCK,
            visualize=False,
            channel_axis=None,  # grayscale input, no channel dimension
        )
        for row in pixel_matrix
    ]
    # float32 halves memory versus the default float64 descriptors.
    return np.asarray(descriptors, dtype=np.float32)
def build_splits() -> None:
    """Assemble train/test raw arrays and labels from all classes."""
    raw_train: list[np.ndarray] = []
    raw_test: list[np.ndarray] = []
    lbl_train: list[np.ndarray] = []
    lbl_test: list[np.ndarray] = []
    print("Loading raw data …")
    for idx, name in enumerate(CLASSES):
        print(f" {name} ({idx + 1}/{len(CLASSES)})")
        train_part, test_part = load_class_data(
            name, TRAIN_SAMPLES_PER_CLASS, TEST_SAMPLES_PER_CLASS
        )
        raw_train.append(train_part)
        raw_test.append(test_part)
        # Integer class label per row, aligned with CLASSES ordering.
        lbl_train.append(np.full(len(train_part), idx, dtype=np.int64))
        lbl_test.append(np.full(len(test_part), idx, dtype=np.int64))
    X_train_raw = np.concatenate(raw_train)
    X_test_raw = np.concatenate(raw_test)
    y_train = np.concatenate(lbl_train)
    y_test = np.concatenate(lbl_test)
    # Shuffle training set (fixed seed so reruns produce identical order).
    shuffler = np.random.default_rng(seed=0)
    order = shuffler.permutation(len(X_train_raw))
    X_train_raw, y_train = X_train_raw[order], y_train[order]
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    np.save(PROCESSED_DIR / "X_train_raw.npy", X_train_raw)
    np.save(PROCESSED_DIR / "X_test_raw.npy", X_test_raw)
    np.save(PROCESSED_DIR / "y_train.npy", y_train)
    np.save(PROCESSED_DIR / "y_test.npy", y_test)
    print(f"\nSaved raw splits → train {X_train_raw.shape}, test {X_test_raw.shape}")
def build_hog_features() -> None:
    """Extract HOG features from saved raw arrays."""
    # Read back the pixel splits written by build_splits().
    train_pixels = np.load(PROCESSED_DIR / "X_train_raw.npy")
    test_pixels = np.load(PROCESSED_DIR / "X_test_raw.npy")
    print("Extracting HOG features (train) …")
    train_feats = extract_hog_features(train_pixels)
    print("Extracting HOG features (test) …")
    test_feats = extract_hog_features(test_pixels)
    np.save(PROCESSED_DIR / "X_train_hog.npy", train_feats)
    np.save(PROCESSED_DIR / "X_test_hog.npy", test_feats)
    print(f"Saved HOG features → train {train_feats.shape}, test {test_feats.shape}")
def build_all() -> None:
    """Run the complete feature building pipeline."""
    # Order matters: HOG extraction reads the arrays build_splits() saves.
    for step in (build_splits, build_hog_features):
        step()
    print("\nFeature pipeline complete.")
# Script entry point: run the full pipeline when executed directly.
if __name__ == "__main__":
    build_all()