# ScribbleBot_Huggingface/scripts/build_features.py (initial deployment, af61511)
"""
build_features.py – Load raw Quick Draw .npy files, split into train/test,
and extract HOG features for the classical ML pipeline.
Saved artefacts (under data/processed/):
X_train_raw.npy, y_train.npy -> pixel arrays for deep model
X_test_raw.npy, y_test.npy -> pixel arrays for evaluation
X_train_hog.npy -> HOG feature matrix for Random Forest
X_test_hog.npy
Usage:
python scripts/build_features.py
"""
import sys
from pathlib import Path
import numpy as np
from skimage.feature import hog
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import (
CLASSES,
RAW_DIR,
PROCESSED_DIR,
TRAIN_SAMPLES_PER_CLASS,
TEST_SAMPLES_PER_CLASS,
IMG_SIZE,
HOG_ORIENTATIONS,
HOG_PIXELS_PER_CELL,
HOG_CELLS_PER_BLOCK,
)
def load_class_data(cls: str, n_train: int, n_test: int) -> tuple[np.ndarray, np.ndarray]:
    """Load and slice pixel data for a single class.

    The Quick Draw .npy files contain rows of 784-element uint8 vectors
    (28×28 flattened, pixel values 0–255, white stroke on black background).

    Args:
        cls: Class name.
        n_train: Number of training samples to keep.
        n_test: Number of test samples to keep.

    Returns:
        Tuple of (train_pixels, test_pixels) each shaped (n, 784).

    Raises:
        FileNotFoundError: If the raw .npy file is missing.
        ValueError: If the file holds fewer than n_train + n_test rows.
    """
    path = RAW_DIR / f"{cls}.npy"
    if not path.exists():
        raise FileNotFoundError(
            f"Missing {path}. Run scripts/make_dataset.py first."
        )
    data = np.load(path, mmap_mode="r")  # memory mapped for large files
    n_total = n_train + n_test
    # Fail loudly instead of silently producing undersized splits, which would
    # skew per-class balance downstream without any visible error.
    if len(data) < n_total:
        raise ValueError(
            f"{path} has only {len(data)} rows, need {n_total} "
            f"({n_train} train + {n_test} test)."
        )
    # Shuffle deterministically so splits are reproducible
    rng = np.random.default_rng(seed=42)
    indices = rng.permutation(len(data))[:n_total]
    # Fancy indexing copies the selected memmap rows into an in-memory array.
    data = data[indices]
    return data[:n_train], data[n_train:n_total]
def extract_hog_features(pixel_matrix: np.ndarray) -> np.ndarray:
    """Compute HOG descriptors for a batch of flat pixel vectors.

    Args:
        pixel_matrix: Array of shape (N, 784), dtype uint8.

    Returns:
        Feature matrix of shape (N, D), dtype float32, where D is the HOG
        descriptor length determined by the HOG_* config parameters.
    """
    # Comprehension instead of an append loop (ruff PERF401).  The HOG
    # parameters come from config so training and inference stay consistent.
    descriptors = [
        hog(
            row.reshape(IMG_SIZE, IMG_SIZE),  # 784 -> 28x28 grayscale image
            orientations=HOG_ORIENTATIONS,
            pixels_per_cell=HOG_PIXELS_PER_CELL,
            cells_per_block=HOG_CELLS_PER_BLOCK,
            visualize=False,
            channel_axis=None,  # single-channel input
        )
        for row in pixel_matrix
    ]
    return np.array(descriptors, dtype=np.float32)
def build_splits() -> None:
    """Assemble train/test raw pixel arrays and label vectors from all classes.

    Loads each class via load_class_data, stacks the per-class slices,
    shuffles the training rows deterministically, and writes the four raw
    .npy artefacts under PROCESSED_DIR.
    """
    per_class_train: list[np.ndarray] = []
    per_class_test: list[np.ndarray] = []
    per_class_train_labels: list[np.ndarray] = []
    per_class_test_labels: list[np.ndarray] = []

    print("Loading raw data …")
    for label_idx, cls in enumerate(CLASSES):
        print(f" {cls} ({label_idx + 1}/{len(CLASSES)})")
        tr, te = load_class_data(cls, TRAIN_SAMPLES_PER_CLASS, TEST_SAMPLES_PER_CLASS)
        per_class_train.append(tr)
        per_class_test.append(te)
        # Integer label is simply the class's position in CLASSES.
        per_class_train_labels.append(np.full(len(tr), label_idx, dtype=np.int64))
        per_class_test_labels.append(np.full(len(te), label_idx, dtype=np.int64))

    X_train_raw = np.concatenate(per_class_train)
    X_test_raw = np.concatenate(per_class_test)
    y_train = np.concatenate(per_class_train_labels)
    y_test = np.concatenate(per_class_test_labels)

    # Shuffle the training set (fixed seed) so classes are interleaved.
    shuffle_rng = np.random.default_rng(seed=0)
    order = shuffle_rng.permutation(len(X_train_raw))
    X_train_raw, y_train = X_train_raw[order], y_train[order]

    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    for fname, arr in (
        ("X_train_raw.npy", X_train_raw),
        ("X_test_raw.npy", X_test_raw),
        ("y_train.npy", y_train),
        ("y_test.npy", y_test),
    ):
        np.save(PROCESSED_DIR / fname, arr)
    print(f"\nSaved raw splits → train {X_train_raw.shape}, test {X_test_raw.shape}")
def build_hog_features() -> None:
    """Compute and persist HOG feature matrices from the saved raw splits.

    Expects build_splits() to have run first: reads the raw pixel arrays
    from PROCESSED_DIR and writes the HOG matrices next to them.
    """
    train_pixels = np.load(PROCESSED_DIR / "X_train_raw.npy")
    test_pixels = np.load(PROCESSED_DIR / "X_test_raw.npy")

    print("Extracting HOG features (train) …")
    X_train_hog = extract_hog_features(train_pixels)
    print("Extracting HOG features (test) …")
    X_test_hog = extract_hog_features(test_pixels)

    for fname, matrix in (
        ("X_train_hog.npy", X_train_hog),
        ("X_test_hog.npy", X_test_hog),
    ):
        np.save(PROCESSED_DIR / fname, matrix)
    print(f"Saved HOG features → train {X_train_hog.shape}, test {X_test_hog.shape}")
def build_all() -> None:
    """Run the complete feature building pipeline.

    Order matters: the HOG step reads the artefacts the split step writes.
    """
    for step in (build_splits, build_hog_features):
        step()
    print("\nFeature pipeline complete.")
# Script entry point: `python scripts/build_features.py` runs the full pipeline.
if __name__ == "__main__":
    build_all()