File size: 4,699 Bytes
af61511
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""
build_features.py – Load raw Quick Draw .npy files, split into train/test,
and extract HOG features for the classical ML pipeline.

Saved artefacts (under data/processed/):
    X_train_raw.npy, y_train.npy  -> pixel arrays for deep model
    X_test_raw.npy,  y_test.npy   -> pixel arrays for evaluation
    X_train_hog.npy               -> HOG feature matrix for Random Forest
    X_test_hog.npy

Usage:
    python scripts/build_features.py
"""

import sys
from pathlib import Path

import numpy as np
from skimage.feature import hog

sys.path.insert(0, str(Path(__file__).parent.parent))

from config import (
    CLASSES,
    RAW_DIR,
    PROCESSED_DIR,
    TRAIN_SAMPLES_PER_CLASS,
    TEST_SAMPLES_PER_CLASS,
    IMG_SIZE,
    HOG_ORIENTATIONS,
    HOG_PIXELS_PER_CELL,
    HOG_CELLS_PER_BLOCK,
)


def load_class_data(cls: str, n_train: int, n_test: int) -> tuple[np.ndarray, np.ndarray]:
    """Load and slice pixel data for a single class.

    The Quick Draw .npy files contain rows of 784-element uint8 vectors
    (28×28 flattened, pixel values 0–255, white stroke on black background).

    Args:
        cls:     Class name.
        n_train: Number of training samples to keep.
        n_test:  Number of test samples to keep.

    Returns:
        Tuple of (train_pixels, test_pixels) each shaped (n, 784).

    Raises:
        FileNotFoundError: If the raw .npy file for the class is missing.
        ValueError: If the file holds fewer than ``n_train + n_test`` rows.
            (Previously this truncated silently, yielding undersized and
            unbalanced splits downstream.)
    """
    path = RAW_DIR / f"{cls}.npy"
    if not path.exists():
        raise FileNotFoundError(
            f"Missing {path}. Run scripts/make_dataset.py first."
        )
    data = np.load(path, mmap_mode="r")  # memory mapped for large files

    n_needed = n_train + n_test
    if len(data) < n_needed:
        raise ValueError(
            f"{path} has only {len(data)} samples but {n_needed} are "
            f"required ({n_train} train + {n_test} test)."
        )

    # Shuffle deterministically so splits are reproducible across runs.
    rng = np.random.default_rng(seed=42)
    indices = rng.permutation(len(data))[:n_needed]
    data = data[indices]  # fancy indexing copies the rows out of the memmap

    return data[:n_train], data[n_train:n_needed]


def extract_hog_features(pixel_matrix: np.ndarray) -> np.ndarray:
    """Compute HOG descriptors for a batch of flat pixel vectors.

    Args:
        pixel_matrix: Array of shape (N, 784), dtype uint8.

    Returns:
        Feature matrix of shape (N, D), float32, where D is the HOG
        descriptor length implied by the HOG_* config constants.
    """
    # Comprehension instead of a manual append loop (ruff PERF401).
    # Each flat row is un-flattened to IMG_SIZE × IMG_SIZE before the
    # descriptor is computed; channel_axis=None marks the image grayscale.
    features = [
        hog(
            row.reshape(IMG_SIZE, IMG_SIZE),
            orientations=HOG_ORIENTATIONS,
            pixels_per_cell=HOG_PIXELS_PER_CELL,
            cells_per_block=HOG_CELLS_PER_BLOCK,
            visualize=False,
            channel_axis=None,
        )
        for row in pixel_matrix
    ]
    return np.array(features, dtype=np.float32)


def build_splits() -> None:
    """Assemble train/test raw arrays and labels from all classes."""
    x_tr_parts: list[np.ndarray] = []
    x_te_parts: list[np.ndarray] = []
    y_tr_parts: list[np.ndarray] = []
    y_te_parts: list[np.ndarray] = []

    print("Loading raw data …")
    for idx, name in enumerate(CLASSES):
        print(f"  {name} ({idx + 1}/{len(CLASSES)})")
        train_px, test_px = load_class_data(
            name, TRAIN_SAMPLES_PER_CLASS, TEST_SAMPLES_PER_CLASS
        )
        x_tr_parts.append(train_px)
        x_te_parts.append(test_px)
        # Integer label arrays, one entry per sample of this class.
        y_tr_parts.append(np.full(len(train_px), idx, dtype=np.int64))
        y_te_parts.append(np.full(len(test_px), idx, dtype=np.int64))

    X_train_raw = np.concatenate(x_tr_parts)
    X_test_raw = np.concatenate(x_te_parts)
    y_train = np.concatenate(y_tr_parts)
    y_test = np.concatenate(y_te_parts)

    # Shuffle the training split; fixed seed keeps runs reproducible.
    shuffler = np.random.default_rng(seed=0)
    order = shuffler.permutation(len(X_train_raw))
    X_train_raw, y_train = X_train_raw[order], y_train[order]

    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    np.save(PROCESSED_DIR / "X_train_raw.npy", X_train_raw)
    np.save(PROCESSED_DIR / "X_test_raw.npy", X_test_raw)
    np.save(PROCESSED_DIR / "y_train.npy", y_train)
    np.save(PROCESSED_DIR / "y_test.npy", y_test)
    print(f"\nSaved raw splits  →  train {X_train_raw.shape}, test {X_test_raw.shape}")


def build_hog_features() -> None:
    """Extract HOG features from saved raw arrays."""
    hog_splits: dict[str, np.ndarray] = {}
    # Process both splits through the same load → extract path.
    for split in ("train", "test"):
        raw = np.load(PROCESSED_DIR / f"X_{split}_raw.npy")
        print(f"Extracting HOG features ({split}) …")
        hog_splits[split] = extract_hog_features(raw)

    np.save(PROCESSED_DIR / "X_train_hog.npy", hog_splits["train"])
    np.save(PROCESSED_DIR / "X_test_hog.npy", hog_splits["test"])
    print(
        f"Saved HOG features  →  train {hog_splits['train'].shape}, "
        f"test {hog_splits['test'].shape}"
    )


def build_all() -> None:
    """Run the complete feature building pipeline."""
    # Raw splits must exist before HOG extraction can read them.
    for stage in (build_splits, build_hog_features):
        stage()
    print("\nFeature pipeline complete.")


if __name__ == "__main__":
    build_all()