Spaces:
Sleeping
Sleeping
| """ | |
build_features.py – Load raw Quick Draw .npy files, split them into train/test
sets, and extract HOG features for the classical ML pipeline.

Saved artefacts (under data/processed/):
    X_train_raw.npy, y_train.npy  -> pixel arrays and labels for the deep model
    X_test_raw.npy,  y_test.npy   -> pixel arrays and labels for evaluation
    X_train_hog.npy               -> HOG feature matrix for the Random Forest
    X_test_hog.npy                -> HOG feature matrix of the test split

Usage:
    python scripts/build_features.py
| """ | |
| import sys | |
| from pathlib import Path | |
| import numpy as np | |
| from skimage.feature import hog | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from config import ( | |
| CLASSES, | |
| RAW_DIR, | |
| PROCESSED_DIR, | |
| TRAIN_SAMPLES_PER_CLASS, | |
| TEST_SAMPLES_PER_CLASS, | |
| IMG_SIZE, | |
| HOG_ORIENTATIONS, | |
| HOG_PIXELS_PER_CELL, | |
| HOG_CELLS_PER_BLOCK, | |
| ) | |
def load_class_data(cls: str, n_train: int, n_test: int) -> tuple[np.ndarray, np.ndarray]:
    """Load a reproducible random train/test sample of pixel data for one class.

    The Quick Draw .npy files contain rows of 784-element uint8 vectors
    (28×28 flattened, pixel values 0–255, white stroke on black background).

    The rows are drawn with a permutation seeded by a fixed value (42), so
    the same file and the same counts always yield the same split. The train
    and test samples never overlap because they are consecutive slices of one
    permutation.

    Args:
        cls: Class name; data is read from ``RAW_DIR / f"{cls}.npy"``.
        n_train: Number of training samples to keep (must be >= 0).
        n_test: Number of test samples to keep (must be >= 0).

    Returns:
        Tuple of (train_pixels, test_pixels), each shaped (n, 784). If the
        file holds fewer than ``n_train + n_test`` rows the splits are
        shorter than requested (the test split is truncated first) and a
        ``UserWarning`` is emitted.

    Raises:
        ValueError: If ``n_train`` or ``n_test`` is negative.
        FileNotFoundError: If the raw .npy file for ``cls`` does not exist.
    """
    import warnings

    # Negative counts would silently turn into "from the end" slices below.
    if n_train < 0 or n_test < 0:
        raise ValueError(
            f"n_train and n_test must be non-negative, got {n_train} and {n_test}."
        )

    path = RAW_DIR / f"{cls}.npy"
    if not path.exists():
        raise FileNotFoundError(
            f"Missing {path}. Run scripts/make_dataset.py first."
        )

    data = np.load(path, mmap_mode="r")  # memory mapped for large files

    n_wanted = n_train + n_test
    n_available = len(data)
    if n_available < n_wanted:
        # Keep the best-effort behaviour, but make the shortfall visible so a
        # class imbalance does not go unnoticed downstream.
        warnings.warn(
            f"{path} contains only {n_available} samples but {n_wanted} were "
            f"requested ({n_train} train + {n_test} test); returning fewer samples.",
            stacklevel=2,
        )

    # Shuffle deterministically so splits are reproducible
    rng = np.random.default_rng(seed=42)
    indices = rng.permutation(n_available)[:n_wanted]

    # Fancy indexing copies the selected rows out of the memory map.
    sample = data[indices]
    return sample[:n_train], sample[n_train:n_wanted]
def extract_hog_features(pixel_matrix: np.ndarray) -> np.ndarray:
    """Turn a batch of flattened images into a matrix of HOG descriptors.

    Each row is reshaped to an ``IMG_SIZE`` × ``IMG_SIZE`` single-channel
    image and passed to ``skimage.feature.hog`` with the HOG settings taken
    from the project config.

    Args:
        pixel_matrix: Array of shape (N, 784), dtype uint8.

    Returns:
        Feature matrix of shape (N, D), dtype float32, where D is the HOG
        descriptor length.
    """
    image_shape = (IMG_SIZE, IMG_SIZE)
    descriptors = [
        hog(
            flat_pixels.reshape(image_shape),
            orientations=HOG_ORIENTATIONS,
            pixels_per_cell=HOG_PIXELS_PER_CELL,
            cells_per_block=HOG_CELLS_PER_BLOCK,
            visualize=False,
            channel_axis=None,  # grayscale input: no channel dimension
        )
        for flat_pixels in pixel_matrix
    ]
    return np.array(descriptors, dtype=np.float32)
def build_splits() -> None:
    """Build the raw train/test pixel arrays and labels and save them to disk.

    For every class in ``CLASSES`` the per-class sample is loaded with
    ``load_class_data``; the label of a class is its index in ``CLASSES``.
    The training set is shuffled with a fixed seed, the test set is kept in
    class order. Four arrays are written to ``PROCESSED_DIR``:
    ``X_train_raw.npy``, ``X_test_raw.npy``, ``y_train.npy``, ``y_test.npy``.
    """
    train_parts, test_parts = [], []
    train_label_parts, test_label_parts = [], []

    print("Loading raw data …")
    for label_idx, cls in enumerate(CLASSES):
        print(f" {cls} ({label_idx + 1}/{len(CLASSES)})")
        train_px, test_px = load_class_data(
            cls, TRAIN_SAMPLES_PER_CLASS, TEST_SAMPLES_PER_CLASS
        )
        train_parts.append(train_px)
        test_parts.append(test_px)
        # One integer label per sample, sized from what was actually loaded.
        train_label_parts.append(np.full(len(train_px), label_idx, dtype=np.int64))
        test_label_parts.append(np.full(len(test_px), label_idx, dtype=np.int64))

    X_train_raw = np.concatenate(train_parts)
    X_test_raw = np.concatenate(test_parts)
    y_train = np.concatenate(train_label_parts)
    y_test = np.concatenate(test_label_parts)

    # Shuffle training set (pixels and labels with the same permutation)
    order = np.random.default_rng(seed=0).permutation(len(X_train_raw))
    X_train_raw, y_train = X_train_raw[order], y_train[order]

    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    artefacts = {
        "X_train_raw.npy": X_train_raw,
        "X_test_raw.npy": X_test_raw,
        "y_train.npy": y_train,
        "y_test.npy": y_test,
    }
    for filename, array in artefacts.items():
        np.save(PROCESSED_DIR / filename, array)

    print(f"\nSaved raw splits → train {X_train_raw.shape}, test {X_test_raw.shape}")
def build_hog_features() -> None:
    """Compute HOG features for the saved raw splits and write them to disk.

    Reads ``X_train_raw.npy`` and ``X_test_raw.npy`` from ``PROCESSED_DIR``
    (as written by ``build_splits``), runs ``extract_hog_features`` on each,
    and saves the results as ``X_train_hog.npy`` and ``X_test_hog.npy`` in
    the same directory.
    """
    train_pixels = np.load(PROCESSED_DIR / "X_train_raw.npy")
    test_pixels = np.load(PROCESSED_DIR / "X_test_raw.npy")

    print("Extracting HOG features (train) …")
    X_train_hog = extract_hog_features(train_pixels)

    print("Extracting HOG features (test) …")
    X_test_hog = extract_hog_features(test_pixels)

    # Save only after both extractions have succeeded.
    outputs = (
        ("X_train_hog.npy", X_train_hog),
        ("X_test_hog.npy", X_test_hog),
    )
    for filename, matrix in outputs:
        np.save(PROCESSED_DIR / filename, matrix)

    print(f"Saved HOG features → train {X_train_hog.shape}, test {X_test_hog.shape}")
def build_all() -> None:
    """Run every stage of the feature pipeline in order.

    The raw splits are built first because the HOG stage reads the files
    that stage writes.
    """
    for stage in (build_splits, build_hog_features):
        stage()
    print("\nFeature pipeline complete.")
# Script entry point: run the full pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    build_all()