# ScribbleBot_Huggingface/scripts/build_features.py (initial deployment, af61511)
"""
build_features.py – Load raw Quick Draw .npy files, split into train/test,
and extract HOG features for the classical ML pipeline.
Saved artefacts (under data/processed/):
X_train_raw.npy, y_train.npy -> pixel arrays for deep model
X_test_raw.npy, y_test.npy -> pixel arrays for evaluation
X_train_hog.npy -> HOG feature matrix for Random Forest
X_test_hog.npy
Usage:
python scripts/build_features.py
"""
import sys
from pathlib import Path
import numpy as np
from skimage.feature import hog
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import (
CLASSES,
RAW_DIR,
PROCESSED_DIR,
TRAIN_SAMPLES_PER_CLASS,
TEST_SAMPLES_PER_CLASS,
IMG_SIZE,
HOG_ORIENTATIONS,
HOG_PIXELS_PER_CELL,
HOG_CELLS_PER_BLOCK,
)
def load_class_data(cls: str, n_train: int, n_test: int) -> tuple[np.ndarray, np.ndarray]:
    """Load and slice pixel data for a single class.

    The Quick Draw .npy files contain rows of 784-element uint8 vectors
    (28×28 flattened, pixel values 0–255, white stroke on black background).

    Args:
        cls: Class name.
        n_train: Number of training samples to keep.
        n_test: Number of test samples to keep.

    Returns:
        Tuple of (train_pixels, test_pixels) each shaped (n, 784).

    Raises:
        FileNotFoundError: If the raw .npy file is missing.
        ValueError: If the file holds fewer than n_train + n_test rows.
    """
    path = RAW_DIR / f"{cls}.npy"
    if not path.exists():
        raise FileNotFoundError(
            f"Missing {path}. Run scripts/make_dataset.py first."
        )
    data = np.load(path, mmap_mode="r")  # memory mapped for large files
    n_total = n_train + n_test
    # Fail loudly instead of silently producing undersized splits, which would
    # skew per-class balance downstream without any visible error.
    if len(data) < n_total:
        raise ValueError(
            f"{path} has only {len(data)} rows, need {n_total} "
            f"({n_train} train + {n_test} test)."
        )
    # Shuffle deterministically so splits are reproducible
    rng = np.random.default_rng(seed=42)
    indices = rng.permutation(len(data))[:n_total]
    # Fancy indexing copies the selected memmap rows into an in-memory array.
    data = data[indices]
    return data[:n_train], data[n_train:n_total]
def extract_hog_features(pixel_matrix: np.ndarray) -> np.ndarray:
    """Compute HOG descriptors for a batch of flat pixel vectors.

    Args:
        pixel_matrix: Array of shape (N, 784), dtype uint8.

    Returns:
        Feature matrix of shape (N, D), dtype float32, where D is the HOG
        descriptor length determined by the HOG_* config parameters.
    """
    # Comprehension instead of an append loop (ruff PERF401).  The HOG
    # parameters come from config so training and inference stay consistent.
    descriptors = [
        hog(
            row.reshape(IMG_SIZE, IMG_SIZE),  # 784 -> 28x28 grayscale image
            orientations=HOG_ORIENTATIONS,
            pixels_per_cell=HOG_PIXELS_PER_CELL,
            cells_per_block=HOG_CELLS_PER_BLOCK,
            visualize=False,
            channel_axis=None,  # single-channel input
        )
        for row in pixel_matrix
    ]
    return np.array(descriptors, dtype=np.float32)
def build_splits() -> None:
    """Assemble train/test raw pixel arrays and label vectors from all classes.

    Loads each class via load_class_data, stacks the per-class slices,
    shuffles the training rows deterministically, and writes the four raw
    .npy artefacts under PROCESSED_DIR.
    """
    per_class_train: list[np.ndarray] = []
    per_class_test: list[np.ndarray] = []
    per_class_train_labels: list[np.ndarray] = []
    per_class_test_labels: list[np.ndarray] = []

    print("Loading raw data …")
    for label_idx, cls in enumerate(CLASSES):
        print(f" {cls} ({label_idx + 1}/{len(CLASSES)})")
        tr, te = load_class_data(cls, TRAIN_SAMPLES_PER_CLASS, TEST_SAMPLES_PER_CLASS)
        per_class_train.append(tr)
        per_class_test.append(te)
        # Integer label is simply the class's position in CLASSES.
        per_class_train_labels.append(np.full(len(tr), label_idx, dtype=np.int64))
        per_class_test_labels.append(np.full(len(te), label_idx, dtype=np.int64))

    X_train_raw = np.concatenate(per_class_train)
    X_test_raw = np.concatenate(per_class_test)
    y_train = np.concatenate(per_class_train_labels)
    y_test = np.concatenate(per_class_test_labels)

    # Shuffle the training set (fixed seed) so classes are interleaved.
    shuffle_rng = np.random.default_rng(seed=0)
    order = shuffle_rng.permutation(len(X_train_raw))
    X_train_raw, y_train = X_train_raw[order], y_train[order]

    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    for fname, arr in (
        ("X_train_raw.npy", X_train_raw),
        ("X_test_raw.npy", X_test_raw),
        ("y_train.npy", y_train),
        ("y_test.npy", y_test),
    ):
        np.save(PROCESSED_DIR / fname, arr)
    print(f"\nSaved raw splits → train {X_train_raw.shape}, test {X_test_raw.shape}")
def build_hog_features() -> None:
    """Compute and persist HOG feature matrices from the saved raw splits.

    Expects build_splits() to have run first: reads the raw pixel arrays
    from PROCESSED_DIR and writes the HOG matrices next to them.
    """
    train_pixels = np.load(PROCESSED_DIR / "X_train_raw.npy")
    test_pixels = np.load(PROCESSED_DIR / "X_test_raw.npy")

    print("Extracting HOG features (train) …")
    X_train_hog = extract_hog_features(train_pixels)
    print("Extracting HOG features (test) …")
    X_test_hog = extract_hog_features(test_pixels)

    for fname, matrix in (
        ("X_train_hog.npy", X_train_hog),
        ("X_test_hog.npy", X_test_hog),
    ):
        np.save(PROCESSED_DIR / fname, matrix)
    print(f"Saved HOG features → train {X_train_hog.shape}, test {X_test_hog.shape}")
def build_all() -> None:
    """Run the complete feature building pipeline.

    Order matters: the HOG step reads the artefacts the split step writes.
    """
    for step in (build_splits, build_hog_features):
        step()
    print("\nFeature pipeline complete.")
# Script entry point: `python scripts/build_features.py` runs the full pipeline.
if __name__ == "__main__":
    build_all()