""" model.py – Define, train, and evaluate all three models: 1. Naive baseline (majority class classifier) 2. Classical ML (Random Forest on HOG features) 3. Deep learning (ScribblNet CNN) Also runs the training size sensitivity experiment and saves results/plots. Usage: python scripts/model.py """ import json import sys import time from pathlib import Path from typing import Any import joblib import matplotlib matplotlib.use("Agg") # headless backend import matplotlib.pyplot as plt import numpy as np import torch import torch.nn as nn from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import ( accuracy_score, classification_report, confusion_matrix, ) from sklearn.preprocessing import StandardScaler from torch.utils.data import DataLoader, TensorDataset import seaborn as sns sys.path.insert(0, str(Path(__file__).parent.parent)) from config import ( CLASSES, MODELS_DIR, OUTPUTS_DIR, PROCESSED_DIR, NUM_CLASSES, RF_MAX_DEPTH, RF_N_ESTIMATORS, DEEP_BATCH_SIZE, DEEP_EPOCHS, DEEP_LR, DEEP_WEIGHT_DECAY, IMG_SIZE, EXPERIMENT_FRACTIONS, EXPERIMENT_EPOCHS, ) # Utility def get_device() -> torch.device: """Return the best available torch device (MPS > CUDA > CPU).""" if torch.backends.mps.is_available(): return torch.device("mps") if torch.cuda.is_available(): return torch.device("cuda") return torch.device("cpu") def load_processed_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Load all processed arrays from disk. Returns: X_train_raw, X_test_raw, y_train, y_test, X_train_hog, X_test_hog """ X_train_raw = np.load(PROCESSED_DIR / "X_train_raw.npy") X_test_raw = np.load(PROCESSED_DIR / "X_test_raw.npy") y_train = np.load(PROCESSED_DIR / "y_train.npy") y_test = np.load(PROCESSED_DIR / "y_test.npy") X_train_hog = np.load(PROCESSED_DIR / "X_train_hog.npy") X_test_hog = np.load(PROCESSED_DIR / "X_test_hog.npy") return X_train_raw, X_test_raw, y_train, y_test, X_train_hog, X_test_hog # 1. Naive Baseline class MajorityClassifier: """Naive baseline: always predicts the most frequent class in training.""" def __init__(self) -> None: self.majority_class: int = 0 def fit(self, y: np.ndarray) -> "MajorityClassifier": """Fit by finding the majority class label. Args: y: 1-D array of integer class labels. Returns: self """ counts = np.bincount(y) self.majority_class = int(np.argmax(counts)) return self def predict(self, n_samples: int) -> np.ndarray: """Return the majority class repeated n_samples times. Args: n_samples: Number of predictions to generate. Returns: Array of length n_samples, all equal to majority_class. """ return np.full(n_samples, self.majority_class, dtype=np.int64) def train_naive(y_train: np.ndarray, y_test: np.ndarray) -> dict[str, Any]: """Train and evaluate the majority class baseline. Args: y_train: Training labels. y_test: Test labels. Returns: Dictionary of evaluation metrics. """ print(f"\nNaive Baseline") clf = MajorityClassifier().fit(y_train) preds = clf.predict(len(y_test)) acc = accuracy_score(y_test, preds) print(f" Majority class: {CLASSES[clf.majority_class]}") print(f" Test accuracy: {acc:.4f}") model_data = {"majority_class": clf.majority_class, "accuracy": acc} joblib.dump(model_data, MODELS_DIR / "naive_model.pkl") return {"model": "naive", "accuracy": acc} # 2. Classical ML def train_classical( X_train_hog: np.ndarray, X_test_hog: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, ) -> dict[str, Any]: """Train Random Forest on HOG features and evaluate. Args: X_train_hog: Training HOG feature matrix. X_test_hog: Test HOG feature matrix. y_train: Training labels. y_test: Test labels. Returns: Dictionary of evaluation metrics. """ print(f"\nClassical ML (Random Forest on HOG)") # Standardise features scaler = StandardScaler() X_tr = scaler.fit_transform(X_train_hog) X_te = scaler.transform(X_test_hog) clf = RandomForestClassifier( n_estimators=RF_N_ESTIMATORS, max_depth=RF_MAX_DEPTH, n_jobs=-1, random_state=42, ) t0 = time.time() clf.fit(X_tr, y_train) elapsed = time.time() - t0 preds = clf.predict(X_te) acc = accuracy_score(y_test, preds) report = classification_report(y_test, preds, target_names=CLASSES) print(f" Training time: {elapsed:.1f}s") print(f" Test accuracy: {acc:.4f}") print(f"\n{report}") joblib.dump({"clf": clf, "scaler": scaler}, MODELS_DIR / "classical_model.pkl") _save_confusion_matrix(y_test, preds, "classical_confusion_matrix.png") return {"model": "classical", "accuracy": acc, "training_time_s": elapsed} # 3. Deep Model class ScribblNet(nn.Module): """Lightweight CNN for 28×28 grayscale sketch classification. Architecture: 3 × (Conv2d → BatchNorm → ReLU → MaxPool) Dropout → FC(1152→256) → ReLU → Dropout → FC(256→num_classes) """ def __init__(self, num_classes: int = NUM_CLASSES) -> None: super().__init__() self.features = nn.Sequential( nn.Conv2d(1, 32, kernel_size=3, padding=1), nn.BatchNorm2d(32), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True), nn.MaxPool2d(2), nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True), nn.MaxPool2d(2), ) # 28→14→7→3 ∴ feature map is 128×3×3 = 1152 self.classifier = nn.Sequential( nn.Dropout(0.5), nn.Linear(128 * 3 * 3, 256), nn.ReLU(inplace=True), nn.Dropout(0.3), nn.Linear(256, num_classes), ) def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward pass. Args: x: Tensor of shape (B, 1, 28, 28), values in [0, 1]. Returns: Logits tensor of shape (B, num_classes). """ x = self.features(x) x = x.view(x.size(0), -1) return self.classifier(x) def make_dataloaders( X_raw: np.ndarray, y: np.ndarray, X_test_raw: np.ndarray, y_test: np.ndarray, batch_size: int = DEEP_BATCH_SIZE, train_fraction: float = 1.0, ) -> tuple[DataLoader, DataLoader]: """Build PyTorch DataLoaders from raw pixel arrays. Pixel values are normalised to [0, 1]. Training set can be subsampled via train_fraction for the sensitivity experiment. Args: X_raw: Training pixel array (N, 784), uint8. y: Training labels. X_test_raw: Test pixel array. y_test: Test labels. batch_size: Minibatch size. train_fraction: Fraction of training samples to use (0 < f ≤ 1). Returns: (train_loader, test_loader) """ if train_fraction < 1.0: n = max(1, int(len(X_raw) * train_fraction)) idx = np.random.default_rng(seed=7).permutation(len(X_raw))[:n] X_raw = X_raw[idx] y = y[idx] def _to_tensor(X: np.ndarray, labels: np.ndarray) -> TensorDataset: imgs = torch.from_numpy(X.astype(np.float32) / 255.0) imgs = imgs.view(-1, 1, IMG_SIZE, IMG_SIZE) return TensorDataset(imgs, torch.from_numpy(labels)) train_ds = _to_tensor(X_raw, y) test_ds = _to_tensor(X_test_raw, y_test) train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0) test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=0) return train_loader, test_loader def train_one_epoch( model: nn.Module, loader: DataLoader, optimizer: torch.optim.Optimizer, criterion: nn.Module, device: torch.device, ) -> float: """Run one training epoch and return average loss. Args: model: ScribblNet instance. loader: Training DataLoader. optimizer: Optimiser (Adam). criterion: Loss function (CrossEntropyLoss). device: Torch device. Returns: Mean loss over all minibatches. """ model.train() total_loss = 0.0 for imgs, labels in loader: imgs, labels = imgs.to(device), labels.to(device) optimizer.zero_grad() loss = criterion(model(imgs), labels) loss.backward() optimizer.step() total_loss += loss.item() return total_loss / len(loader) def evaluate( model: nn.Module, loader: DataLoader, device: torch.device, ) -> tuple[float, np.ndarray]: """Evaluate model on a DataLoader. Args: model: ScribblNet instance. loader: Evaluation DataLoader. device: Torch device. Returns: (accuracy, predictions_array) """ model.eval() all_preds, all_labels = [], [] with torch.no_grad(): for imgs, labels in loader: imgs = imgs.to(device) preds = model(imgs).argmax(dim=1).cpu().numpy() all_preds.append(preds) all_labels.append(labels.numpy()) preds = np.concatenate(all_preds) labels = np.concatenate(all_labels) return accuracy_score(labels, preds), preds def train_deep( X_train_raw: np.ndarray, X_test_raw: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, epochs: int = DEEP_EPOCHS, train_fraction: float = 1.0, save_model: bool = True, ) -> dict[str, Any]: """Train ScribblNet and evaluate on test set. Args: X_train_raw: Raw training pixel array. X_test_raw: Raw test pixel array. y_train: Training labels. y_test: Test labels. epochs: Number of training epochs. train_fraction: Fraction of training data to use. save_model: Whether to save weights to disk. Returns: Dictionary of evaluation metrics and training history. """ print(f"\nDeep Model (ScribblNet, fraction={train_fraction:.0%})") device = get_device() print(f" Device: {device}") train_loader, test_loader = make_dataloaders( X_train_raw, y_train, X_test_raw, y_test, train_fraction=train_fraction ) model = ScribblNet(num_classes=NUM_CLASSES).to(device) optimizer = torch.optim.Adam( model.parameters(), lr=DEEP_LR, weight_decay=DEEP_WEIGHT_DECAY ) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) criterion = nn.CrossEntropyLoss() history = {"loss": [], "val_acc": []} best_acc = 0.0 for epoch in range(1, epochs + 1): loss = train_one_epoch(model, train_loader, optimizer, criterion, device) acc, _ = evaluate(model, test_loader, device) scheduler.step() history["loss"].append(loss) history["val_acc"].append(acc) print(f" epoch {epoch:02d}/{epochs} loss={loss:.4f} val_acc={acc:.4f}") if acc > best_acc: best_acc = acc if save_model: torch.save(model.state_dict(), MODELS_DIR / "deep_model.pth") # Final evaluation with best weights if save_model: model.load_state_dict(torch.load(MODELS_DIR / "deep_model.pth", map_location=device)) final_acc, final_preds = evaluate(model, test_loader, device) print(f"\n Best test accuracy: {best_acc:.4f}") if save_model: report = classification_report(y_test, final_preds, target_names=CLASSES) print(f"\n{report}") _save_confusion_matrix(y_test, final_preds, "deep_confusion_matrix.png") _save_training_curves(history) return {"model": "deep", "accuracy": best_acc, "history": history} # Experiment: Training Size Sensitivity def run_experiment( X_train_raw: np.ndarray, X_test_raw: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, X_train_hog: np.ndarray, X_test_hog: np.ndarray, ) -> None: """Training set size sensitivity analysis. Sweeps over EXPERIMENT_FRACTIONS, training both the deep model and Random Forest at each fraction, then plots accuracy vs number of training samples. Motivation: Understanding how each model scales with data volume helps justify architectural choices and highlights when more data is beneficial. Args: X_train_raw: Raw training pixels. X_test_raw: Raw test pixels. y_train: Training labels. y_test: Test labels. X_train_hog: HOG training features. X_test_hog: HOG test features. """ print(f"\nExperiment: Training Size Sensitivity") deep_accs, rf_accs, n_samples = [], [], [] scaler = StandardScaler() X_test_scaled = scaler.fit_transform(X_test_hog) for frac in EXPERIMENT_FRACTIONS: n = int(len(X_train_raw) * frac) n_samples.append(n) print(f"\n Fraction={frac:.0%} (n={n})") # Deep model result = train_deep( X_train_raw, X_test_raw, y_train, y_test, epochs=EXPERIMENT_EPOCHS, train_fraction=frac, save_model=False, ) deep_accs.append(result["accuracy"]) # Random Forest idx = np.random.default_rng(seed=42).permutation(len(X_train_hog))[:n] X_tr = scaler.fit_transform(X_train_hog[idx]) rf = RandomForestClassifier( n_estimators=100, n_jobs=-1, random_state=42 ) rf.fit(X_tr, y_train[idx]) rf_pred = rf.predict(X_test_scaled) rf_accs.append(accuracy_score(y_test, rf_pred)) print(f" RF acc={rf_accs[-1]:.4f}") # Plot fig, ax = plt.subplots(figsize=(8, 5)) ax.plot(n_samples, deep_accs, marker="o", linestyle="solid", label="ScribblNet (CNN)", linewidth=2, markersize=7) ax.plot(n_samples, rf_accs, marker="s", linestyle="dashed", label="Random Forest (HOG)", linewidth=2, markersize=7) ax.set_xlabel("Training samples", fontsize=12) ax.set_ylabel("Test accuracy", fontsize=12) ax.set_title("Training Set Size Sensitivity", fontsize=14) ax.legend(fontsize=11) ax.grid(True, alpha=0.3) ax.set_ylim(0, 1) plt.tight_layout() out_path = OUTPUTS_DIR / "experiment_sensitivity.png" fig.savefig(out_path, dpi=150) plt.close(fig) print(f"\n Saved experiment plot → {out_path}") results = { "fractions": EXPERIMENT_FRACTIONS, "n_samples": n_samples, "deep_accs": deep_accs, "rf_accs": rf_accs, } with open(OUTPUTS_DIR / "experiment_results.json", "w") as f: json.dump(results, f, indent=2) print(" Saved experiment_results.json") # Plotting Helpers def _save_confusion_matrix( y_true: np.ndarray, y_pred: np.ndarray, filename: str, ) -> None: """Save a normalised confusion matrix heatmap. Args: y_true: Ground truth labels. y_pred: Predicted labels. filename: Output filename (saved under OUTPUTS_DIR). """ cm = confusion_matrix(y_true, y_pred, normalize="true") fig, ax = plt.subplots(figsize=(10, 8)) sns.heatmap( cm, annot=True, fmt=".2f", xticklabels=CLASSES, yticklabels=CLASSES, cmap="Blues", ax=ax, linewidths=0.5, ) ax.set_xlabel("Predicted", fontsize=11) ax.set_ylabel("True", fontsize=11) ax.set_title(filename.replace("_", " ").replace(".png", "").title(), fontsize=13) plt.xticks(rotation=45, ha="right") plt.tight_layout() fig.savefig(OUTPUTS_DIR / filename, dpi=150) plt.close(fig) print(f" Saved {filename}") def _save_training_curves(history: dict[str, list[float]]) -> None: """Save loss and validation accuracy curves for the deep model. Args: history: Dict with keys 'loss' and 'val_acc', each a list of per epoch values. """ epochs = range(1, len(history["loss"]) + 1) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4)) ax1.plot(epochs, history["loss"], color="steelblue", marker="o", linestyle="solid", markersize=5) ax1.set_xlabel("Epoch") ax1.set_ylabel("Training Loss") ax1.set_title("ScribblNet Training Loss") ax1.grid(True, alpha=0.3) ax2.plot(epochs, history["val_acc"], color="seagreen", marker="o", linestyle="solid", markersize=5) ax2.set_xlabel("Epoch") ax2.set_ylabel("Validation Accuracy") ax2.set_title("ScribblNet Validation Accuracy") ax2.grid(True, alpha=0.3) plt.tight_layout() fig.savefig(OUTPUTS_DIR / "deep_training_curves.png", dpi=150) plt.close(fig) print(" Saved deep_training_curves.png") def _save_model_comparison(results: list[dict[str, Any]]) -> None: """Bar chart comparing test accuracy across all three models. Args: results: List of result dicts each containing 'model' and 'accuracy'. """ names = [r["model"].capitalize() for r in results] accs = [r["accuracy"] for r in results] fig, ax = plt.subplots(figsize=(7, 4)) bars = ax.bar(names, accs, color=["#94a3b8", "#60a5fa", "#34d399"], width=0.5) ax.set_ylim(0, 1) ax.set_ylabel("Test Accuracy") ax.set_title("Model Comparison") for bar, acc in zip(bars, accs): ax.text( bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01, f"{acc:.3f}", ha="center", fontsize=12, ) ax.grid(True, axis="y", alpha=0.3) plt.tight_layout() fig.savefig(OUTPUTS_DIR / "model_comparison.png", dpi=150) plt.close(fig) print(" Saved model_comparison.png") # Orchestrator def train_all() -> None: """Train all three models, run the experiment, and save all artefacts.""" X_train_raw, X_test_raw, y_train, y_test, X_train_hog, X_test_hog = ( load_processed_data() ) r_naive = train_naive(y_train, y_test) r_classical = train_classical(X_train_hog, X_test_hog, y_train, y_test) r_deep = train_deep(X_train_raw, X_test_raw, y_train, y_test) _save_model_comparison([r_naive, r_classical, r_deep]) run_experiment( X_train_raw, X_test_raw, y_train, y_test, X_train_hog, X_test_hog ) summary = { "naive_accuracy": r_naive["accuracy"], "classical_accuracy": r_classical["accuracy"], "deep_accuracy": r_deep["accuracy"], } with open(OUTPUTS_DIR / "results_summary.json", "w") as f: json.dump(summary, f, indent=2) print("\nTraining complete. Summary:") for k, v in summary.items(): print(f" {k}: {v:.4f}") if __name__ == "__main__": train_all()