import os
import json
from typing import Tuple

import numpy as np
from tqdm import tqdm

import torch
from torchvision import datasets, transforms

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib


def get_datasets(
    data_root: str, image_size: int = 64
) -> Tuple[torch.utils.data.Dataset, torch.utils.data.Dataset, dict]:
    """
    Load the Oxford-IIIT Pet "trainval" and "test" splits with a simple transform.

    Every image is resized to (image_size, image_size), converted to a single
    grayscale channel and turned into a tensor by ``transforms.ToTensor``.
    The dataset is downloaded under ``data_root`` if it is not present.

    Args:
        data_root: Directory passed to torchvision as the dataset root.
        image_size: Side length (in pixels) of the square resized image.

    Returns:
        train_dataset, test_dataset, class_to_idx
        (class_to_idx is read from the training dataset.)
    """
    # Simple transform: resize -> grayscale -> tensor in [0,1]
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),  # (1, H, W), float32 in [0,1]
    ])

    # Both splits share every argument except the split name.
    shared_kwargs = dict(
        root=data_root,
        target_types="category",
        transform=transform,
        download=True,  # downloads to root/oxford-iiit-pet if not present
    )
    train_dataset = datasets.OxfordIIITPet(split="trainval", **shared_kwargs)
    test_dataset = datasets.OxfordIIITPet(split="test", **shared_kwargs)

    # class_to_idx mapping (class name -> integer label)
    class_to_idx = train_dataset.class_to_idx

    return train_dataset, test_dataset, class_to_idx


def dataset_to_numpy(dataset: torch.utils.data.Dataset) -> Tuple[np.ndarray, np.ndarray]:
    """
    Convert a torchvision dataset (with tensor images) to numpy arrays
    suitable for scikit-learn.

    Args:
        dataset: Iterable of (image_tensor, label) pairs.

    Returns:
        X: (N, D) float32 array, one flattened image per row.
        y: (N,) int64 array of labels.

    Raises:
        ValueError: If the dataset yields no samples.
    """
    X_list = []
    y_list = []

    for img, label in tqdm(dataset, desc="Converting to numpy"):
        # img: torch.Tensor, shape (1, H, W) -> flatten to (D,)
        X_list.append(img.numpy().reshape(-1))
        y_list.append(label)

    if not X_list:
        # np.stack would fail on an empty list with a much less helpful message.
        raise ValueError("Cannot convert an empty dataset to numpy arrays")

    # copy=False avoids duplicating the whole array when it is already float32.
    X = np.stack(X_list, axis=0).astype(np.float32, copy=False)  # (N, D)
    y = np.array(y_list, dtype=np.int64)  # (N,)
    return X, y


def save_labels(class_to_idx: dict, labels_path: str) -> None:
    """
    Save labels as id -> class_name in a JSON file for inference/UI.

    Note that JSON object keys are always strings, so the integer ids are
    written as strings (e.g. "0") and must be converted back when loading.

    Args:
        class_to_idx: Mapping from class name to integer label.
        labels_path: Destination JSON file. Its parent directory is created
            if needed; a bare filename (no directory part) is also accepted.
    """
    # Invert mapping: idx -> class_name
    idx_to_class = {idx: cls_name for cls_name, idx in class_to_idx.items()}

    # os.path.dirname returns "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(labels_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(labels_path, "w", encoding="utf-8") as f:
        json.dump(idx_to_class, f, indent=2)
    print(f"[INFO] Saved labels to {labels_path}")


def train_logistic_regression(X_train: np.ndarray, y_train: np.ndarray) -> LogisticRegression:
    """
    Train multinomial Logistic Regression on given features.

    We use 'saga' because it supports multinomial loss and L1/L2 penalties.

    The ``multi_class`` argument is intentionally not passed: it is deprecated
    in recent scikit-learn releases, and the default already selects the
    multinomial loss for problems with three or more classes when the solver
    is 'saga'. (With only two classes the default fits a plain binary model.)

    Args:
        X_train: (N, D) feature matrix.
        y_train: (N,) integer labels.

    Returns:
        The fitted LogisticRegression estimator.
    """
    num_classes = len(np.unique(y_train))
    print(f"[INFO] Training Logistic Regression on {X_train.shape[0]} samples, "
          f"{X_train.shape[1]} features, {num_classes} classes")

    clf = LogisticRegression(
        penalty="l2",
        C=1.0,
        solver="saga",
        max_iter=1000,
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(X_train, y_train)
    return clf


def evaluate_model(clf: LogisticRegression, X: np.ndarray, y: np.ndarray, split_name: str) -> None:
    """
    Print accuracy and basic classification report for a given split.

    Args:
        clf: Fitted classifier used to predict labels for X.
        X: (N, D) feature matrix.
        y: (N,) ground-truth labels.
        split_name: Tag shown in the printed output (e.g. "Train", "Test").
    """
    y_pred = clf.predict(X)
    acc = accuracy_score(y, y_pred)
    print(f"\n[{split_name}] Accuracy: {acc * 100:.2f}%")
    print(f"[{split_name}] Classification report (macro avg at bottom):")
    print(classification_report(y, y_pred, digits=3))


def main() -> None:
    """
    Run the full baseline pipeline: load data, convert to numpy, save the
    label mapping, train Logistic Regression, evaluate and save the model.
    """
    # -------- configs (tweak paths as needed) --------
    # The project root is taken to be two directories above this file.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
    data_root = os.path.join(project_root, "data")
    checkpoints_dir = os.path.join(project_root, "checkpoints")
    configs_dir = os.path.join(project_root, "configs")

    os.makedirs(checkpoints_dir, exist_ok=True)
    os.makedirs(configs_dir, exist_ok=True)

    labels_path = os.path.join(configs_dir, "labels.json")
    model_path = os.path.join(checkpoints_dir, "lr_model.joblib")

    image_size = 64  # 64x64 grayscale baseline
    # ------------------------------------------------

    print("[INFO] Loading datasets...")
    train_dataset, test_dataset, class_to_idx = get_datasets(data_root, image_size=image_size)
    print(f"[INFO] Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    print(f"[INFO] Number of classes: {len(class_to_idx)}")

    print("[INFO] Converting train split to numpy...")
    X_train, y_train = dataset_to_numpy(train_dataset)
    print("[INFO] Converting test split to numpy...")
    X_test, y_test = dataset_to_numpy(test_dataset)

    # Save label mapping for later inference
    save_labels(class_to_idx, labels_path)

    # Train LR
    clf = train_logistic_regression(X_train, y_train)

    # Evaluate
    evaluate_model(clf, X_train, y_train, split_name="Train")
    evaluate_model(clf, X_test, y_test, split_name="Test")

    # Save model
    joblib.dump(clf, model_path)
    print(f"[INFO] Saved Logistic Regression model to {model_path}")


if __name__ == "__main__":
    main()