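"""
Train a multinomial logistic-regression baseline on raw 64x64 grayscale
pixels of the Oxford-IIIT Pet dataset (37 classes). Saves the fitted
scikit-learn model (joblib) and an idx -> class-name label map (JSON)
for later inference. Paths are resolved relative to this file, which is
assumed to live two directory levels below the project root.
"""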
import os
import json
from typing import Tuple

import numpy as np
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
def get_datasets(
    data_root: str, image_size: int = 64
) -> Tuple[torch.utils.data.Dataset, torch.utils.data.Dataset, dict]:
    """
    Load the Oxford-IIIT Pet train/test splits with simple transforms.

    Returns:
        train_dataset, test_dataset, class_to_idx
    """
    # Simple transform: resize -> grayscale -> tensor in [0, 1]
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),  # (1, H, W), float32 in [0, 1]
    ])

    train_dataset = datasets.OxfordIIITPet(
        root=data_root,
        split="trainval",
        target_types="category",
        transform=transform,
        download=True,  # downloads to root/oxford-iiit-pet if not present
    )
    test_dataset = datasets.OxfordIIITPet(
        root=data_root,
        split="test",
        target_types="category",
        transform=transform,
        download=True,
    )

    # class_to_idx mapping; many torchvision datasets expose this attribute.
    class_to_idx = train_dataset.class_to_idx

    return train_dataset, test_dataset, class_to_idx
def dataset_to_numpy(dataset: torch.utils.data.Dataset) -> Tuple[np.ndarray, np.ndarray]:
    """
    Convert a torchvision dataset (with tensor images) to numpy arrays
    suitable for scikit-learn.

    X: (N, D) flattened grayscale pixels
    y: (N,) int labels
    """
    X_list = []
    y_list = []
    for img, label in tqdm(dataset, desc="Converting to numpy"):
        # img: torch.Tensor, shape (1, H, W)
        arr = img.numpy().reshape(-1)  # flatten to (D,)
        X_list.append(arr)
        y_list.append(label)

    # Materializing everything in memory is fine at this scale: 64x64
    # grayscale gives D = 4096, so the ~3.7k-image trainval split is
    # roughly 60 MB as float32.
    X = np.stack(X_list, axis=0).astype(np.float32)  # (N, D)
    y = np.array(y_list, dtype=np.int64)  # (N,)
    return X, y
def save_labels(class_to_idx: dict, labels_path: str):
    """
    Save labels as id -> class_name in a JSON file for inference/UI.
    """
    # Invert mapping: idx -> class_name
    idx_to_class = {idx: cls_name for cls_name, idx in class_to_idx.items()}
    os.makedirs(os.path.dirname(labels_path) or ".", exist_ok=True)
    with open(labels_path, "w") as f:
        json.dump(idx_to_class, f, indent=2)
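    # Note: json.dump stringifies the integer keys, so labels.json reads
    # like {"0": "Abyssinian", ...}; look ids up with str(idx) (or cast
    # the keys back to int) when loading this file at inference time.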
| print(f"[INFO] Saved labels to {labels_path}") | |
def train_logistic_regression(X_train: np.ndarray, y_train: np.ndarray) -> LogisticRegression:
    """
    Train multinomial logistic regression on the given features.
    We use the 'saga' solver because it supports the multinomial loss as
    well as L1/L2 penalties, and scales reasonably to high-dimensional data.
    """
    num_classes = len(np.unique(y_train))
    print(f"[INFO] Training Logistic Regression on {X_train.shape[0]} samples, "
          f"{X_train.shape[1]} features, {num_classes} classes")

    # Note: the multi_class argument is deprecated (and removed in recent
    # scikit-learn releases); 'saga' with an L2 penalty fits the
    # multinomial loss by default on multiclass data.
    clf = LogisticRegression(
        penalty="l2",
        C=1.0,
        solver="saga",
        max_iter=1000,
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(X_train, y_train)
    return clf
def evaluate_model(clf: LogisticRegression, X: np.ndarray, y: np.ndarray, split_name: str):
    """
    Print accuracy and a basic classification report for a given split.
    """
    y_pred = clf.predict(X)
    acc = accuracy_score(y, y_pred)
    print(f"\n[{split_name}] Accuracy: {acc * 100:.2f}%")
    print(f"[{split_name}] Classification report (macro avg at bottom):")
    print(classification_report(y, y_pred, digits=3))
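# Illustrative helper (an added sketch of single-image inference; it is not
# called by main()). It assumes the same preprocessing as get_datasets()
# and the artifact paths produced below.
def predict_image(model_path: str, labels_path: str, image_path: str,
                  image_size: int = 64) -> str:
    """
    Load the saved model and label map, preprocess one image exactly like
    the training data, and return the predicted class name.
    """
    from PIL import Image

    clf = joblib.load(model_path)
    with open(labels_path) as f:
        idx_to_class = json.load(f)  # JSON keys are strings, e.g. "0"

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
    ])
    img = transform(Image.open(image_path).convert("RGB"))  # (1, H, W)
    pred = int(clf.predict(img.numpy().reshape(1, -1))[0])
    return idx_to_class[str(pred)]

# Usage (hypothetical paths):
#   print(predict_image("checkpoints/lr_model.joblib",
#                       "configs/labels.json", "my_pet.jpg"))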
def main():
    # -------- configs (tweak paths as needed) --------
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
    data_root = os.path.join(project_root, "data")
    checkpoints_dir = os.path.join(project_root, "checkpoints")
    configs_dir = os.path.join(project_root, "configs")
    os.makedirs(checkpoints_dir, exist_ok=True)
    os.makedirs(configs_dir, exist_ok=True)

    labels_path = os.path.join(configs_dir, "labels.json")
    model_path = os.path.join(checkpoints_dir, "lr_model.joblib")
    image_size = 64  # 64x64 grayscale baseline
    # ------------------------------------------------

    print("[INFO] Loading datasets...")
    train_dataset, test_dataset, class_to_idx = get_datasets(data_root, image_size=image_size)
    print(f"[INFO] Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    print(f"[INFO] Number of classes: {len(class_to_idx)}")

    print("[INFO] Converting train split to numpy...")
    X_train, y_train = dataset_to_numpy(train_dataset)
    print("[INFO] Converting test split to numpy...")
    X_test, y_test = dataset_to_numpy(test_dataset)

    # Save label mapping for later inference
    save_labels(class_to_idx, labels_path)

    # Train logistic regression
    clf = train_logistic_regression(X_train, y_train)

    # Evaluate on both splits
    evaluate_model(clf, X_train, y_train, split_name="Train")
    evaluate_model(clf, X_test, y_test, split_name="Test")

    # Save the fitted model
    joblib.dump(clf, model_path)
    print(f"[INFO] Saved Logistic Regression model to {model_path}")


if __name__ == "__main__":
    main()