# Spaces: BART-ender / ml_trainer_env | Sleeping | File size: 3,523 Bytes | 8f24287
"""
Neural network architectures for the ML Training Optimizer environment.

Provides three model architectures of increasing complexity:
- SimpleMLP: For MNIST (Easy task)
- SmallCNN: For FashionMNIST (Medium task)
- DeeperCNN: For CIFAR-10 (Hard task)

All models accept a configurable dropout rate set by the agent.
"""

import torch.nn as nn


class SimpleMLP(nn.Module):
    """
    Fully connected classifier for MNIST digits (the Easy task).

    Three linear layers (two hidden) laid out as 784 → 128 → 64 → 10, with
    ReLU and dropout after each hidden layer and no activation on the output.
    109,386 trainable parameters. The original notes cite ~0.5-1s per epoch
    on CPU with 4k samples.

    Args:
        dropout: Probability handed to every nn.Dropout layer in the stack.
    """

    def __init__(self, dropout: float = 0.0):
        super().__init__()
        self.flatten = nn.Flatten()

        # Input width followed by the hidden widths; consecutive pairs become
        # one Linear → ReLU → Dropout stage each, created in order.
        widths = (784, 128, 64)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(dropout))
        # Output head: one score per class, no dropout or activation after it.
        stack.append(nn.Linear(widths[-1], 10))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten each sample to a vector and return the 10 raw class scores."""
        flat = self.flatten(x)
        scores = self.layers(flat)
        return scores


class SmallCNN(nn.Module):
    """
    Compact convolutional classifier for FashionMNIST (the Medium task).

    Two Conv(3×3, padding=1) → ReLU → MaxPool(2) stages taking channels
    1 → 32 → 64, followed by a 128-unit fully connected layer with dropout
    and a 10-way output layer. The first fully connected layer takes
    64 * 7 * 7 features, which matches 28×28 single-channel inputs after the
    two 2× poolings. 421,642 trainable parameters. The original notes cite
    ~1-2s per epoch on CPU with 6.5k samples.

    Args:
        dropout: Probability for the single nn.Dropout layer in the classifier.
    """

    def __init__(self, dropout: float = 0.0):
        super().__init__()

        # Channel progression; each consecutive pair yields one conv stage.
        channels = (1, 32, 64)
        conv_stack = []
        for c_in, c_out in zip(channels, channels[1:]):
            conv_stack.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            conv_stack.append(nn.ReLU())
            conv_stack.append(nn.MaxPool2d(2))
        self.features = nn.Sequential(*conv_stack)

        # 64 feature maps of 7×7 are flattened into the dense head.
        flat_features = 64 * 7 * 7
        head = [
            nn.Flatten(),
            nn.Linear(flat_features, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Run the conv stages, then the dense head; return 10 raw class scores."""
        feature_maps = self.features(x)
        return self.classifier(feature_maps)


class DeeperCNN(nn.Module):
    """
    Three-stage convolutional classifier for CIFAR-10 (the Hard task).

    Each stage is Conv(3×3, padding=1) → BatchNorm → ReLU → MaxPool(2), with
    channels going 3 → 32 → 64 → 128 and the spatial size halving per stage
    (32×32 → 16×16 → 8×8 → 4×4). The resulting 128 * 4 * 4 features feed a
    256-unit fully connected layer with dropout and a 10-way output layer.
    620,810 trainable parameters. The original notes cite ~2-3s per epoch on
    CPU with 8k samples.

    Args:
        dropout: Probability for the single nn.Dropout layer in the classifier.
    """

    def __init__(self, dropout: float = 0.0):
        super().__init__()

        # Channel progression; each consecutive pair yields one conv stage.
        channels = (3, 32, 64, 128)
        conv_stack = []
        for c_in, c_out in zip(channels, channels[1:]):
            conv_stack.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            conv_stack.append(nn.BatchNorm2d(c_out))
            conv_stack.append(nn.ReLU())
            conv_stack.append(nn.MaxPool2d(2))
        self.features = nn.Sequential(*conv_stack)

        # 128 feature maps of 4×4 are flattened into the dense head.
        flat_features = 128 * 4 * 4
        head = [
            nn.Flatten(),
            nn.Linear(flat_features, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Run the conv stages, then the dense head; return 10 raw class scores."""
        feature_maps = self.features(x)
        return self.classifier(feature_maps)


def create_model(model_type: str, dropout: float = 0.0) -> nn.Module:
    """Build one of the known architectures from its string identifier.

    Args:
        model_type: One of "simple_mlp", "small_cnn" or "deeper_cnn".
        dropout: Dropout probability forwarded to the model constructor.

    Returns:
        A newly constructed instance of the requested model class.

    Raises:
        ValueError: If ``model_type`` is not one of the registered names.
    """
    models = {
        "simple_mlp": SimpleMLP,
        "small_cnn": SmallCNN,
        "deeper_cnn": DeeperCNN,
    }
    model_cls = models.get(model_type)
    # Guard clause: reject unknown names before attempting construction.
    if model_cls is None:
        raise ValueError(f"Unknown model type: {model_type}. Choose from {list(models.keys())}")
    return model_cls(dropout=dropout)