IvanBanny commited on
Commit
fa13b6c
·
1 Parent(s): 972a6a4

feat(model, train): improved architecture, overfit prevention, re-trained the model

Browse files
Files changed (8) hide show
  1. README.md +36 -0
  2. model.py +23 -18
  3. performance.json +649 -115
  4. performance_plot.png +0 -0
  5. plots.py +3 -3
  6. predictions.csv +0 -0
  7. train.py +0 -394
  8. train_dist.py +40 -7
README.md CHANGED
@@ -1,2 +1,38 @@
1
  # Places-ResNet
 
2
  My experiment training a ResNet-inspired model for image classification using PyTorch.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Places-ResNet
2
+
3
  My experiment training a ResNet-inspired model for image classification using PyTorch.
4
+
5
+ **Key terms: distributed training, residual layers, convolutional layers, batch normalization, dropout, pooling, SGD, label smoothing, learning rate scheduling, early stopping, data augmentation.**
6
+
7
+ Training time was approximately 10 hours (108 epochs) using **distributed training** across university server GPUs.
8
+
9
+ ## Dataset:
10
+
11
+ **MIT MiniPlaces Dataset:** Contains 100,000 training images, 10,000 validation images, and 10,000 testing images across 100 scene categories. Each image is 128x128 pixels.
12
+
13
+ ## Model:
14
+
15
+ I implemented a 13-layer ResNet-inspired model for image classification. The architecture consists of:
16
+ - Initial **convolutional layer** with 64 filters, followed by **batch normalization, max pooling, and dropout**
17
+ - 3 stages of **residual blocks**, each with 4 convolutional layers
18
+ - Each **residual block** has two 3x3 **convolutional layers** with **batch normalization and dropout**
19
+ - The number of filters increases from 64 in the first stage, to 128, 256, and 512 in the later stages
20
+ - **Global average pooling and dropout** before a final **fully connected layer**
21
+
22
+ The total number of trainable model parameters is 29,678,180.
23
+
24
+ ## Training:
25
+
26
+ The training setup used a **distributed training** approach, with **early stopping** to prevent overfitting. **Data augmentation** techniques were applied to the training and validation sets. An **SGD optimizer** with **label smoothing** was used, along with a **ReduceLROnPlateau learning rate scheduler**.
27
+
28
+ ## Performance:
29
+
30
+ Best model checkpoint results (epoch 108):
31
+
32
+ - Training Loss: 2.3231, Training Accuracy: 53.02%
33
+ - Validation Loss: 2.3426, Validation Accuracy: 54.09%
34
+ - Top-5 Validation Accuracy: 81.34%
35
+
36
+ The best checkpoint achieves a **Top-1 accuracy of 54.09% and a Top-5 accuracy of 81.34%** on the validation set.
37
+
38
+ ![Training Performance Plot](performance_plot.png "Training Performance Plot")
model.py CHANGED
@@ -3,14 +3,15 @@ import torch.nn as nn
3
 
4
 
5
  class ResidualBlock(nn.Module):
6
- def __init__(self, in_channels, out_channels):
7
  super(ResidualBlock, self).__init__()
8
  self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
9
  self.bn1 = nn.BatchNorm2d(out_channels)
 
10
  self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
11
  self.bn2 = nn.BatchNorm2d(out_channels)
 
12
 
13
- # Skip connection (identity mapping)
14
  self.skip_connection = nn.Sequential()
15
  if in_channels != out_channels:
16
  self.skip_connection = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
@@ -18,53 +19,57 @@ class ResidualBlock(nn.Module):
18
  def forward(self, x):
19
  residual = self.skip_connection(x)
20
  out = nn.functional.relu(self.bn1(self.conv1(x)))
 
21
  out = self.bn2(self.conv2(out))
22
- out += residual # Adding the skip connection
 
23
  out = nn.functional.relu(out)
24
  return out
25
 
26
 
27
  class MyModel(nn.Module):
28
- def __init__(self, num_classes=100):
29
  super(MyModel, self).__init__()
 
30
 
31
- # Initial convolutional layer
32
  self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
33
  self.bn1 = nn.BatchNorm2d(64)
34
  self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
 
35
 
36
- # Residual blocks
37
- self.block1 = self._resnet_layers(64, 128, num_blocks=3) # 3 residual blocks
38
- self.block2 = self._resnet_layers(128, 256, num_blocks=3) # 3 residual blocks
39
- self.block3 = self._resnet_layers(256, 512, num_blocks=3) # 3 residual blocks
40
 
41
- # Global average pooling
42
  self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
 
 
 
 
43
 
44
- # Combine features
45
  self.features = nn.Sequential(
46
  self.conv1,
47
  self.bn1,
48
  nn.ReLU(),
49
  self.pool1,
 
50
  self.block1,
51
  self.block2,
52
  self.block3,
53
- self.global_avg_pool
 
54
  )
55
 
56
- # Fully connected layer
57
- self.fc = nn.Linear(512, num_classes)
58
-
59
  @staticmethod
60
  def _resnet_layers(in_channels, out_channels, num_blocks):
61
  return nn.Sequential(
62
- ResidualBlock(in_channels, out_channels),
63
- *[ResidualBlock(out_channels, out_channels) for _ in range(num_blocks)]
64
  )
65
 
66
  def forward(self, x):
67
  x = self.features(x)
68
- x = torch.flatten(x, 1) # Flatten the output for the fully connected layer
69
  x = self.fc(x)
70
  return x
 
3
 
4
 
5
class ResidualBlock(nn.Module):
    """Two-layer 3x3 residual block with batch norm and spatial dropout.

    The skip path is the identity when channel counts match, otherwise a
    1x1 convolution that projects the input to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, dropout_rate=0.2):
        super(ResidualBlock, self).__init__()
        # Main path: conv -> BN -> (ReLU) -> dropout, twice.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.dropout1 = nn.Dropout2d(p=dropout_rate)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.dropout2 = nn.Dropout2d(p=dropout_rate)

        # Identity shortcut by default; 1x1 projection when channels differ.
        self.skip_connection = nn.Sequential()
        if in_channels != out_channels:
            self.skip_connection = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        """Return ReLU(main_path(x) + shortcut(x))."""
        shortcut = self.skip_connection(x)
        y = self.conv1(x)
        y = nn.functional.relu(self.bn1(y))
        y = self.dropout1(y)
        y = self.dropout2(self.bn2(self.conv2(y)))
        return nn.functional.relu(y + shortcut)
28
 
29
 
30
class MyModel(nn.Module):
    """ResNet-inspired classifier for 128x128 scene images (MiniPlaces).

    Architecture: a 7x7 stem conv (stride 2) + BN + ReLU + max pool +
    dropout, three residual stages (64->128, 128->256, 256->512 channels),
    global average pooling, dropout, and a final linear classifier.

    Args:
        num_classes: Number of output classes (default 100).
        dropout_rate: Dropout probability used throughout the network.
            Fix: this value is now forwarded to the residual blocks, which
            previously hard-coded 0.2 regardless of the constructor argument.
    """

    def __init__(self, num_classes=100, dropout_rate=0.2):
        super(MyModel, self).__init__()
        self.dropout_rate = dropout_rate

        # Stem: 7x7 conv halves the spatial size; max pool halves it again.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.dropout1 = nn.Dropout2d(p=self.dropout_rate)

        # Three residual stages. NOTE(review): _resnet_layers builds
        # num_blocks + 1 blocks per stage (one channel-changing block plus
        # num_blocks same-channel blocks) — kept as-is so trained
        # checkpoints remain loadable, but the name is misleading.
        self.block1 = self._resnet_layers(64, 128, num_blocks=4, dropout_rate=self.dropout_rate)
        self.block2 = self._resnet_layers(128, 256, num_blocks=4, dropout_rate=self.dropout_rate)
        self.block3 = self._resnet_layers(256, 512, num_blocks=4, dropout_rate=self.dropout_rate)

        self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout2 = nn.Dropout(p=self.dropout_rate)

        # Final classifier over the 512 pooled features.
        self.fc = nn.Linear(512, num_classes)

        # Convenience container running the whole convolutional trunk.
        # The submodules are shared with the attributes above, so parameters
        # are not duplicated.
        self.features = nn.Sequential(
            self.conv1,
            self.bn1,
            nn.ReLU(),
            self.pool1,
            self.dropout1,
            self.block1,
            self.block2,
            self.block3,
            self.global_avg_pool,
            self.dropout2,
        )

    @staticmethod
    def _resnet_layers(in_channels, out_channels, num_blocks, dropout_rate=0.2):
        """Build one stage: a channel-expanding block followed by
        ``num_blocks`` same-channel blocks (``num_blocks + 1`` blocks total).

        ``dropout_rate`` was previously hard-coded to 0.2; it now defaults
        to 0.2 (preserving the original behavior) but can be overridden so
        ``MyModel(dropout_rate=...)`` affects the residual blocks too.
        """
        return nn.Sequential(
            ResidualBlock(in_channels, out_channels, dropout_rate=dropout_rate),
            *[ResidualBlock(out_channels, out_channels, dropout_rate=dropout_rate)
              for _ in range(num_blocks)],
        )

    def forward(self, x):
        """Return raw class logits for a batch of (N, 3, H, W) images."""
        x = self.features(x)
        x = torch.flatten(x, 1)  # (N, 512, 1, 1) -> (N, 512)
        x = self.fc(x)
        return x
performance.json CHANGED
@@ -1,176 +1,710 @@
1
  [
2
  {
3
- "avg_train_loss": 4.105753168425597,
4
- "train_accuracy": 0.08726,
5
- "avg_val_loss": 3.8632843403876582,
6
- "val_accuracy": 0.1306000053882599
7
  },
8
  {
9
- "avg_train_loss": 3.7184383619167005,
10
- "train_accuracy": 0.1609,
11
- "avg_val_loss": 3.5157296868819223,
12
- "val_accuracy": 0.20409999787807465
13
  },
14
  {
15
- "avg_train_loss": 3.5134875548770057,
16
- "train_accuracy": 0.20752,
17
- "avg_val_loss": 3.3557024605666537,
18
- "val_accuracy": 0.24459999799728394
19
  },
20
  {
21
- "avg_train_loss": 3.3635203539562957,
22
- "train_accuracy": 0.244,
23
- "avg_val_loss": 3.320155662826345,
24
- "val_accuracy": 0.25769999623298645
25
  },
26
  {
27
- "avg_train_loss": 3.2561175189054836,
28
- "train_accuracy": 0.2721,
29
- "avg_val_loss": 3.2409366655953322,
30
- "val_accuracy": 0.2786000072956085
31
  },
32
  {
33
- "avg_train_loss": 3.165564750466505,
34
- "train_accuracy": 0.2952,
35
- "avg_val_loss": 3.3207412912875793,
36
- "val_accuracy": 0.28110000491142273
37
  },
38
  {
39
- "avg_train_loss": 3.089012709724934,
40
- "train_accuracy": 0.31326,
41
- "avg_val_loss": 3.1548544968230816,
42
- "val_accuracy": 0.3131999969482422
43
  },
44
  {
45
- "avg_train_loss": 3.0239714097488872,
46
- "train_accuracy": 0.33192,
47
- "avg_val_loss": 3.0669574978985366,
48
- "val_accuracy": 0.3246999979019165
49
  },
50
  {
51
- "avg_train_loss": 2.9728026246780628,
52
- "train_accuracy": 0.34622,
53
- "avg_val_loss": 3.1410958978194223,
54
- "val_accuracy": 0.3125999867916107
55
  },
56
  {
57
- "avg_train_loss": 2.926501644236962,
58
- "train_accuracy": 0.35726,
59
- "avg_val_loss": 3.0194991872280457,
60
- "val_accuracy": 0.34369999170303345
61
  },
62
  {
63
- "avg_train_loss": 2.881767414719857,
64
- "train_accuracy": 0.37002,
65
- "avg_val_loss": 3.1654707510260085,
66
- "val_accuracy": 0.3264000117778778
67
  },
68
  {
69
- "avg_train_loss": 2.8386977173178396,
70
- "train_accuracy": 0.37992,
71
- "avg_val_loss": 2.908680589893196,
72
- "val_accuracy": 0.3734000027179718
73
  },
74
  {
75
- "avg_train_loss": 2.7958365852570597,
76
- "train_accuracy": 0.39322,
77
- "avg_val_loss": 2.818336969689478,
78
- "val_accuracy": 0.38659998774528503
79
  },
80
  {
81
- "avg_train_loss": 2.7660993075431763,
82
- "train_accuracy": 0.40192,
83
- "avg_val_loss": 2.941794866248022,
84
- "val_accuracy": 0.3686000108718872
85
  },
86
  {
87
- "avg_train_loss": 2.7263018761754343,
88
- "train_accuracy": 0.41194,
89
- "avg_val_loss": 2.8387841333316852,
90
- "val_accuracy": 0.39089998602867126
91
  },
92
  {
93
- "avg_train_loss": 2.6966828406619294,
94
- "train_accuracy": 0.42182,
95
- "avg_val_loss": 2.975855187524723,
96
- "val_accuracy": 0.36570000648498535
97
  },
98
  {
99
- "avg_train_loss": 2.6595005979928215,
100
- "train_accuracy": 0.43252,
101
- "avg_val_loss": 2.866811245302611,
102
- "val_accuracy": 0.3898000121116638
103
  },
104
  {
105
- "avg_train_loss": 2.626687426396343,
106
- "train_accuracy": 0.4389,
107
- "avg_val_loss": 2.797353237490111,
108
- "val_accuracy": 0.4056999981403351
109
  },
110
  {
111
- "avg_train_loss": 2.607301542521133,
112
- "train_accuracy": 0.44456,
113
- "avg_val_loss": 2.8504348948032043,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  "val_accuracy": 0.4027000069618225
115
  },
116
  {
117
- "avg_train_loss": 2.571250948454718,
118
- "train_accuracy": 0.45616,
119
- "avg_val_loss": 2.87233859677858,
120
- "val_accuracy": 0.39640000462532043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  },
122
  {
123
- "avg_train_loss": 2.5492486542143173,
124
- "train_accuracy": 0.46084,
125
- "avg_val_loss": 2.743163096753857,
126
- "val_accuracy": 0.42489999532699585
127
  },
128
  {
129
- "avg_train_loss": 2.525591837780555,
130
- "train_accuracy": 0.46876,
131
- "avg_val_loss": 2.9510807085640822,
132
- "val_accuracy": 0.3882000148296356
133
  },
134
  {
135
- "avg_train_loss": 2.5095781770813494,
136
- "train_accuracy": 0.47198,
137
- "avg_val_loss": 2.7676040069966374,
138
- "val_accuracy": 0.4284000098705292
139
  },
140
  {
141
- "avg_train_loss": 2.4809405361599937,
142
- "train_accuracy": 0.48236,
143
- "avg_val_loss": 2.7205014772053007,
144
- "val_accuracy": 0.4325999915599823
145
  },
146
  {
147
- "avg_train_loss": 2.4620410210031376,
148
- "train_accuracy": 0.4867,
149
- "avg_val_loss": 2.674741914000692,
150
- "val_accuracy": 0.439300000667572
151
  },
152
  {
153
- "avg_train_loss": 2.431113924059417,
154
- "train_accuracy": 0.494,
155
- "avg_val_loss": 2.6500483645668513,
156
- "val_accuracy": 0.4472000002861023
157
  },
158
  {
159
- "avg_train_loss": 2.4075386673593155,
160
- "train_accuracy": 0.50418,
161
- "avg_val_loss": 2.7652997366989713,
162
- "val_accuracy": 0.4194999933242798
163
  },
164
  {
165
- "avg_train_loss": 2.390994114796524,
166
- "train_accuracy": 0.50546,
167
- "avg_val_loss": 2.750720060324367,
168
- "val_accuracy": 0.42570000886917114
169
  },
170
  {
171
- "avg_train_loss": 2.3609870321610393,
172
- "train_accuracy": 0.5147,
173
- "avg_val_loss": 2.6785139252867878,
174
- "val_accuracy": 0.4438999891281128
175
  }
176
  ]
 
1
  [
2
  {
3
+ "avg_train_loss": 4.452685312236971,
4
+ "train_accuracy": 0.03584,
5
+ "avg_val_loss": 4.1456576963014244,
6
+ "val_accuracy": 0.07199999690055847
7
  },
8
  {
9
+ "avg_train_loss": 4.22897495028308,
10
+ "train_accuracy": 0.06454,
11
+ "avg_val_loss": 3.984914272646361,
12
+ "val_accuracy": 0.10199999809265137
13
  },
14
  {
15
+ "avg_train_loss": 4.097483895318892,
16
+ "train_accuracy": 0.08686,
17
+ "avg_val_loss": 3.8411196938043908,
18
+ "val_accuracy": 0.13650000095367432
19
  },
20
  {
21
+ "avg_train_loss": 3.993970947192453,
22
+ "train_accuracy": 0.1051,
23
+ "avg_val_loss": 3.7907390353045884,
24
+ "val_accuracy": 0.14059999585151672
25
  },
26
  {
27
+ "avg_train_loss": 3.9059129904603105,
28
+ "train_accuracy": 0.1232,
29
+ "avg_val_loss": 3.656763390649723,
30
+ "val_accuracy": 0.164900004863739
31
  },
32
  {
33
+ "avg_train_loss": 3.8222918010428737,
34
+ "train_accuracy": 0.13996,
35
+ "avg_val_loss": 3.5380990716475473,
36
+ "val_accuracy": 0.19670000672340393
37
  },
38
  {
39
+ "avg_train_loss": 3.756975746520645,
40
+ "train_accuracy": 0.1505,
41
+ "avg_val_loss": 3.4915640142899527,
42
+ "val_accuracy": 0.20239999890327454
43
  },
44
  {
45
+ "avg_train_loss": 3.6912049436203356,
46
+ "train_accuracy": 0.16672,
47
+ "avg_val_loss": 3.4289465795589398,
48
+ "val_accuracy": 0.2176000028848648
49
  },
50
  {
51
+ "avg_train_loss": 3.6327530968829493,
52
+ "train_accuracy": 0.18014,
53
+ "avg_val_loss": 3.3895603614517404,
54
+ "val_accuracy": 0.2402999997138977
55
  },
56
  {
57
+ "avg_train_loss": 3.578799500489784,
58
+ "train_accuracy": 0.19102,
59
+ "avg_val_loss": 3.3035911849782438,
60
+ "val_accuracy": 0.2533000111579895
61
  },
62
  {
63
+ "avg_train_loss": 3.5295029982276587,
64
+ "train_accuracy": 0.20518,
65
+ "avg_val_loss": 3.2572307345233384,
66
+ "val_accuracy": 0.2711000144481659
67
  },
68
  {
69
+ "avg_train_loss": 3.49012098257499,
70
+ "train_accuracy": 0.21266,
71
+ "avg_val_loss": 3.194729382478738,
72
+ "val_accuracy": 0.29030001163482666
73
  },
74
  {
75
+ "avg_train_loss": 3.444543719901453,
76
+ "train_accuracy": 0.22732,
77
+ "avg_val_loss": 3.1562333891663372,
78
+ "val_accuracy": 0.2962000072002411
79
  },
80
  {
81
+ "avg_train_loss": 3.409660898206179,
82
+ "train_accuracy": 0.23328,
83
+ "avg_val_loss": 3.1249258306962027,
84
+ "val_accuracy": 0.303600013256073
85
  },
86
  {
87
+ "avg_train_loss": 3.365924084278019,
88
+ "train_accuracy": 0.2484,
89
+ "avg_val_loss": 3.1003157217291335,
90
+ "val_accuracy": 0.3172000050544739
91
  },
92
  {
93
+ "avg_train_loss": 3.3438522516918914,
94
+ "train_accuracy": 0.25078,
95
+ "avg_val_loss": 3.0775443934187106,
96
+ "val_accuracy": 0.3222000002861023
97
  },
98
  {
99
+ "avg_train_loss": 3.310065208188713,
100
+ "train_accuracy": 0.26132,
101
+ "avg_val_loss": 3.0591898085195806,
102
+ "val_accuracy": 0.3260999917984009
103
  },
104
  {
105
+ "avg_train_loss": 3.2758816282462586,
106
+ "train_accuracy": 0.2669,
107
+ "avg_val_loss": 3.0423757818680777,
108
+ "val_accuracy": 0.3319999873638153
109
  },
110
  {
111
+ "avg_train_loss": 3.2471999869017343,
112
+ "train_accuracy": 0.276,
113
+ "avg_val_loss": 3.0076527655879155,
114
+ "val_accuracy": 0.3409999907016754
115
+ },
116
+ {
117
+ "avg_train_loss": 3.2196996800429987,
118
+ "train_accuracy": 0.28116,
119
+ "avg_val_loss": 2.9659501087816458,
120
+ "val_accuracy": 0.3515999913215637
121
+ },
122
+ {
123
+ "avg_train_loss": 3.195254132875701,
124
+ "train_accuracy": 0.28704,
125
+ "avg_val_loss": 2.9751936514166335,
126
+ "val_accuracy": 0.35580000281333923
127
+ },
128
+ {
129
+ "avg_train_loss": 3.171722950532918,
130
+ "train_accuracy": 0.29404,
131
+ "avg_val_loss": 2.9623639070535006,
132
+ "val_accuracy": 0.3531999886035919
133
+ },
134
+ {
135
+ "avg_train_loss": 3.1580248549771124,
136
+ "train_accuracy": 0.2984,
137
+ "avg_val_loss": 2.909536240976068,
138
+ "val_accuracy": 0.3702000081539154
139
+ },
140
+ {
141
+ "avg_train_loss": 3.125413342814921,
142
+ "train_accuracy": 0.30764,
143
+ "avg_val_loss": 2.8885996371884888,
144
+ "val_accuracy": 0.3765999972820282
145
+ },
146
+ {
147
+ "avg_train_loss": 3.11991933300672,
148
+ "train_accuracy": 0.30932,
149
+ "avg_val_loss": 2.8885996371884888,
150
+ "val_accuracy": 0.3788999915122986
151
+ },
152
+ {
153
+ "avg_train_loss": 3.0930506728799143,
154
+ "train_accuracy": 0.3174,
155
+ "avg_val_loss": 2.852832456178303,
156
+ "val_accuracy": 0.3903000056743622
157
+ },
158
+ {
159
+ "avg_train_loss": 3.0655275760099405,
160
+ "train_accuracy": 0.32236,
161
+ "avg_val_loss": 2.868115388894383,
162
+ "val_accuracy": 0.38449999690055847
163
+ },
164
+ {
165
+ "avg_train_loss": 3.0548000393621146,
166
+ "train_accuracy": 0.32792,
167
+ "avg_val_loss": 2.8296788855444026,
168
+ "val_accuracy": 0.3880999982357025
169
+ },
170
+ {
171
+ "avg_train_loss": 3.032537115809253,
172
+ "train_accuracy": 0.3308,
173
+ "avg_val_loss": 2.7847178012509888,
174
+ "val_accuracy": 0.4097999930381775
175
+ },
176
+ {
177
+ "avg_train_loss": 3.009990167434868,
178
+ "train_accuracy": 0.3352,
179
+ "avg_val_loss": 2.7986656864987145,
180
+ "val_accuracy": 0.4027999937534332
181
+ },
182
+ {
183
+ "avg_train_loss": 2.9918381585489455,
184
+ "train_accuracy": 0.34312,
185
+ "avg_val_loss": 2.7371235135235366,
186
+ "val_accuracy": 0.42089998722076416
187
+ },
188
+ {
189
+ "avg_train_loss": 2.9780573729054094,
190
+ "train_accuracy": 0.34844,
191
+ "avg_val_loss": 2.7818868130068237,
192
+ "val_accuracy": 0.4077000021934509
193
+ },
194
+ {
195
+ "avg_train_loss": 2.965448642020945,
196
+ "train_accuracy": 0.34998,
197
+ "avg_val_loss": 2.7794544847705698,
198
  "val_accuracy": 0.4027000069618225
199
  },
200
  {
201
+ "avg_train_loss": 2.949932183451055,
202
+ "train_accuracy": 0.354,
203
+ "avg_val_loss": 2.7443399550039556,
204
+ "val_accuracy": 0.4169999957084656
205
+ },
206
+ {
207
+ "avg_train_loss": 2.934478478968296,
208
+ "train_accuracy": 0.35772,
209
+ "avg_val_loss": 2.7268808099287973,
210
+ "val_accuracy": 0.4259999990463257
211
+ },
212
+ {
213
+ "avg_train_loss": 2.925810183710454,
214
+ "train_accuracy": 0.36066,
215
+ "avg_val_loss": 2.7194407016416138,
216
+ "val_accuracy": 0.4253999888896942
217
+ },
218
+ {
219
+ "avg_train_loss": 2.9101742749933694,
220
+ "train_accuracy": 0.36768,
221
+ "avg_val_loss": 2.682816324354727,
222
+ "val_accuracy": 0.4357999861240387
223
+ },
224
+ {
225
+ "avg_train_loss": 2.897818704395343,
226
+ "train_accuracy": 0.36902,
227
+ "avg_val_loss": 2.7000809681566458,
228
+ "val_accuracy": 0.4271000027656555
229
+ },
230
+ {
231
+ "avg_train_loss": 2.882525757450582,
232
+ "train_accuracy": 0.3717,
233
+ "avg_val_loss": 2.6881412554390822,
234
+ "val_accuracy": 0.4334000051021576
235
+ },
236
+ {
237
+ "avg_train_loss": 2.868310006683135,
238
+ "train_accuracy": 0.37626,
239
+ "avg_val_loss": 2.696658943272844,
240
+ "val_accuracy": 0.4318999946117401
241
+ },
242
+ {
243
+ "avg_train_loss": 2.8593416683509223,
244
+ "train_accuracy": 0.37938,
245
+ "avg_val_loss": 2.6929442973076543,
246
+ "val_accuracy": 0.4361000061035156
247
+ },
248
+ {
249
+ "avg_train_loss": 2.8498354638019183,
250
+ "train_accuracy": 0.38216,
251
+ "avg_val_loss": 2.6896122799643987,
252
+ "val_accuracy": 0.43959999084472656
253
+ },
254
+ {
255
+ "avg_train_loss": 2.841897433066307,
256
+ "train_accuracy": 0.38478,
257
+ "avg_val_loss": 2.6792281911342957,
258
+ "val_accuracy": 0.4350000023841858
259
+ },
260
+ {
261
+ "avg_train_loss": 2.829337977387411,
262
+ "train_accuracy": 0.38544,
263
+ "avg_val_loss": 2.6498478756675237,
264
+ "val_accuracy": 0.44699999690055847
265
+ },
266
+ {
267
+ "avg_train_loss": 2.817666833967809,
268
+ "train_accuracy": 0.38926,
269
+ "avg_val_loss": 2.6291940423506723,
270
+ "val_accuracy": 0.454800009727478
271
+ },
272
+ {
273
+ "avg_train_loss": 2.798290427993326,
274
+ "train_accuracy": 0.39618,
275
+ "avg_val_loss": 2.6448449243473102,
276
+ "val_accuracy": 0.4528999924659729
277
+ },
278
+ {
279
+ "avg_train_loss": 2.7893726972057995,
280
+ "train_accuracy": 0.39842,
281
+ "avg_val_loss": 2.6335499437549448,
282
+ "val_accuracy": 0.447299987077713
283
+ },
284
+ {
285
+ "avg_train_loss": 2.779167408528535,
286
+ "train_accuracy": 0.401,
287
+ "avg_val_loss": 2.625039595591871,
288
+ "val_accuracy": 0.4578000009059906
289
+ },
290
+ {
291
+ "avg_train_loss": 2.769523390723616,
292
+ "train_accuracy": 0.40362,
293
+ "avg_val_loss": 2.606762849831883,
294
+ "val_accuracy": 0.4546999931335449
295
+ },
296
+ {
297
+ "avg_train_loss": 2.7618973175887866,
298
+ "train_accuracy": 0.40552,
299
+ "avg_val_loss": 2.6070912035205698,
300
+ "val_accuracy": 0.4577000141143799
301
+ },
302
+ {
303
+ "avg_train_loss": 2.744235341811119,
304
+ "train_accuracy": 0.4127,
305
+ "avg_val_loss": 2.577495912962322,
306
+ "val_accuracy": 0.4657000005245209
307
+ },
308
+ {
309
+ "avg_train_loss": 2.7439488306679687,
310
+ "train_accuracy": 0.4089,
311
+ "avg_val_loss": 2.610289561597607,
312
+ "val_accuracy": 0.45730000734329224
313
+ },
314
+ {
315
+ "avg_train_loss": 2.726577682263406,
316
+ "train_accuracy": 0.41524,
317
+ "avg_val_loss": 2.5453265226339994,
318
+ "val_accuracy": 0.4781000018119812
319
+ },
320
+ {
321
+ "avg_train_loss": 2.7213859829451423,
322
+ "train_accuracy": 0.41772,
323
+ "avg_val_loss": 2.5277760179736948,
324
+ "val_accuracy": 0.4763000011444092
325
+ },
326
+ {
327
+ "avg_train_loss": 2.712547524810752,
328
+ "train_accuracy": 0.421,
329
+ "avg_val_loss": 2.5841199657585046,
330
+ "val_accuracy": 0.4708000123500824
331
+ },
332
+ {
333
+ "avg_train_loss": 2.6989601530382394,
334
+ "train_accuracy": 0.4234,
335
+ "avg_val_loss": 2.5469041655335247,
336
+ "val_accuracy": 0.47589999437332153
337
+ },
338
+ {
339
+ "avg_train_loss": 2.6853119339174625,
340
+ "train_accuracy": 0.42606,
341
+ "avg_val_loss": 2.5726125210146362,
342
+ "val_accuracy": 0.4747999906539917
343
+ },
344
+ {
345
+ "avg_train_loss": 2.6801007284837612,
346
+ "train_accuracy": 0.42704,
347
+ "avg_val_loss": 2.5191534501087816,
348
+ "val_accuracy": 0.4855000078678131
349
+ },
350
+ {
351
+ "avg_train_loss": 2.673392378765604,
352
+ "train_accuracy": 0.429,
353
+ "avg_val_loss": 2.543144902096519,
354
+ "val_accuracy": 0.46720001101493835
355
+ },
356
+ {
357
+ "avg_train_loss": 2.6638625219959735,
358
+ "train_accuracy": 0.43398,
359
+ "avg_val_loss": 2.537560185299644,
360
+ "val_accuracy": 0.486299991607666
361
+ },
362
+ {
363
+ "avg_train_loss": 2.651780778184876,
364
+ "train_accuracy": 0.43742,
365
+ "avg_val_loss": 2.536473914037777,
366
+ "val_accuracy": 0.4837999939918518
367
+ },
368
+ {
369
+ "avg_train_loss": 2.6511071432581947,
370
+ "train_accuracy": 0.43532,
371
+ "avg_val_loss": 2.520467637460443,
372
+ "val_accuracy": 0.4832000136375427
373
+ },
374
+ {
375
+ "avg_train_loss": 2.6305635774227056,
376
+ "train_accuracy": 0.44088,
377
+ "avg_val_loss": 2.49658203125,
378
+ "val_accuracy": 0.4912000000476837
379
+ },
380
+ {
381
+ "avg_train_loss": 2.634286204262463,
382
+ "train_accuracy": 0.44186,
383
+ "avg_val_loss": 2.554328242434731,
384
+ "val_accuracy": 0.47690001130104065
385
+ },
386
+ {
387
+ "avg_train_loss": 2.624523153390421,
388
+ "train_accuracy": 0.44464,
389
+ "avg_val_loss": 2.492234821561017,
390
+ "val_accuracy": 0.4921000003814697
391
+ },
392
+ {
393
+ "avg_train_loss": 2.6103764879124243,
394
+ "train_accuracy": 0.44712,
395
+ "avg_val_loss": 2.4733666528629352,
396
+ "val_accuracy": 0.49219998717308044
397
+ },
398
+ {
399
+ "avg_train_loss": 2.597310994592164,
400
+ "train_accuracy": 0.45222,
401
+ "avg_val_loss": 2.4752954410601267,
402
+ "val_accuracy": 0.4984999895095825
403
+ },
404
+ {
405
+ "avg_train_loss": 2.6000741299460914,
406
+ "train_accuracy": 0.45098,
407
+ "avg_val_loss": 2.4956019920638846,
408
+ "val_accuracy": 0.4957999885082245
409
+ },
410
+ {
411
+ "avg_train_loss": 2.585392991295251,
412
+ "train_accuracy": 0.45412,
413
+ "avg_val_loss": 2.4977424718156644,
414
+ "val_accuracy": 0.4909999966621399
415
+ },
416
+ {
417
+ "avg_train_loss": 2.5859423464216538,
418
+ "train_accuracy": 0.45544,
419
+ "avg_val_loss": 2.4699751455572585,
420
+ "val_accuracy": 0.4993000030517578
421
+ },
422
+ {
423
+ "avg_train_loss": 2.5671919788545963,
424
+ "train_accuracy": 0.4598,
425
+ "avg_val_loss": 2.4434667659711233,
426
+ "val_accuracy": 0.5040000081062317
427
+ },
428
+ {
429
+ "avg_train_loss": 2.552455409103647,
430
+ "train_accuracy": 0.4653,
431
+ "avg_val_loss": 2.4661942494066458,
432
+ "val_accuracy": 0.4943000078201294
433
+ },
434
+ {
435
+ "avg_train_loss": 2.5519124113995098,
436
+ "train_accuracy": 0.46254,
437
+ "avg_val_loss": 2.43858723700801,
438
+ "val_accuracy": 0.5048999786376953
439
+ },
440
+ {
441
+ "avg_train_loss": 2.5457511021353096,
442
+ "train_accuracy": 0.4658,
443
+ "avg_val_loss": 2.48284912109375,
444
+ "val_accuracy": 0.5008000135421753
445
+ },
446
+ {
447
+ "avg_train_loss": 2.539912354915648,
448
+ "train_accuracy": 0.46686,
449
+ "avg_val_loss": 2.4563181430478638,
450
+ "val_accuracy": 0.5048999786376953
451
+ },
452
+ {
453
+ "avg_train_loss": 2.5310022376687327,
454
+ "train_accuracy": 0.4681,
455
+ "avg_val_loss": 2.4287455112119263,
456
+ "val_accuracy": 0.5077000260353088
457
+ },
458
+ {
459
+ "avg_train_loss": 2.5193640140011486,
460
+ "train_accuracy": 0.4733,
461
+ "avg_val_loss": 2.4722490914260287,
462
+ "val_accuracy": 0.4941999912261963
463
+ },
464
+ {
465
+ "avg_train_loss": 2.511948669353105,
466
+ "train_accuracy": 0.47668,
467
+ "avg_val_loss": 2.4270836552487145,
468
+ "val_accuracy": 0.510200023651123
469
+ },
470
+ {
471
+ "avg_train_loss": 2.5021252122986346,
472
+ "train_accuracy": 0.47688,
473
+ "avg_val_loss": 2.3986882076987737,
474
+ "val_accuracy": 0.5188000202178955
475
+ },
476
+ {
477
+ "avg_train_loss": 2.4900416806530767,
478
+ "train_accuracy": 0.48144,
479
+ "avg_val_loss": 2.4308195476290546,
480
+ "val_accuracy": 0.5117999911308289
481
+ },
482
+ {
483
+ "avg_train_loss": 2.493807424059914,
484
+ "train_accuracy": 0.48098,
485
+ "avg_val_loss": 2.4242190590387658,
486
+ "val_accuracy": 0.5123999714851379
487
+ },
488
+ {
489
+ "avg_train_loss": 2.4797395354951433,
490
+ "train_accuracy": 0.4871,
491
+ "avg_val_loss": 2.4191836586481408,
492
+ "val_accuracy": 0.5139999985694885
493
+ },
494
+ {
495
+ "avg_train_loss": 2.474879029461795,
496
+ "train_accuracy": 0.48526,
497
+ "avg_val_loss": 2.4089353537257714,
498
+ "val_accuracy": 0.5188000202178955
499
+ },
500
+ {
501
+ "avg_train_loss": 2.4681123287781426,
502
+ "train_accuracy": 0.48606,
503
+ "avg_val_loss": 2.3880093731457674,
504
+ "val_accuracy": 0.5206000208854675
505
+ },
506
+ {
507
+ "avg_train_loss": 2.464694506219586,
508
+ "train_accuracy": 0.48762,
509
+ "avg_val_loss": 2.4599369869956487,
510
+ "val_accuracy": 0.5094000101089478
511
+ },
512
+ {
513
+ "avg_train_loss": 2.4511944366538008,
514
+ "train_accuracy": 0.4928,
515
+ "avg_val_loss": 2.3902290440812894,
516
+ "val_accuracy": 0.5202000141143799
517
+ },
518
+ {
519
+ "avg_train_loss": 2.4458736345896024,
520
+ "train_accuracy": 0.4954,
521
+ "avg_val_loss": 2.3775387534612342,
522
+ "val_accuracy": 0.5228999853134155
523
+ },
524
+ {
525
+ "avg_train_loss": 2.4391189106285114,
526
+ "train_accuracy": 0.4964,
527
+ "avg_val_loss": 2.3863923278035997,
528
+ "val_accuracy": 0.5250999927520752
529
+ },
530
+ {
531
+ "avg_train_loss": 2.431982190102872,
532
+ "train_accuracy": 0.49786,
533
+ "avg_val_loss": 2.399633093725277,
534
+ "val_accuracy": 0.5228000283241272
535
+ },
536
+ {
537
+ "avg_train_loss": 2.4256825097991377,
538
+ "train_accuracy": 0.49946,
539
+ "avg_val_loss": 2.4045251773882517,
540
+ "val_accuracy": 0.515500009059906
541
+ },
542
+ {
543
+ "avg_train_loss": 2.4132598466275597,
544
+ "train_accuracy": 0.50316,
545
+ "avg_val_loss": 2.38846037659464,
546
+ "val_accuracy": 0.5232999920845032
547
+ },
548
+ {
549
+ "avg_train_loss": 2.4151184967411754,
550
+ "train_accuracy": 0.50272,
551
+ "avg_val_loss": 2.3768584818779668,
552
+ "val_accuracy": 0.5232999920845032
553
+ },
554
+ {
555
+ "avg_train_loss": 2.405930356906198,
556
+ "train_accuracy": 0.50574,
557
+ "avg_val_loss": 2.378061415273932,
558
+ "val_accuracy": 0.5267000198364258
559
+ },
560
+ {
561
+ "avg_train_loss": 2.4047722526828346,
562
+ "train_accuracy": 0.50608,
563
+ "avg_val_loss": 2.3851123278654076,
564
+ "val_accuracy": 0.5238000154495239
565
+ },
566
+ {
567
+ "avg_train_loss": 2.3914314154773724,
568
+ "train_accuracy": 0.50882,
569
+ "avg_val_loss": 2.3767078254796283,
570
+ "val_accuracy": 0.5264999866485596
571
+ },
572
+ {
573
+ "avg_train_loss": 2.3860856683357903,
574
+ "train_accuracy": 0.51156,
575
+ "avg_val_loss": 2.376488021657437,
576
+ "val_accuracy": 0.5235000252723694
577
+ },
578
+ {
579
+ "avg_train_loss": 2.386218143546063,
580
+ "train_accuracy": 0.51266,
581
+ "avg_val_loss": 2.3630600941332083,
582
+ "val_accuracy": 0.5235000252723694
583
+ },
584
+ {
585
+ "avg_train_loss": 2.3744330151611583,
586
+ "train_accuracy": 0.51428,
587
+ "avg_val_loss": 2.3920840492731408,
588
+ "val_accuracy": 0.5213000178337097
589
+ },
590
+ {
591
+ "avg_train_loss": 2.3711826228119834,
592
+ "train_accuracy": 0.51532,
593
+ "avg_val_loss": 2.3973187796677213,
594
+ "val_accuracy": 0.5235000252723694
595
+ },
596
+ {
597
+ "avg_train_loss": 2.3610368445706182,
598
+ "train_accuracy": 0.51854,
599
+ "avg_val_loss": 2.3672910279865507,
600
+ "val_accuracy": 0.5357000231742859
601
+ },
602
+ {
603
+ "avg_train_loss": 2.359229836165143,
604
+ "train_accuracy": 0.52028,
605
+ "avg_val_loss": 2.3615512123590783,
606
+ "val_accuracy": 0.5253999829292297
607
+ },
608
+ {
609
+ "avg_train_loss": 2.3491924016372017,
610
+ "train_accuracy": 0.52172,
611
+ "avg_val_loss": 2.3583934156200552,
612
+ "val_accuracy": 0.5288000106811523
613
+ },
614
+ {
615
+ "avg_train_loss": 2.336290584348352,
616
+ "train_accuracy": 0.52592,
617
+ "avg_val_loss": 2.3765073365803007,
618
+ "val_accuracy": 0.5221999883651733
619
+ },
620
+ {
621
+ "avg_train_loss": 2.3395893063081803,
622
+ "train_accuracy": 0.52642,
623
+ "avg_val_loss": 2.377288818359375,
624
+ "val_accuracy": 0.5306000113487244
625
+ },
626
+ {
627
+ "avg_train_loss": 2.3326609063026544,
628
+ "train_accuracy": 0.52884,
629
+ "avg_val_loss": 2.3460967631279668,
630
+ "val_accuracy": 0.5358999967575073
631
+ },
632
+ {
633
+ "avg_train_loss": 2.3343486081608726,
634
+ "train_accuracy": 0.5261,
635
+ "avg_val_loss": 2.3548485478268395,
636
+ "val_accuracy": 0.5310999751091003
637
+ },
638
+ {
639
+ "avg_train_loss": 2.319910573532514,
640
+ "train_accuracy": 0.53014,
641
+ "avg_val_loss": 2.3654568829113924,
642
+ "val_accuracy": 0.5289999842643738
643
+ },
644
+ {
645
+ "avg_train_loss": 2.3231105095590165,
646
+ "train_accuracy": 0.53024,
647
+ "avg_val_loss": 2.3426397782337816,
648
+ "val_accuracy": 0.5408999919891357
649
+ },
650
+ {
651
+ "avg_train_loss": 2.3172146889864638,
652
+ "train_accuracy": 0.52916,
653
+ "avg_val_loss": 2.3707584429390822,
654
+ "val_accuracy": 0.5284000039100647
655
  },
656
  {
657
+ "avg_train_loss": 2.3137913414889284,
658
+ "train_accuracy": 0.5338,
659
+ "avg_val_loss": 2.378921701938291,
660
+ "val_accuracy": 0.5299000144004822
661
  },
662
  {
663
+ "avg_train_loss": 2.292580285188182,
664
+ "train_accuracy": 0.5417,
665
+ "avg_val_loss": 2.3381581366816655,
666
+ "val_accuracy": 0.5406000018119812
667
  },
668
  {
669
+ "avg_train_loss": 2.2915546366625734,
670
+ "train_accuracy": 0.53912,
671
+ "avg_val_loss": 2.36516909056072,
672
+ "val_accuracy": 0.5367000102996826
673
  },
674
  {
675
+ "avg_train_loss": 2.291307615197223,
676
+ "train_accuracy": 0.53922,
677
+ "avg_val_loss": 2.3603519487984572,
678
+ "val_accuracy": 0.535099983215332
679
  },
680
  {
681
+ "avg_train_loss": 2.2937697252958937,
682
+ "train_accuracy": 0.5405,
683
+ "avg_val_loss": 2.3516428500791138,
684
+ "val_accuracy": 0.5401999950408936
685
  },
686
  {
687
+ "avg_train_loss": 2.2855114741703435,
688
+ "train_accuracy": 0.53978,
689
+ "avg_val_loss": 2.3426988818977454,
690
+ "val_accuracy": 0.536300003528595
691
  },
692
  {
693
+ "avg_train_loss": 2.2704710070129552,
694
+ "train_accuracy": 0.54678,
695
+ "avg_val_loss": 2.3620895192592957,
696
+ "val_accuracy": 0.5358999967575073
697
  },
698
  {
699
+ "avg_train_loss": 2.2660938359587393,
700
+ "train_accuracy": 0.5457,
701
+ "avg_val_loss": 2.3297165496439876,
702
+ "val_accuracy": 0.5368000268936157
703
  },
704
  {
705
+ "avg_train_loss": 2.2636455373690865,
706
+ "train_accuracy": 0.54736,
707
+ "avg_val_loss": 2.370776212668117,
708
+ "val_accuracy": 0.5335999727249146
709
  }
710
  ]
performance_plot.png CHANGED
plots.py CHANGED
@@ -5,7 +5,7 @@ with open("performance.json", "r") as f:
5
  performance = json.load(f)
6
 
7
  # Extract values from the performance list
8
- epochs = range(1, len(performance) + 1)
9
  train_losses = [epoch["avg_train_loss"] for epoch in performance]
10
  val_losses = [epoch["avg_val_loss"] for epoch in performance]
11
  train_accuracies = [epoch["train_accuracy"] for epoch in performance]
@@ -22,7 +22,7 @@ plt.xlabel("Epochs")
22
  plt.ylabel("Loss")
23
  plt.title("Training and Validation Loss")
24
  plt.legend()
25
- plt.xticks(epochs)
26
 
27
  # Subplot for Accuracy
28
  plt.subplot(1, 2, 2)
@@ -32,7 +32,7 @@ plt.xlabel("Epochs")
32
  plt.ylabel("Accuracy")
33
  plt.title("Training and Validation Accuracy")
34
  plt.legend()
35
- plt.xticks(epochs)
36
 
37
  plt.tight_layout()
38
 
 
5
  performance = json.load(f)
6
 
7
  # Extract values from the performance list
8
+ epochs = list(range(1, len(performance) + 1))
9
  train_losses = [epoch["avg_train_loss"] for epoch in performance]
10
  val_losses = [epoch["avg_val_loss"] for epoch in performance]
11
  train_accuracies = [epoch["train_accuracy"] for epoch in performance]
 
22
  plt.ylabel("Loss")
23
  plt.title("Training and Validation Loss")
24
  plt.legend()
25
+ plt.xticks([1] + epochs[9::10] + [epochs[-1]])
26
 
27
  # Subplot for Accuracy
28
  plt.subplot(1, 2, 2)
 
32
  plt.ylabel("Accuracy")
33
  plt.title("Training and Validation Accuracy")
34
  plt.legend()
35
+ plt.xticks([1] + epochs[9::10] + [epochs[-1]])
36
 
37
  plt.tight_layout()
38
 
predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
train.py DELETED
@@ -1,394 +0,0 @@
1
- #!/usr/bin/env python3
2
- import os
3
- import csv
4
- import json
5
- from tqdm import tqdm
6
- import torch
7
- import argparse
8
- from PIL import Image
9
- from torchvision import transforms
10
- from torch.utils.data import DataLoader, Dataset
11
- from model import MyModel
12
- import numpy as np
13
-
14
-
15
- class MiniPlaces(Dataset):
16
- def __init__(self, root_dir, split, transform=None, label_dict=None):
17
- """
18
- Initialize the MiniPlaces dataset with the root directory for the images,
19
- the split (train/val/test), an optional data transformation,
20
- and an optional label dictionary.
21
-
22
- Args:
23
- root_dir (str): Root directory for the MiniPlaces images.
24
- split (str): Split to use ('train', 'val', or 'test').
25
- transform (callable, optional): Optional data transformation to apply to the images.
26
- label_dict (dict, optional): Optional dictionary mapping integer labels to class names.
27
- """
28
- assert split in ['train', 'val', 'test']
29
- self.root_dir = root_dir
30
- self.split = split
31
- self.transform = transform
32
- self.filenames = []
33
- self.labels = []
34
-
35
- self.label_dict = label_dict if label_dict is not None else {}
36
-
37
- with open(os.path.join(self.root_dir, self.split + '.txt')) as r:
38
- lines = r.readlines()
39
- for line in lines:
40
- line = line.split()
41
- self.filenames.append(line[0])
42
- if split == 'test':
43
- label = line[0]
44
- else:
45
- label = int(line[1])
46
- self.labels.append(label)
47
- if split == 'train':
48
- text_label = line[0].split('/')[2]
49
- self.label_dict[label] = text_label
50
-
51
- def __len__(self):
52
- """
53
- Return the number of images in the dataset.
54
-
55
- Returns:
56
- int: Number of images in the dataset.
57
- """
58
- return len(self.labels)
59
-
60
- def __getitem__(self, idx):
61
- """
62
- Return a single image and its corresponding label when given an index.
63
-
64
- Args:
65
- idx (int): Index of the image to retrieve.
66
-
67
- Returns:
68
- tuple: Tuple containing the image and its label.
69
- """
70
- if self.transform is not None:
71
- image = self.transform(
72
- Image.open(os.path.join(self.root_dir, "images", self.filenames[idx])))
73
- else:
74
- image = Image.open(os.path.join(self.root_dir, "images", self.filenames[idx]))
75
- label = self.labels[idx]
76
- return image, label
77
-
78
-
79
- def create_train_transform():
80
- """
81
- Create training data transformation with augmentation
82
- """
83
- image_net_mean = torch.Tensor([0.485, 0.456, 0.406])
84
- image_net_std = torch.Tensor([0.229, 0.224, 0.225])
85
-
86
- return transforms.Compose([
87
- transforms.RandomResizedCrop(128, scale=(0.8, 1.0)),
88
- transforms.RandomHorizontalFlip(p=0.5),
89
- transforms.ColorJitter(
90
- brightness=0.4,
91
- contrast=0.4,
92
- saturation=0.4,
93
- hue=0.1
94
- ),
95
- transforms.RandomAffine(
96
- degrees=15, # rotation
97
- translate=(0.1, 0.1), # horizontal/vertical translation
98
- scale=(0.9, 1.1), # scale
99
- ),
100
- transforms.ToTensor(),
101
- transforms.Resize((128, 128)),
102
- transforms.Normalize(image_net_mean, image_net_std)
103
- ])
104
-
105
-
106
- def create_val_transform():
107
- """
108
- Create validation/test data transformation without augmentation
109
- """
110
- image_net_mean = torch.Tensor([0.485, 0.456, 0.406])
111
- image_net_std = torch.Tensor([0.229, 0.224, 0.225])
112
-
113
- return transforms.Compose([
114
- transforms.ToTensor(),
115
- transforms.Resize((128, 128)),
116
- transforms.Normalize(image_net_mean, image_net_std)
117
- ])
118
-
119
-
120
- def evaluate(model, test_loader, criterion, device):
121
- """
122
- Evaluate the CNN classifier on the validation set.
123
-
124
- Args:
125
- model (CNN): CNN classifier to evaluate.
126
- test_loader (torch.utils.data.DataLoader): Data loader for the test set.
127
- criterion (callable): Loss function to use for evaluation.
128
- device (torch.device): Device to use for evaluation.
129
-
130
- Returns:
131
- float: Average loss on the test set.
132
- float: Accuracy on the test set.
133
- """
134
- model.eval() # Set model to evaluation mode
135
-
136
- with torch.no_grad():
137
- total_loss = 0.0
138
- num_correct = 0
139
- num_samples = 0
140
-
141
- for inputs, labels in test_loader:
142
- # Move inputs and labels to device
143
- inputs = inputs.to(device)
144
- labels = labels.to(device)
145
-
146
- # Compute the logits and loss
147
- logits = model(inputs)
148
- loss = criterion(logits, labels)
149
- total_loss += loss.item()
150
-
151
- # Compute the accuracy
152
- _, predictions = torch.max(logits, dim=1)
153
- num_correct += (predictions == labels).sum().item()
154
- num_samples += len(inputs)
155
-
156
- # Evaluate the model on the validation set
157
- avg_loss = total_loss / len(test_loader)
158
- accuracy = num_correct / num_samples
159
-
160
- return avg_loss, accuracy
161
-
162
-
163
- def train(model, train_loader, val_loader, optimizer, criterion, device,
164
- num_epochs):
165
- """
166
- Train the CNN classifer on the training set and evaluate it on the validation set every epoch.
167
-
168
- Args:
169
- model (CNN): CNN classifier to train.
170
- train_loader (torch.utils.data.DataLoader): Data loader for the training set.
171
- val_loader (torch.utils.data.DataLoader): Data loader for the validation set.
172
- optimizer (torch.optim.Optimizer): Optimizer to use for training.
173
- criterion (callable): Loss function to use for training.
174
- device (torch.device): Device to use for training.
175
- num_epochs (int): Number of epochs to train the model.
176
- """
177
-
178
- # Place the model on device
179
- model = model.to(device)
180
-
181
- # Define early stopping parameters
182
- patience = 5 # Number of epochs to wait for improvement
183
- best_val_accuracy = 0.0 # Best validation accuracy so far
184
- epochs_without_improvement = 0 # Counter for epochs without improvement
185
- best_model_state = None # To store the state of the best model
186
-
187
- # Performance tracking
188
- performance = []
189
-
190
- for epoch in range(num_epochs):
191
- model.train() # Set model to training mode
192
-
193
- running_loss = 0.0 # Track cumulative loss for averaging
194
- correct_predictions = 0
195
- total_samples = 0
196
-
197
- with tqdm(total=len(train_loader),
198
- desc=f'Epoch {epoch + 1}/{num_epochs}',
199
- position=0,
200
- leave=True) as pbar:
201
- for inputs, labels in train_loader:
202
- # Move inputs and labels to device
203
- inputs = inputs.to(device)
204
- labels = labels.to(device)
205
-
206
- # Zero the gradients
207
- optimizer.zero_grad()
208
-
209
- # Compute the logits and loss
210
- logits = model(inputs)
211
- loss = criterion(logits, labels)
212
-
213
- # Backward pass: Compute gradients
214
- loss.backward()
215
-
216
- # Optimize model parameters
217
- optimizer.step()
218
-
219
- # Track running loss
220
- running_loss += loss.item()
221
-
222
- # Track accuracy
223
- _, predicted = logits.max(1)
224
- correct_predictions += (predicted == labels).sum().item()
225
- total_samples += labels.size(0)
226
-
227
- # Update the progress bar
228
- pbar.update(1)
229
- pbar.set_postfix(loss=loss.item())
230
-
231
- # Calculate average loss and accuracy
232
- avg_train_loss = running_loss / len(train_loader)
233
- train_accuracy = correct_predictions / total_samples
234
- avg_val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
235
-
236
- performance.append({
237
- "avg_train_loss": avg_train_loss,
238
- "train_accuracy": train_accuracy,
239
- "avg_val_loss": avg_val_loss,
240
- "val_accuracy": val_accuracy
241
- })
242
- print(
243
- f"Train Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.4f} "
244
- f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}"
245
- )
246
-
247
- # Check for early stopping
248
- if val_accuracy > best_val_accuracy:
249
- best_val_accuracy = val_accuracy
250
- epochs_without_improvement = 0 # Reset counter if there's an improvement
251
-
252
- # Save the model checkpoint for the best model
253
- best_model_state = {
254
- 'model_state_dict': model.module.state_dict(),
255
- 'optimizer_state_dict': optimizer.state_dict(),
256
- 'epoch': epoch,
257
- }
258
- else:
259
- epochs_without_improvement += 1
260
-
261
- # Early stopping condition
262
- if epochs_without_improvement >= patience:
263
- print(f"Early stopping at epoch {epoch + 1}.")
264
- break # Stop training if no improvement for 'patience' epochs
265
-
266
- # Save the performance list to a JSON file
267
- with open("performance.json", "w") as f:
268
- json.dump(performance, f, indent=4)
269
- torch.save(best_model_state, 'model.ckpt')
270
-
271
-
272
- def test(model, test_loader, device):
273
- """
274
- Get predictions for the test set.
275
-
276
- Args:
277
- model (CNN): classifier to evaluate.
278
- test_loader (torch.utils.data.DataLoader): Data loader for the test set.
279
- device (torch.device): Device to use for evaluation.
280
-
281
- Returns:
282
- float: Average loss on the test set.
283
- float: Accuracy on the test set.
284
- """
285
- model = model.to(device)
286
- model.eval() # Set model to evaluation mode
287
-
288
- with torch.no_grad():
289
- all_preds = []
290
-
291
- for inputs, labels in test_loader:
292
- # Move inputs and labels to device
293
- inputs = inputs.to(device)
294
-
295
- logits = model(inputs)
296
-
297
- _, predictions = torch.max(logits, dim=1)
298
- preds = list(zip(labels, predictions.tolist()))
299
- all_preds.extend(preds)
300
-
301
- return all_preds
302
-
303
-
304
- def write_predictions(preds, filename):
305
- with open(filename, 'w') as f:
306
- writer = csv.writer(f, delimiter=',')
307
- for im, pred in preds:
308
- writer.writerow((im, pred))
309
-
310
-
311
- def main(args):
312
- image_net_mean = torch.Tensor([0.485, 0.456, 0.406])
313
- image_net_std = torch.Tensor([0.229, 0.224, 0.225])
314
-
315
- # Define data transformation
316
- data_transform = transforms.Compose([
317
- transforms.ToTensor(),
318
- transforms.Resize((128, 128)),
319
- transforms.Normalize(image_net_mean, image_net_std),
320
- ])
321
-
322
- # Separate transforms for training and validation
323
- train_transform = create_train_transform()
324
- val_transform = create_val_transform()
325
-
326
- # Create datasets
327
- data_root = 'data'
328
- miniplaces_train = MiniPlaces(data_root,
329
- split='train',
330
- transform=data_transform)
331
- miniplaces_val = MiniPlaces(data_root,
332
- split='val',
333
- transform=data_transform,
334
- label_dict=miniplaces_train.label_dict)
335
-
336
- # Create the dataloaders
337
-
338
- # Define the batch size and number of workers
339
- batch_size = int(args.batch_size)
340
- num_workers = 2
341
-
342
- # Create DataLoader for training and validation sets
343
- train_loader = DataLoader(miniplaces_train,
344
- batch_size=batch_size,
345
- num_workers=num_workers,
346
- shuffle=True)
347
- val_loader = DataLoader(miniplaces_val,
348
- batch_size=batch_size,
349
- num_workers=num_workers,
350
- shuffle=False)
351
-
352
- device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else 'cpu') # TODO: check cuda
353
-
354
- model = MyModel(num_classes=len(miniplaces_train.label_dict))
355
-
356
- # optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=False)
357
- optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, dampening=0, weight_decay=1e-4, nesterov=True)
358
-
359
- print("PARAMS NUM:", sum(p.numel() for p in model.parameters() if p.requires_grad))
360
-
361
- if args.checkpoint:
362
- checkpoint = torch.load(args.checkpoint)
363
- model.load_state_dict(checkpoint['model_state_dict'])
364
- optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
365
-
366
- criterion = torch.nn.CrossEntropyLoss(reduction='mean', label_smoothing=0.1)
367
-
368
- if not args.test:
369
- train(model, train_loader, val_loader, optimizer, criterion,
370
- device, num_epochs=int(args.epochs))
371
-
372
- else:
373
- miniplaces_test = MiniPlaces(data_root,
374
- split='test',
375
- transform=data_transform)
376
- test_loader = DataLoader(miniplaces_test,
377
- batch_size=batch_size,
378
- num_workers=num_workers,
379
- shuffle=False)
380
- checkpoint = torch.load(args.checkpoint, weights_only=True)
381
- model.load_state_dict(checkpoint['model_state_dict'])
382
- preds = test(model, test_loader, device)
383
- write_predictions(preds, 'predictions.csv')
384
-
385
-
386
- if __name__ == "__main__":
387
- parser = argparse.ArgumentParser()
388
- parser.add_argument('--test', action='store_true')
389
- parser.add_argument('--checkpoint')
390
- parser.add_argument('--gpu', default=0)
391
- parser.add_argument('--epochs', default=100)
392
- parser.add_argument('--batch_size', default=32)
393
- args = parser.parse_args()
394
- main(args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_dist.py CHANGED
@@ -2,11 +2,13 @@
2
  import os
3
  import csv
4
  import json
 
5
  from tqdm import tqdm
6
  import torch
7
  import torch.distributed as dist
8
  import torch.multiprocessing as mp
9
  from torch.nn.parallel import DistributedDataParallel as DDP
 
10
  from torch.utils.data.distributed import DistributedSampler
11
  import argparse
12
  from PIL import Image
@@ -36,6 +38,7 @@ def cleanup():
36
  if dist.is_initialized():
37
  dist.barrier() # Synchronize all processes before destroying process group
38
  dist.destroy_process_group()
 
39
 
40
 
41
  class MiniPlaces(Dataset):
@@ -161,6 +164,7 @@ def evaluate(model, test_loader, criterion, device):
161
  with torch.no_grad():
162
  total_loss = 0.0
163
  num_correct = 0
 
164
  num_samples = 0
165
 
166
  for inputs, labels in test_loader:
@@ -173,22 +177,29 @@ def evaluate(model, test_loader, criterion, device):
173
 
174
  _, predictions = torch.max(logits, dim=1)
175
  num_correct += (predictions == labels).sum().item()
 
 
 
 
176
  num_samples += len(inputs)
177
 
178
  # Gather metrics from all processes
179
  world_size = dist.get_world_size()
180
  total_loss = torch.tensor(total_loss).to(device)
181
  num_correct = torch.tensor(num_correct).to(device)
 
182
  num_samples = torch.tensor(num_samples).to(device)
183
 
184
  dist.all_reduce(total_loss, op=dist.ReduceOp.SUM)
185
  dist.all_reduce(num_correct, op=dist.ReduceOp.SUM)
 
186
  dist.all_reduce(num_samples, op=dist.ReduceOp.SUM)
187
 
188
  avg_loss = (total_loss / world_size).item() / len(test_loader)
189
  accuracy = (num_correct / num_samples).item()
 
190
 
191
- return avg_loss, accuracy
192
 
193
 
194
  def train_worker(rank, world_size, args):
@@ -201,15 +212,18 @@ def train_worker(rank, world_size, args):
201
  args (argparse.Namespace): Command-line arguments.
202
  """
203
  try:
 
204
  setup(rank, world_size, args.port)
205
  device = torch.device(f'cuda:{rank}')
206
 
207
  # Define early stopping parameters
208
- patience = 3 # Number of epochs to wait for improvement
209
  best_val_accuracy = 0.0 # Best validation accuracy so far
210
  epochs_without_improvement = 0 # Counter for epochs without improvement
211
  best_model_state = None # To store the state of the best model
212
 
 
 
213
  # Separate transforms for training and validation
214
  train_transform = create_train_transform()
215
  val_transform = create_val_transform()
@@ -233,7 +247,7 @@ def train_worker(rank, world_size, args):
233
  pin_memory=True)
234
 
235
  # Create model and move to GPU
236
- model = MyModel(num_classes=len(miniplaces_train.label_dict))
237
  model = model.to(device)
238
  model = DDP(model, device_ids=[rank])
239
 
@@ -247,6 +261,9 @@ def train_worker(rank, world_size, args):
247
  model.module.load_state_dict(checkpoint['model_state_dict'])
248
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
249
 
 
 
 
250
  if not args.test:
251
  # Training loop
252
  performance = []
@@ -288,7 +305,14 @@ def train_worker(rank, world_size, args):
288
  # Evaluate and log metrics
289
  avg_train_loss = running_loss / len(train_loader)
290
  train_accuracy = correct_predictions / total_samples
291
- avg_val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
 
 
 
 
 
 
 
292
 
293
  if rank == 0: # Only save metrics on rank 0
294
  performance.append({
@@ -327,16 +351,25 @@ def train_worker(rank, world_size, args):
327
  torch.save(best_model_state, 'model.ckpt')
328
 
329
  else: # Testing mode
330
- miniplaces_test = MiniPlaces(data_root, split='test', transform=data_transform)
 
 
 
 
 
 
331
  test_loader = DataLoader(miniplaces_test, batch_size=args.batch_size, num_workers=2, shuffle=False)
332
  checkpoint = torch.load(args.checkpoint, map_location=device)
333
  model.module.load_state_dict(checkpoint['model_state_dict'])
 
334
  preds = test(model, test_loader, device)
335
  if rank == 0: # Only write predictions on rank 0
336
  write_predictions(preds, 'predictions.csv')
 
 
337
  finally:
338
  cleanup()
339
- # Add explicit synchronization before exiting
340
  torch.cuda.synchronize()
341
  if dist.is_initialized():
342
  dist.barrier()
@@ -403,7 +436,7 @@ if __name__ == "__main__":
403
  parser.add_argument('--test', action='store_true')
404
  parser.add_argument('--checkpoint')
405
  parser.add_argument('--epochs', type=int, default=100)
406
- parser.add_argument('--batch_size', type=int, default=32)
407
  parser.add_argument('--port', type=int, default=4224)
408
  args = parser.parse_args()
409
  main(args)
 
2
  import os
3
  import csv
4
  import json
5
+ import warnings
6
  from tqdm import tqdm
7
  import torch
8
  import torch.distributed as dist
9
  import torch.multiprocessing as mp
10
  from torch.nn.parallel import DistributedDataParallel as DDP
11
+ from torch.optim.lr_scheduler import ReduceLROnPlateau
12
  from torch.utils.data.distributed import DistributedSampler
13
  import argparse
14
  from PIL import Image
 
38
  if dist.is_initialized():
39
  dist.barrier() # Synchronize all processes before destroying process group
40
  dist.destroy_process_group()
41
+ torch.cuda.synchronize()
42
 
43
 
44
  class MiniPlaces(Dataset):
 
164
  with torch.no_grad():
165
  total_loss = 0.0
166
  num_correct = 0
167
+ num_correct_top5 = 0
168
  num_samples = 0
169
 
170
  for inputs, labels in test_loader:
 
177
 
178
  _, predictions = torch.max(logits, dim=1)
179
  num_correct += (predictions == labels).sum().item()
180
+
181
+ _, top5_predictions = torch.topk(logits, 5, dim=1)
182
+ num_correct_top5 += (top5_predictions == labels.unsqueeze(1)).any(dim=1).sum().item()
183
+
184
  num_samples += len(inputs)
185
 
186
  # Gather metrics from all processes
187
  world_size = dist.get_world_size()
188
  total_loss = torch.tensor(total_loss).to(device)
189
  num_correct = torch.tensor(num_correct).to(device)
190
+ num_correct_top5 = torch.tensor(num_correct_top5).to(device)
191
  num_samples = torch.tensor(num_samples).to(device)
192
 
193
  dist.all_reduce(total_loss, op=dist.ReduceOp.SUM)
194
  dist.all_reduce(num_correct, op=dist.ReduceOp.SUM)
195
+ dist.all_reduce(num_correct_top5, op=dist.ReduceOp.SUM)
196
  dist.all_reduce(num_samples, op=dist.ReduceOp.SUM)
197
 
198
  avg_loss = (total_loss / world_size).item() / len(test_loader)
199
  accuracy = (num_correct / num_samples).item()
200
+ top5_accuracy = (num_correct_top5 / num_samples).item()
201
 
202
+ return avg_loss, accuracy, top5_accuracy
203
 
204
 
205
  def train_worker(rank, world_size, args):
 
212
  args (argparse.Namespace): Command-line arguments.
213
  """
214
  try:
215
+ warnings.filterwarnings("ignore")
216
  setup(rank, world_size, args.port)
217
  device = torch.device(f'cuda:{rank}')
218
 
219
  # Define early stopping parameters
220
+ patience = 10 # Number of epochs to wait for improvement
221
  best_val_accuracy = 0.0 # Best validation accuracy so far
222
  epochs_without_improvement = 0 # Counter for epochs without improvement
223
  best_model_state = None # To store the state of the best model
224
 
225
+ last_lr = 0
226
+
227
  # Separate transforms for training and validation
228
  train_transform = create_train_transform()
229
  val_transform = create_val_transform()
 
247
  pin_memory=True)
248
 
249
  # Create model and move to GPU
250
+ model = MyModel(num_classes=len(miniplaces_train.label_dict), dropout_rate=0.2)
251
  model = model.to(device)
252
  model = DDP(model, device_ids=[rank])
253
 
 
261
  model.module.load_state_dict(checkpoint['model_state_dict'])
262
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
263
 
264
+ # Initialize the ReduceLROnPlateau scheduler
265
+ scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4)
266
+
267
  if not args.test:
268
  # Training loop
269
  performance = []
 
305
  # Evaluate and log metrics
306
  avg_train_loss = running_loss / len(train_loader)
307
  train_accuracy = correct_predictions / total_samples
308
+ avg_val_loss, val_accuracy, val_top5_accuracy = evaluate(model, val_loader, criterion, device)
309
+
310
+ # Step the scheduler with the validation loss
311
+ scheduler.step(avg_val_loss)
312
+ if scheduler.get_last_lr()[0] != last_lr:
313
+ last_lr = scheduler.get_last_lr()[0]
314
+ if epoch != 0:
315
+ print(f"New learning rate: {scheduler.get_last_lr()[0]}")
316
 
317
  if rank == 0: # Only save metrics on rank 0
318
  performance.append({
 
351
  torch.save(best_model_state, 'model.ckpt')
352
 
353
  else: # Testing mode
354
+ avg_val_loss, val_accuracy, val_top5_accuracy = evaluate(model, val_loader, criterion, device)
355
+ if rank == 0:
356
+ print(f"\nValidation Loss: {avg_val_loss:.4f}\n"
357
+ f"Validation Accuracy: {val_accuracy:.4f}\n"
358
+ f"Validation Top-5 Accuracy: {val_top5_accuracy:.4f}\n")
359
+
360
+ miniplaces_test = MiniPlaces(data_root, split='test', transform=val_transform)
361
  test_loader = DataLoader(miniplaces_test, batch_size=args.batch_size, num_workers=2, shuffle=False)
362
  checkpoint = torch.load(args.checkpoint, map_location=device)
363
  model.module.load_state_dict(checkpoint['model_state_dict'])
364
+
365
  preds = test(model, test_loader, device)
366
  if rank == 0: # Only write predictions on rank 0
367
  write_predictions(preds, 'predictions.csv')
368
+ print("Predictions saved to predictions.csv\n")
369
+
370
  finally:
371
  cleanup()
372
+ # Explicit synchronization before exiting
373
  torch.cuda.synchronize()
374
  if dist.is_initialized():
375
  dist.barrier()
 
436
  parser.add_argument('--test', action='store_true')
437
  parser.add_argument('--checkpoint')
438
  parser.add_argument('--epochs', type=int, default=100)
439
+ parser.add_argument('--batch_size', type=int, default=64)
440
  parser.add_argument('--port', type=int, default=4224)
441
  args = parser.parse_args()
442
  main(args)