| ### Relevant imports & set up | |
| ```python | |
| !pip install geopy > delete.txt | |
| !pip install datasets > delete.txt | |
| !pip install torch torchvision datasets > delete.txt | |
| !pip install huggingface_hub > delete.txt | |
| !rm delete.txt | |
| ``` | |
| ```python | |
| !pip install transformers | |
| import transformers | |
| ``` | |
| ```python | |
| !huggingface-cli login --token [your_token] | |
| ``` | |
| ```python | |
# Mean and standard deviation of latitude / longitude (in degrees). They are
# used to z-score the GPS targets and to map model outputs back to degrees.
lat_mean, lat_std = 39.95156937654321, 0.0005992518588323268
lon_mean, lon_std = -75.19136795987654, 0.0007030395253318959
| ``` | |
| ### Instructions | |
| Our current best-performing model is a weighted ensemble of four models (ConvNeXt, ResNet, ViT, and EfficientNet). To run it on hidden test data, first run the model definitions below. | |
| #### Load and define models | |
| ```python | |
| from transformers import AutoModelForImageClassification, PretrainedConfig, PreTrainedModel | |
| import torch | |
| import torch.nn as nn | |
| import os | |
| from huggingface_hub import PyTorchModelHubMixin, hf_hub_download | |
| from safetensors.torch import load_file | |
class CustomConvNeXtConfig(PretrainedConfig):
    """Configuration for ``CustomConvNeXtModel``.

    Args:
        num_labels: Number of outputs of the model's final layer.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "custom-convnext"

    def __init__(self, num_labels: int = 2, **kwargs) -> None:
        super().__init__(**kwargs)
        # Assigned after the base initializer so the explicit argument is the
        # value that ends up stored on the config.
        self.num_labels = num_labels
class CustomConvNeXtModel(PreTrainedModel):
    """ConvNeXt image model whose classifier is replaced by a new linear head.

    The wrapped Hugging Face model is stored as ``self.convnext``; its
    ``classifier`` is swapped for ``nn.Linear(in_features, num_classes)``.
    """

    config_class = CustomConvNeXtConfig

    def __init__(self, config, model_name="facebook/convnext-tiny-224",
                 num_classes=2, train_final_layer_only=False):
        """Build the backbone and attach a fresh output layer.

        Args:
            config: Configuration object passed to ``PreTrainedModel``.
            model_name: Hub id of the pre-trained ConvNeXt checkpoint to start from.
            num_classes: Number of outputs of the new linear head.
            train_final_layer_only: If true, every parameter whose name does
                not contain ``"classifier"`` is frozen.
        """
        super().__init__(config)
        # Load pre-trained ConvNeXt model from Hugging Face
        self.convnext = AutoModelForImageClassification.from_pretrained(model_name)
        # Reuse the input width of the existing head for the replacement layer.
        in_features = self.convnext.classifier.in_features
        self.convnext.classifier = nn.Linear(in_features, num_classes)
        # Freeze previous weights if only training the final layer
        if train_final_layer_only:
            for name, param in self.convnext.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False
                else:
                    print(f"Unfrozen layer: {name}")

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its output unchanged."""
        return self.convnext(x)

    @classmethod
    def from_pretrained(cls, repo_id, model_name="facebook/convnext-tiny-224", **kwargs):
        """Load model weights and configuration from Hugging Face Hub.

        Args:
            repo_id: Hub repository holding ``model.safetensors`` and ``config.json``.
            model_name: Backbone checkpoint used to construct the architecture
                before the fine-tuned weights are loaded over it.
            **kwargs: ``revision``, ``cache_dir``, ``token``, ``force_download``
                and ``local_files_only`` are forwarded to ``hf_hub_download``;
                any other keyword is ignored.

        Returns:
            The model with the checkpoint weights loaded, in evaluation mode
            (call ``.train()`` before fine-tuning).
        """
        # Previously every extra keyword was dropped, so a private repo or a
        # pinned revision could not be requested through this method.
        hub_kwargs = {
            key: kwargs[key]
            for key in ("revision", "cache_dir", "token", "force_download", "local_files_only")
            if key in kwargs
        }
        model_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors", **hub_kwargs)
        config_path = hf_hub_download(repo_id=repo_id, filename="config.json", **hub_kwargs)
        config = CustomConvNeXtConfig.from_pretrained(config_path)
        model = cls(config=config, model_name=model_name, num_classes=config.num_labels)
        model.load_state_dict(load_file(model_path))
        # Match the Hugging Face convention: loaded models start in eval mode.
        model.eval()
        return model
class CustomResNetConfig(PretrainedConfig):
    """Configuration for ``CustomResNetModel``.

    Args:
        num_labels: Number of outputs of the model's final layer.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "custom-resnet"

    def __init__(self, num_labels: int = 2, **kwargs) -> None:
        super().__init__(**kwargs)
        # Assigned after the base initializer so the explicit argument is the
        # value that ends up stored on the config.
        self.num_labels = num_labels
class CustomResNetModel(nn.Module, PyTorchModelHubMixin):
    """ResNet image model whose classifier is replaced by Flatten + Linear.

    The wrapped Hugging Face model is stored as ``self.resnet``. Weights are
    saved as ``pytorch_model.bin`` next to a ``config.json`` written from
    ``self.config``.
    """

    config_class = CustomResNetConfig

    def __init__(self, model_name="microsoft/resnet-18",
                 num_classes=2,
                 train_final_layer_only=False):
        """Build the backbone and attach a fresh output layer.

        Args:
            model_name: Hub id of the pre-trained ResNet checkpoint to start from.
            num_classes: Number of outputs of the new linear head.
            train_final_layer_only: If true, every parameter whose name does
                not contain ``"classifier"`` is frozen.
        """
        super().__init__()
        # Load pre-trained ResNet model from Hugging Face
        self.resnet = AutoModelForImageClassification.from_pretrained(model_name)
        # The existing classifier is indexed at [1] to reach its Linear layer,
        # whose input width is reused for the replacement head.
        in_features = self.resnet.classifier[1].in_features
        self.resnet.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features, num_classes),
        )
        self.config = CustomResNetConfig(num_labels=num_classes)
        # Freeze previous weights
        if train_final_layer_only:
            for name, param in self.resnet.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False
                else:
                    print(f"Unfrozen layer: {name}")

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its output unchanged."""
        return self.resnet(x)

    def save_pretrained(self, save_directory, **kwargs):
        """Save model weights and custom configuration in Hugging Face format.

        Writes ``pytorch_model.bin`` (the state dict) and the configuration
        files into ``save_directory``, creating it if needed. ``**kwargs`` is
        accepted for interface compatibility and not used.
        """
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))
        self.config.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, repo_id, model_name="microsoft/resnet-18", **kwargs):
        """Load model weights and configuration from Hugging Face Hub or local directory.

        Args:
            repo_id: Either an existing local directory (such as one written by
                ``save_pretrained``) or a Hub repository id. Both must contain
                ``pytorch_model.bin`` and ``config.json``.
            model_name: Backbone checkpoint used to construct the architecture
                before the fine-tuned weights are loaded over it.
            **kwargs: ``revision``, ``cache_dir``, ``token``, ``force_download``
                and ``local_files_only`` are forwarded to ``hf_hub_download``;
                any other keyword is ignored.

        Returns:
            The model with the checkpoint weights loaded, in evaluation mode
            (call ``.train()`` before fine-tuning).
        """
        if os.path.isdir(repo_id):
            # Local directory: lets the output of save_pretrained be reloaded.
            model_path = os.path.join(repo_id, "pytorch_model.bin")
            config_path = os.path.join(repo_id, "config.json")
        else:
            hub_kwargs = {
                key: kwargs[key]
                for key in ("revision", "cache_dir", "token", "force_download", "local_files_only")
                if key in kwargs
            }
            model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", **hub_kwargs)
            config_path = hf_hub_download(repo_id=repo_id, filename="config.json", **hub_kwargs)
        config = CustomResNetConfig.from_pretrained(config_path)
        model = cls(model_name=model_name, num_classes=config.num_labels)
        # weights_only=True restricts unpickling to tensors and plain
        # containers, so a downloaded checkpoint cannot execute arbitrary code.
        state_dict = torch.load(model_path, map_location=torch.device("cpu"), weights_only=True)
        model.load_state_dict(state_dict)
        # Match the Hugging Face convention: loaded models start in eval mode.
        model.eval()
        return model
class CustomEfficientNetConfig(PretrainedConfig):
    """Configuration for ``CustomEfficientNetModel``.

    Args:
        num_labels: Number of outputs of the model's final layer.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "custom-efficientnet"

    def __init__(self, num_labels: int = 2, **kwargs) -> None:
        super().__init__(**kwargs)
        # Assigned after the base initializer so the explicit argument is the
        # value that ends up stored on the config.
        self.num_labels = num_labels
class CustomEfficientNetModel(PreTrainedModel):
    """EfficientNet image model whose classifier is replaced by a new linear head.

    The wrapped Hugging Face model is stored as ``self.efficientnet``; its
    ``classifier`` is swapped for ``nn.Sequential(nn.Linear(...))``.
    """

    config_class = CustomEfficientNetConfig

    def __init__(self, config, model_name="google/efficientnet-b0",
                 num_classes=2, train_final_layer_only=False):
        """Build the backbone and attach a fresh output layer.

        Args:
            config: Configuration object passed to ``PreTrainedModel``.
            model_name: Hub id of the pre-trained EfficientNet checkpoint to start from.
            num_classes: Number of outputs of the new linear head.
            train_final_layer_only: If true, every parameter whose name does
                not contain ``"classifier"`` is frozen.
        """
        super().__init__(config)
        # Load pre-trained EfficientNet model from Hugging Face
        self.efficientnet = AutoModelForImageClassification.from_pretrained(model_name)
        # Reuse the input width of the existing head for the replacement layer.
        in_features = self.efficientnet.classifier.in_features
        # The Sequential wrapper is kept: it determines the parameter names
        # ("classifier.0.*") that saved state dicts are keyed by.
        self.efficientnet.classifier = nn.Sequential(
            nn.Linear(in_features, num_classes),
        )
        # Freeze previous weights if only training the final layer
        if train_final_layer_only:
            for name, param in self.efficientnet.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False
                else:
                    print(f"Unfrozen layer: {name}")

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its output unchanged."""
        return self.efficientnet(x)

    @classmethod
    def from_pretrained(cls, repo_id, model_name="google/efficientnet-b0", **kwargs):
        """Load model weights and configuration from Hugging Face Hub.

        Args:
            repo_id: Hub repository holding ``model.safetensors`` and ``config.json``.
            model_name: Backbone checkpoint used to construct the architecture
                before the fine-tuned weights are loaded over it.
            **kwargs: ``revision``, ``cache_dir``, ``token``, ``force_download``
                and ``local_files_only`` are forwarded to ``hf_hub_download``;
                any other keyword is ignored.

        Returns:
            The model with the checkpoint weights loaded, in evaluation mode
            (call ``.train()`` before fine-tuning).

        Raises:
            ValueError: If ``model.safetensors`` cannot be downloaded or read.
        """
        # Previously every extra keyword was dropped, so a private repo or a
        # pinned revision could not be requested through this method.
        hub_kwargs = {
            key: kwargs[key]
            for key in ("revision", "cache_dir", "token", "force_download", "local_files_only")
            if key in kwargs
        }
        try:
            model_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors", **hub_kwargs)
            state_dict = load_file(model_path)
        except Exception as e:
            raise ValueError(
                f"Failed to download or load 'model.safetensors' from {repo_id}. Ensure the file exists."
            ) from e
        config_path = hf_hub_download(repo_id=repo_id, filename="config.json", **hub_kwargs)
        config = CustomEfficientNetConfig.from_pretrained(config_path)
        model = cls(config=config, model_name=model_name, num_classes=config.num_labels)
        model.load_state_dict(state_dict)
        # Match the Hugging Face convention: loaded models start in eval mode.
        model.eval()
        return model
class CustomViTConfig(PretrainedConfig):
    """Configuration for ``CustomViTModel``.

    Args:
        num_labels: Number of outputs of the model's final layer.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "custom-vit"

    def __init__(self, num_labels: int = 2, **kwargs) -> None:
        super().__init__(**kwargs)
        # Assigned after the base initializer so the explicit argument is the
        # value that ends up stored on the config.
        self.num_labels = num_labels
class CustomViTModel(PreTrainedModel):
    """ViT image model whose classifier is replaced by a new linear head.

    The wrapped Hugging Face model is stored as ``self.vit``; its
    ``classifier`` is swapped for ``nn.Linear(in_features, num_classes)``.
    """

    config_class = CustomViTConfig

    def __init__(self, config, model_name="google/vit-base-patch16-224",
                 num_classes=2, train_final_layer_only=False):
        """Build the backbone and attach a fresh output layer.

        Args:
            config: Configuration object passed to ``PreTrainedModel``.
            model_name: Hub id of the pre-trained ViT checkpoint to start from.
            num_classes: Number of outputs of the new linear head.
            train_final_layer_only: If true, every parameter whose name does
                not contain ``"classifier"`` is frozen.
        """
        super().__init__(config)
        # Load pre-trained ViT model from Hugging Face
        self.vit = AutoModelForImageClassification.from_pretrained(model_name)
        # Reuse the input width of the existing head for the replacement layer.
        in_features = self.vit.classifier.in_features
        self.vit.classifier = nn.Linear(in_features, num_classes)
        # Freeze previous weights if only training the final layer
        if train_final_layer_only:
            for name, param in self.vit.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False
                else:
                    print(f"Unfrozen layer: {name}")

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its output unchanged."""
        return self.vit(x)

    @classmethod
    def from_pretrained(cls, repo_id, model_name="google/vit-base-patch16-224", **kwargs):
        """Load model weights and configuration from Hugging Face Hub.

        Args:
            repo_id: Hub repository holding ``model.safetensors`` and ``config.json``.
            model_name: Backbone checkpoint used to construct the architecture
                before the fine-tuned weights are loaded over it.
            **kwargs: ``revision``, ``cache_dir``, ``token``, ``force_download``
                and ``local_files_only`` are forwarded to ``hf_hub_download``;
                any other keyword is ignored.

        Returns:
            The model with the checkpoint weights loaded, in evaluation mode
            (call ``.train()`` before fine-tuning).

        Raises:
            ValueError: If ``model.safetensors`` cannot be downloaded or read.
        """
        # Previously every extra keyword was dropped, so a private repo or a
        # pinned revision could not be requested through this method.
        hub_kwargs = {
            key: kwargs[key]
            for key in ("revision", "cache_dir", "token", "force_download", "local_files_only")
            if key in kwargs
        }
        try:
            model_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors", **hub_kwargs)
            state_dict = load_file(model_path)
        except Exception as e:
            raise ValueError(
                f"Failed to download or load 'model.safetensors' from {repo_id}. Ensure the file exists."
            ) from e
        config_path = hf_hub_download(repo_id=repo_id, filename="config.json", **hub_kwargs)
        config = CustomViTConfig.from_pretrained(config_path)
        model = cls(config=config, model_name=model_name, num_classes=config.num_labels)
        model.load_state_dict(state_dict)
        # Match the Hugging Face convention: loaded models start in eval mode.
        model.eval()
        return model
class WeightedEnsembleModel(nn.Module):
    """Combine several models by a weighted sum of their outputs.

    The weights are applied as given and are not normalized, so the result is
    a weighted *average* only when the weights sum to 1.
    """

    def __init__(self, models, weights):
        """Store the member models and their weights.

        Args:
            models: Iterable of ``nn.Module`` members; they are wrapped in an
                ``nn.ModuleList`` so ``.to()`` / ``.eval()`` reach all of them.
            weights: One scalar weight per model, in the same order.

        Raises:
            ValueError: If no model is given or the two sequences differ in length.
        """
        super().__init__()
        models = list(models)
        weights = list(weights)
        if not models:
            raise ValueError("WeightedEnsembleModel needs at least one model.")
        if len(models) != len(weights):
            # zip() would otherwise silently drop the unmatched members.
            raise ValueError(
                f"Got {len(models)} models but {len(weights)} weights; they must match."
            )
        self.models = nn.ModuleList(models)
        self.weights = weights

    def forward(self, images):
        """Return the weighted sum of every member's logits for ``images``.

        Each member is called on the same batch. If its output has a
        ``logits`` attribute that is used, otherwise the output itself is
        treated as the logits. The result takes its shape from the members'
        outputs rather than assuming a fixed number of outputs.
        """
        ensemble_logits = None
        for model, weight in zip(self.models, self.weights):
            outputs = model(images)
            logits = outputs.logits if hasattr(outputs, "logits") else outputs
            weighted = weight * logits
            ensemble_logits = weighted if ensemble_logits is None else ensemble_logits + weighted
        return ensemble_logits
| ``` | |
| Now, load the model weights from Hugging Face. | |
| ```python | |
| from transformers import AutoModelForImageClassification | |
| import torch | |
| from sklearn.metrics import mean_absolute_error, mean_squared_error | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
| ``` | |
| ```python | |
# Restore each ensemble member from its fine-tuned checkpoint on the Hub.
# `model_name` selects the backbone architecture the checkpoint was trained on.

# ResNet-50
resnet = CustomResNetModel.from_pretrained(
    repo_id="final-project-5190/model-resnet-50-base",
    model_name="microsoft/resnet-50",
)

# ConvNeXt-tiny
convnext = CustomConvNeXtModel.from_pretrained(
    repo_id="final-project-5190/model-convnext-tiny-reducePlateau",
    model_name="facebook/convnext-tiny-224",
)

# ViT-base
vit = CustomViTModel.from_pretrained(
    repo_id="final-project-5190/model-ViT-base",
    model_name="google/vit-base-patch16-224",
)

# EfficientNet-b0
efficientnet = CustomEfficientNetModel.from_pretrained(
    repo_id="final-project-5190/model-efficientnet-b0-base",
    model_name="google/efficientnet-b0",
)

# Members and their ensemble weights, index-aligned.
# NOTE(review): the weights sum to 1.01, not 1.0, and the ensemble applies
# them without normalizing.
# NOTE(review): `models` is rebound later in this file by
# `import torchvision.models as models`.
models = [convnext, resnet, vit, efficientnet]
weights = [0.28, 0.26, 0.20, 0.27]
| ``` | |
| #### For data loading | |
| ```python | |
| # Download | |
| from datasets import load_dataset, Image | |
| ``` | |
| ```python | |
| import torch | |
| import torch.nn as nn | |
| import torchvision.models as models | |
| import torchvision.transforms as transforms | |
| from torch.utils.data import DataLoader, Dataset | |
| from transformers import AutoImageProcessor, AutoModelForImageClassification, AutoConfig | |
| from huggingface_hub import PyTorchModelHubMixin, hf_hub_download | |
| from PIL import Image | |
| import os | |
| import numpy as np | |
class GPSImageDataset(Dataset):
    """Dataset yielding ``(image, normalized GPS coordinates)`` pairs.

    Rows are read from ``hf_dataset`` and must provide the keys ``'image'``,
    ``'Latitude'`` and ``'Longitude'``. Coordinates are z-scored with the
    stored means and standard deviations.
    """

    def __init__(self, hf_dataset, transform=None, lat_mean=None, lat_std=None, lon_mean=None, lon_std=None):
        """Store the data source and the normalization statistics.

        Args:
            hf_dataset: Indexable by row number (returns a mapping) and by
                column name (returns the column values).
            transform: Optional callable applied to each image.
            lat_mean, lat_std, lon_mean, lon_std: Normalization statistics.
                Any that is ``None`` is computed from the matching column of
                ``hf_dataset``.

        Raises:
            ValueError: If a standard deviation is zero, negative or NaN,
                which would make the normalized targets undefined.
        """
        self.hf_dataset = hf_dataset
        self.transform = transform
        # Compute mean and std from the dataset columns if not provided
        self.latitude_mean = lat_mean if lat_mean is not None else np.mean(np.array(self.hf_dataset['Latitude']))
        self.latitude_std = lat_std if lat_std is not None else np.std(np.array(self.hf_dataset['Latitude']))
        self.longitude_mean = lon_mean if lon_mean is not None else np.mean(np.array(self.hf_dataset['Longitude']))
        self.longitude_std = lon_std if lon_std is not None else np.std(np.array(self.hf_dataset['Longitude']))
        # Fail here rather than emit inf/NaN targets (or a ZeroDivisionError)
        # from __getitem__. `not std > 0` also rejects NaN.
        for label, std in (("latitude", self.latitude_std), ("longitude", self.longitude_std)):
            if not std > 0:
                raise ValueError(
                    f"{label} standard deviation must be positive, got {std}."
                )

    def __len__(self):
        """Return the number of rows in the underlying dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Return ``(image, gps_coords)`` for row ``idx``.

        ``gps_coords`` is a float32 tensor ``[latitude, longitude]`` after
        normalization; ``image`` is the row's image after ``transform``, if any.
        """
        example = self.hf_dataset[idx]
        image = example['image']
        latitude = example['Latitude']
        longitude = example['Longitude']
        if self.transform:
            image = self.transform(image)
        # Normalize GPS coordinates
        latitude = (latitude - self.latitude_mean) / self.latitude_std
        longitude = (longitude - self.longitude_mean) / self.longitude_std
        gps_coords = torch.tensor([latitude, longitude], dtype=torch.float32)
        return image, gps_coords
| ``` | |
| ```python | |
# Image pre-processing pipelines.
# Both pipelines end with the same per-channel normalization.
_channel_normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training-time pipeline with random augmentations.
# Disabled augmentations, kept for reference:
#   transforms.RandomRotation(degrees=15)
#   transforms.GaussianBlur(kernel_size=(3, 5), sigma=(0.1, 2.0))
#   transforms.RandomPerspective(distortion_scale=0.5, p=0.5)
_augmentation_steps = [
    transforms.RandomResizedCrop(224),  # random crop, resized to 224x224
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    _channel_normalize,
]
transform = transforms.Compose(_augmentation_steps)

# Inference pipeline without augmentations: fixed resize to 224x224 only.
_inference_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    _channel_normalize,
]
inference_transform = transforms.Compose(_inference_steps)
| ``` | |
| Here's an example of us testing the ensemble on the released test set. To evaluate on other data, change the `release_data` loading line below and run the rest of the code to obtain the RMSE. | |
| ```python | |
# Load the evaluation data; point `path` at another dataset to evaluate on it.
release_data = load_dataset(
    path="gydou/released_img",
    split="train",
)
| ``` | |
| ```python | |
# Wrap the evaluation data, normalizing GPS targets with the training mean/std
# so they are on the scale the models predict in.
_gps_stats = {
    "lat_mean": lat_mean,
    "lat_std": lat_std,
    "lon_mean": lon_mean,
    "lon_std": lon_std,
}
rel_dataset = GPSImageDataset(release_data, inference_transform, **_gps_stats)

# Fixed order (no shuffling), batches of 32.
rel_dataloader = DataLoader(dataset=rel_dataset, batch_size=32, shuffle=False)
| ``` | |
| ```python | |
# ensemble
# The member list is built explicitly here: by this point the name `models`
# has been rebound to the torchvision.models module by
# `import torchvision.models as models` in the data-loading cell, so passing
# `models` would hand the ensemble a module instead of the list of networks.
# The order must match `weights`.
ensemble_members = [convnext, resnet, vit, efficientnet]
ensemble_model = WeightedEnsembleModel(models=ensemble_members, weights=weights).to(device)
ensemble_model.eval()

# De-normalization constants, built once. float64 is used because float32
# only resolves a few 1e-6 degrees at these magnitudes (tenths of a metre).
# Reported metrics can therefore differ slightly from a float32 run.
gps_std = torch.tensor([lat_std, lon_std], dtype=torch.float64)
gps_mean = torch.tensor([lat_mean, lon_mean], dtype=torch.float64)

# Validation
all_preds = []
all_actuals = []
with torch.no_grad():
    for images, gps_coords in rel_dataloader:
        # Only the images need to be on the model's device; the targets stay on the CPU.
        ensemble_logits = ensemble_model(images.to(device))
        # Denormalize predictions and actual values back to degrees
        all_preds.append(ensemble_logits.cpu().double() * gps_std + gps_mean)
        all_actuals.append(gps_coords.double() * gps_std + gps_mean)

# Concatenate all batches
all_preds = torch.cat(all_preds).numpy()
all_actuals = torch.cat(all_actuals).numpy()


def _rmse(y_true, y_pred):
    """Per-column RMSE averaged over columns.

    Same quantity as ``mean_squared_error(..., squared=False)``, whose
    ``squared`` argument is deprecated and removed in recent scikit-learn.
    """
    per_column_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    return float(np.mean(np.sqrt(per_column_mse)))


# Compute error metrics (degrees)
mae = mean_absolute_error(all_actuals, all_preds)
rmse = _rmse(all_actuals, all_preds)
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

# Convert predictions and actuals to meters
latitude_mean_radians = np.radians(lat_mean)  # cosine expects radians
meters_per_degree_latitude = 111000  # Constant
meters_per_degree_longitude = 111000 * np.cos(latitude_mean_radians)  # Adjusted for latitude mean
meters_per_degree = np.array([meters_per_degree_latitude, meters_per_degree_longitude])
all_preds_meters = all_preds * meters_per_degree
all_actuals_meters = all_actuals * meters_per_degree

# Compute error metrics in meters
mae_meters = mean_absolute_error(all_actuals_meters, all_preds_meters)
rmse_meters = _rmse(all_actuals_meters, all_preds_meters)
print(f"Mean Absolute Error (meters): {mae_meters:.2f}")
print(f"Root Mean Squared Error (meters): {rmse_meters:.2f}")
| ``` | |
| After running inference on the release test set, our results are the following. | |
| - Release Dataset Mean Absolute Error: 0.0004267849560326909 | |
| - Release Dataset Root Mean Squared Error: 0.0005247778631268114 | |
| - Mean Absolute Error (meters): 41.90 | |
| - Root Mean Squared Error (meters): 51.29 |