Spaces:

FinalProj5190
/

README

Configuration error

App Files Files Community

sqiud committed on Dec 10, 2024

Commit

754c234

verified ·

1 Parent(s): a040736

Update README.md

Browse files

Files changed (1) hide show

README.md +24 -76

README.md CHANGED Viewed

@@ -1,91 +1,16 @@
----
-title: README
-emoji: 🐢
-colorFrom: gray
-colorTo: pink
-sdk: static
-pinned: false
----
 Dataset stats: \
 lat_mean = 39.951564548022596 \
 lat_std = 0.0006361722351128644 \
 lon_mean = -75.19150880602636 \
 lon_std = 0.000611411894337979
-The model implementation is found here:
-```
-import torch
-import torch.nn as nn
-import torchvision.models as models
-import torchvision.transforms as transforms
-from torch.utils.data import DataLoader, Dataset
-from transformers import AutoImageProcessor, AutoModelForImageClassification
-from huggingface_hub import PyTorchModelHubMixin
-from PIL import Image
-import os
-import numpy as np
-from transformers import AutoModel
-class MultiModalModel(nn.Module):
-    def __init__(self, image_model_name="google/vit-base-patch16-224", num_gps_features=2, output_dim=2):
-        super(MultiModalModel, self).__init__()
-        # Load Vision Transformer for feature extraction
-        self.image_model = AutoModel.from_pretrained(image_model_name, output_hidden_states=True)
-        # Reduce image features to match GPS features
-        self.image_fc = nn.Sequential(
-            nn.Linear(self.image_model.config.hidden_size, 256),
-            nn.ReLU(),
-        )
-        # Process GPS features
-        self.gps_fc = nn.Sequential(
-            nn.Linear(num_gps_features, 128),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(128, 256),
-        )
-        # Combine image and GPS features for regression
-        self.regressor = nn.Sequential(
-            nn.Linear(256 + 256, 512),  # 256 from image + 256 from GPS
-            nn.ReLU(),
-            nn.Dropout(0.4),
-            nn.Linear(512, output_dim),
-        )
-    def forward(self, image, gps):
-        # Extract image features from the last hidden state
-        image_outputs = self.image_model(image)
-        image_features = image_outputs.last_hidden_state[:, 0, :]  # CLS token features
-        image_features = self.image_fc(image_features)
-        # Process GPS features
-        gps_features = self.gps_fc(gps)
-        # Concatenate image and GPS features
-        combined_features = torch.cat([image_features, gps_features], dim=1)
-        # Final regression
-        return self.regressor(combined_features)
-    def save_model(self, save_path):
-        """Save model locally using the Hugging Face format."""
-        self.save_pretrained(save_path)
-    def push_model(self, repo_name):
-        """Push the model to the Hugging Face Hub."""
-        self.push_to_hub(repo_name)
-```
 The model can be loaded using:
 ```
 from huggingface_hub import hf_hub_download
 import torch
 # Specify the repository and the filename of the model you want to load
-repo_id = "FinalProj5190/ImageToGPSproject-vit-base"  # Replace with your repo name
 filename = "resnet_gps_regressor_complete.pth"
 model_path = hf_hub_download(repo_id=repo_id, filename=filename)
@@ -94,3 +19,26 @@ model_path = hf_hub_download(repo_id=repo_id, filename=filename)
 model_test = torch.load(model_path)
 model_test.eval()  # Set the model to evaluation mode
 ```

 Dataset stats: \
 lat_mean = 39.951564548022596 \
 lat_std = 0.0006361722351128644 \
 lon_mean = -75.19150880602636 \
 lon_std = 0.000611411894337979
 The model can be loaded using:
 ```
 from huggingface_hub import hf_hub_download
 import torch
 # Specify the repository and the filename of the model you want to load
+repo_id = "FinalProj5190/ImageToGPSproject_new_vit"  # Replace with your repo name
 filename = "resnet_gps_regressor_complete.pth"
 model_path = hf_hub_download(repo_id=repo_id, filename=filename)
 model_test = torch.load(model_path)
 model_test.eval()  # Set the model to evaluation mode
 ```
+The model implementation is here:
+```
+class MultiModalModel(nn.Module):
+    def __init__(self, num_classes=2):
+        super(MultiModalModel, self).__init__()
+        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+        # Regression head on top of the ViT encoder output (instead of a classification head)
+        self.regression_head = nn.Sequential(
+            nn.Linear(self.vit.config.hidden_size, 512),
+            nn.ReLU(),
+            nn.Linear(512, num_classes)
+        )
+    def forward(self, x):
+        outputs = self.vit(pixel_values=x)
+        # Take the CLS token embedding (index 0) from the last hidden state
+        cls_output = outputs.last_hidden_state[:, 0, :]
+        # Pass through the regression head
+        gps_coordinates = self.regression_head(cls_output)
+        return gps_coordinates
+```