ML4RS-Anonymous commited on
Commit
eb1aec4
·
verified ·
1 Parent(s): a582d21

Upload all files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. MajorTOM/MajorTOMDataset.py +64 -0
  3. MajorTOM/__init__.py +5 -0
  4. MajorTOM/embedder/MajorTOM_Embedder.py +191 -0
  5. MajorTOM/embedder/__init__.py +2 -0
  6. MajorTOM/embedder/__pycache__/MajorTOM_Embedder.cpython-311.pyc +0 -0
  7. MajorTOM/embedder/__pycache__/__init__.cpython-311.pyc +0 -0
  8. MajorTOM/embedder/__pycache__/grid_cell_fragment.cpython-311.pyc +0 -0
  9. MajorTOM/embedder/grid_cell_fragment.py +164 -0
  10. MajorTOM/embedder/models/DINOv2_S2RGB.py +91 -0
  11. MajorTOM/embedder/models/SSL4EO_S1RTC.py +125 -0
  12. MajorTOM/embedder/models/SSL4EO_S2L1C.py +97 -0
  13. MajorTOM/embedder/models/SigLIP_S2RGB.py +65 -0
  14. MajorTOM/embedder/models/__init__.py +4 -0
  15. MajorTOM/embedder/models/__pycache__/DINOv2_S2RGB.cpython-311.pyc +0 -0
  16. MajorTOM/embedder/models/__pycache__/SSL4EO_S1RTC.cpython-311.pyc +0 -0
  17. MajorTOM/embedder/models/__pycache__/SSL4EO_S2L1C.cpython-311.pyc +0 -0
  18. MajorTOM/embedder/models/__pycache__/SigLIP_S2RGB.cpython-311.pyc +0 -0
  19. MajorTOM/embedder/models/__pycache__/__init__.cpython-311.pyc +0 -0
  20. MajorTOM/extras/coverage-example.png +3 -0
  21. MajorTOM/extras/coverage_vis.py +149 -0
  22. MajorTOM/extras/extract-sample-from-raw-S2.ipynb +0 -0
  23. MajorTOM/extras/thumbnail_dem.py +77 -0
  24. MajorTOM/extras/thumbnail_s1rtc.py +80 -0
  25. MajorTOM/extras/thumbnail_s2.py +68 -0
  26. MajorTOM/grid.py +284 -0
  27. MajorTOM/metadata_helpers.py +159 -0
  28. MajorTOM/sample_helpers.py +20 -0
  29. app.py +799 -0
  30. compute_embeddings.py +606 -0
  31. configs/huggingface.yaml +15 -0
  32. countries.geo.json +0 -0
  33. data_utils.py +223 -0
  34. examples/example1.png +3 -0
  35. examples/example2.png +3 -0
  36. examples/example3.png +3 -0
  37. logs/compute_embeddings_dinov2.log +170 -0
  38. logs/compute_embeddings_farslip.log +150 -0
  39. logs/compute_embeddings_satclip.log +182 -0
  40. logs/compute_embeddings_siglip.log +200 -0
  41. models/FarSLIP/.gitignore +160 -0
  42. models/FarSLIP/LICENSE +21 -0
  43. models/FarSLIP/README.md +237 -0
  44. models/FarSLIP/__init__.py +1 -0
  45. models/FarSLIP/open_clip/__init__.py +18 -0
  46. models/FarSLIP/open_clip/bpe_simple_vocab_16e6.txt.gz +3 -0
  47. models/FarSLIP/open_clip/coca_model.py +582 -0
  48. models/FarSLIP/open_clip/constants.py +11 -0
  49. models/FarSLIP/open_clip/convert.py +206 -0
  50. models/FarSLIP/open_clip/factory.py +610 -0
.gitattributes CHANGED
@@ -39,3 +39,9 @@ EarthEmbeddingExplorer/examples/example3.png filter=lfs diff=lfs merge=lfs -text
39
  EarthEmbeddingExplorer/MajorTOM/extras/coverage-example.png filter=lfs diff=lfs merge=lfs -text
40
  EarthEmbeddingExplorer/models/SatCLIP/satclip/positional_encoding/__pycache__/spherical_harmonics_ylm.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
41
  EarthEmbeddingExplorer/models/SatCLIP/satclip/positional_encoding/__pycache__/spherical_harmonics_ylm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
39
  EarthEmbeddingExplorer/MajorTOM/extras/coverage-example.png filter=lfs diff=lfs merge=lfs -text
40
  EarthEmbeddingExplorer/models/SatCLIP/satclip/positional_encoding/__pycache__/spherical_harmonics_ylm.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
41
  EarthEmbeddingExplorer/models/SatCLIP/satclip/positional_encoding/__pycache__/spherical_harmonics_ylm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
42
+ examples/example1.png filter=lfs diff=lfs merge=lfs -text
43
+ examples/example2.png filter=lfs diff=lfs merge=lfs -text
44
+ examples/example3.png filter=lfs diff=lfs merge=lfs -text
45
+ MajorTOM/extras/coverage-example.png filter=lfs diff=lfs merge=lfs -text
46
+ models/SatCLIP/satclip/positional_encoding/__pycache__/spherical_harmonics_ylm.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
47
+ models/SatCLIP/satclip/positional_encoding/__pycache__/spherical_harmonics_ylm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
MajorTOM/MajorTOMDataset.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import torch
4
+ from torch.utils.data import Dataset
5
+ from pathlib import Path
6
+ import rasterio as rio
7
+ from PIL import Image
8
+ import torchvision.transforms as transforms
9
+
10
class MajorTOM(Dataset):
    """MajorTOM Dataset (https://huggingface.co/Major-TOM)

    Args:
        df ((geo)pandas.DataFrame): Metadata dataframe
        local_dir (string): Root directory of the local dataset version
        tif_bands (list): A list of tif file names to be read
        png_bands (list): A list of png file names to be read
    """

    def __init__(self,
                 df,
                 local_dir = None,
                 tif_bands=['B04','B03','B02'],
                 png_bands=['thumbnail'],
                 tif_transforms=[transforms.ToTensor()],
                 png_transforms=[transforms.ToTensor()]
                 ):
        super().__init__()
        self.df = df
        # Accept either a plain string or a ready-made Path for the dataset root.
        if isinstance(local_dir, str):
            local_dir = Path(local_dir)
        self.local_dir = local_dir
        # A single band may be passed as a bare string; wrap it into a list.
        self.tif_bands = [tif_bands] if isinstance(tif_bands, str) else tif_bands
        self.png_bands = [png_bands] if isinstance(png_bands, str) else png_bands
        # Compose the per-format transform pipelines once up front.
        self.tif_transforms = None if tif_transforms is None else transforms.Compose(tif_transforms)
        self.png_transforms = None if png_transforms is None else transforms.Compose(png_transforms)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample: every configured tif/png band plus its metadata row."""
        meta = self.df.iloc[idx]

        # Samples live under <root>/<grid row>/<grid cell>/<product id>/
        grid_cell = meta.grid_cell
        row = grid_cell.split('_')[0]
        sample_dir = self.local_dir / row / grid_cell / f"{meta.product_id}"

        out_dict = {'meta': meta}

        for band in self.tif_bands:
            with rio.open(sample_dir / f'{band}.tif') as src:
                data = src.read()
            if self.tif_transforms is not None:
                data = self.tif_transforms(data)
            out_dict[band] = data

        for band in self.png_bands:
            data = Image.open(sample_dir / f'{band}.png')
            if self.png_transforms is not None:
                data = self.png_transforms(data)
            out_dict[band] = data

        return out_dict
MajorTOM/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .sample_helpers import *
2
+ from .metadata_helpers import *
3
+ from .MajorTOMDataset import *
4
+ from .grid import *
5
+ from .embedder import *
MajorTOM/embedder/MajorTOM_Embedder.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import geopandas as gpd
3
+ import hashlib
4
+ from rasterio.io import MemoryFile
5
+
6
+ from .grid_cell_fragment import *
7
+ from .models import *
8
+ import cv2
9
+
10
class MajorTOM_Embedder(torch.nn.Module):
    """
    MajorTOM Embedder class that applies a model to geospatial image fragments,
    computes embeddings, and returns metadata for each fragment.

    The input raster is fragmented into tiles sized to the embedder's expected
    input, each tile is embedded, and one GeoDataFrame row (spatial metadata +
    embedding) is produced per tile.

    Attributes:
        embedder: A model that generates embeddings for image fragments.
        frag_params: Dictionary containing fragmentation parameters such as the
            target overlap and border shift.
        column_types: Dictionary specifying data types for the output GeoDataFrame columns.
    """

    def __init__(self, embedder, target_overlap=0.1, border_shift=True):
        """
        Initializes the MajorTOM Embedder with the given parameters.

        Args:
            embedder (torch.nn.Module): A model that generates embeddings for image fragments.
            target_overlap (float): The target overlap between image fragments. Default is 0.1.
            border_shift (bool): Whether to shift the borders of fragments to avoid edge artifacts. Default is True.
        """
        super().__init__()

        # Model
        self.embedder = embedder

        # Fragmentation settings (fragment size follows the embedder's input size).
        # Fix: dropped the dead `params =` alias from the original assignment.
        self.frag_params = {
            'fragment_size' : self.embedder.size[0],
            'target_overlap' : target_overlap,
            'border_shift' : border_shift
        }

        # Data types for the output dataframe (commented columns need no conversion)
        self.column_types = {
            #'unique_id' :,
            #'embedding' : ,
            #'timestamp' : ,
            #'product_id' : ,
            #'grid_cell' : ,
            'grid_row_u' : 'int16',
            'grid_col_r' : 'int16',
            'centre_lat' : 'float32',
            'centre_lon' : 'float32',
            #'utm_footprint' : ,
            #'utm_crs' : ,
            #'pixel_bbox' : ,
        }

    def bands(self):
        """
        Returns the set of input bands in the correct order.

        Returns:
            list: List of input bands used by the embedder.
        """
        return self.embedder.bands

    def size(self):
        """
        Returns the input image size.

        Returns:
            tuple: Tuple representing the image size (height, width).
        """
        return self.embedder.size

    def calculate_checksum(self, geometry, timestamp, product_id, embedding):
        """
        Calculates a checksum for the given geometry, timestamp, product ID, and embedding.

        Args:
            geometry (shapely.geometry): The geometry object representing the fragment's footprint.
            timestamp (str): Timestamp of the data.
            product_id (str): Product identifier.
            embedding (np.ndarray): The embedding of the image fragment.

        Returns:
            str: A SHA256 checksum of the concatenated input parameters.
        """
        combined = f"{geometry}_{timestamp}_{product_id}_{embedding}"
        checksum = hashlib.sha256(combined.encode()).hexdigest()
        return checksum

    def _read_image(self, row):
        """
        Reads and stacks the image bands for a given row, upsampling any band
        whose resolution does not match the largest one, and returns the image
        data, footprint, and CRS.

        Args:
            row (pandas.Series): The input row containing the image bands.

        Returns:
            torch.Tensor: A channels-last (H, W, C) tensor of the stacked bands.
            shapely.geometry: The footprint of the image.
            rasterio.crs.CRS: The CRS of the image.
        """

        # Read each band from its in-memory GeoTIFF payload
        img = []
        for band in self.embedder.bands:
            with MemoryFile(row[band][0].as_py()) as mem_f:
                with mem_f.open(driver='GTiff') as f:
                    crs = f.crs
                    footprint = box(*f.bounds)
                    img.append(f.read()[0])

        # optional upsampling to the largest band resolution
        shapes = [layer.shape for layer in img]
        if any([el!=shapes[0] for el in shapes]): # if any resolution mismatch
            h, w = max([el[0] for el in shapes]), max([el[1] for el in shapes]) # maximum size
            for layer_idx, layer in enumerate(img):
                if layer.shape != (h,w):
                    # Fix: cv2.resize takes dsize as (width, height); the original
                    # passed (h, w), transposing the target for non-square bands.
                    img[layer_idx] = cv2.resize(layer, (w, h), interpolation=cv2.INTER_NEAREST)
        img = torch.from_numpy(np.stack(img,-1).astype(np.float32))

        return img, footprint, crs


    def forward(self, row, row_meta, device='cuda'):
        """
        Forward pass of the model: Reads the image, fragments it, computes embeddings
        for each fragment, and returns a GeoDataFrame with the spatial metadata and
        embeddings.

        Args:
            row (pandas.Series): The input row containing the image data.
            row_meta (pandas.Series): Metadata associated with the row (e.g., timestamp, product_id).
            device (str): The device to run the model on ('cpu' or 'cuda'). Default is 'cuda'.

        Returns:
            geopandas.GeoDataFrame: A GeoDataFrame containing metadata and embeddings for each fragment.
        """
        # Read file
        img, footprint, crs = self._read_image(row)

        # Fragment the sample
        fragments, xys = fragment_fn(img, **self.frag_params, return_indices=True, verbose=False)

        nrows, ncols, c, h, w = fragments.shape
        # Apply the model
        with torch.no_grad():
            embeddings = self.embedder(fragments.reshape(-1,c,h,w).to(device)).view(nrows, ncols, -1)

        # Row-level metadata is identical for every fragment: extract it once
        # instead of re-reading it inside the nested loop.
        timestamp = row_meta.timestamp.item()
        product_id = row_meta.product_id.item()
        grid_cell = row_meta.grid_cell.item()
        grid_row_u = row_meta.grid_row_u.item()
        grid_col_r = row_meta.grid_col_r.item()

        # The source CRS is fixed for the whole image, so the WGS84 transformer
        # is loop-invariant and can be built once.
        transformer = Transformer.from_crs(crs, CRS.from_epsg(4326), always_xy=True)

        df_rows = []

        # Pack rows for geoparquet
        for r_idx in range(nrows):
            for c_idx in range(ncols):
                embedding = embeddings[r_idx, c_idx].cpu().numpy()
                # spatial features per fragment
                x_offset,y_offset=xys[r_idx,c_idx].int().tolist()
                # NOTE(review): the offsets are paired with (h, w) here; this is
                # only consistent because fragments are square (h == w) — confirm
                # before ever introducing non-square fragments.
                pixel_bbox = [x_offset, y_offset, x_offset + h,y_offset + w] # in pixels
                utm_footprint = crop_footprint(footprint, *img.shape[:2], pixel_bbox)
                # main footprint is in WGS84 (needs to be consistent across parquet)
                geometry = transform(transformer.transform, utm_footprint) # WGS84
                centre_lon, centre_lat = geometry.centroid.coords[0]

                row_dict = {
                    'unique_id' : self.calculate_checksum(geometry, timestamp, product_id, embedding),
                    'embedding' : embedding,
                    'timestamp' : timestamp,
                    'product_id' : product_id,
                    'grid_cell' : grid_cell,
                    'grid_row_u' : grid_row_u,
                    'grid_col_r' : grid_col_r,
                    'geometry' : geometry,
                    'centre_lat' : centre_lat,
                    'centre_lon' : centre_lon,
                    'utm_footprint' : utm_footprint.wkt,
                    'utm_crs' : crs.to_string(),
                    'pixel_bbox' : pixel_bbox,
                }
                df_rows.append(row_dict)

        return gpd.GeoDataFrame(df_rows).astype(self.column_types)
MajorTOM/embedder/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .MajorTOM_Embedder import *
2
+ from .grid_cell_fragment import *
MajorTOM/embedder/__pycache__/MajorTOM_Embedder.cpython-311.pyc ADDED
Binary file (11.1 kB). View file
 
MajorTOM/embedder/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (244 Bytes). View file
 
MajorTOM/embedder/__pycache__/grid_cell_fragment.cpython-311.pyc ADDED
Binary file (8.37 kB). View file
 
MajorTOM/embedder/grid_cell_fragment.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+ import torch
4
+ from shapely.ops import transform
5
+ from pyproj import CRS, Transformer
6
+ import geopandas as gpd
7
+ import pandas as pd
8
+ import numpy as np
9
+ from shapely.geometry import Polygon, box
10
+ from rasterio.transform import from_bounds, xy
11
+ #from rasterio.windows import Window, from_bounds
12
+ import rasterio as rio
13
+
14
def crop_footprint(footprint, height, width, crop_bbox):
    """
    Crops the given footprint to the specified pixel bounding box.

    Args:
        footprint (shapely.geometry.Polygon): The original footprint of the image or area.
        height (int): Height of the image (in pixels).
        width (int): Width of the image (in pixels).
        crop_bbox (list): The bounding box to crop the footprint, as
            [col_start, row_start, col_end, row_end] where (col_start, row_start)
            is the top-left corner and (col_end, row_end) the bottom-right.

    Returns:
        shapely.geometry.Polygon: The cropped bounding box in the same CRS as the footprint.
    """
    # Affine transform mapping pixel (col, row) to spatial coordinates.
    # Named `affine` to avoid shadowing the module-level shapely.ops.transform import.
    affine = from_bounds(*footprint.bounds, width, height)

    # Map both corners of the pixel bbox into the footprint's CRS (e.g. UTM)
    min_x, min_y = affine * (crop_bbox[0], crop_bbox[1])  # (col_start, row_start)
    max_x, max_y = affine * (crop_bbox[2], crop_bbox[3])  # (col_end, row_end)

    # NOTE(review): with row 0 at the top of the raster, `min_y` is actually the
    # larger y value; shapely's box() still produces the intended rectangle.
    return box(min_x, min_y, max_x, max_y)
40
+
41
def fragment_unfold(image, fragment_size, overlap):
    """
    Unfold operation extracting overlapping fragments from an image.

    Args:
        image (torch.Tensor or np.ndarray): Input image. Numpy input is
            expected channels-last (height, width, channels); tensor input
            channels-first.
        fragment_size (int or list): Fragment size — one integer for square
            fragments or a pair for non-square ones.
        overlap (int or list): Overlap between adjacent fragments (int or pair).

    Returns:
        torch.Tensor: Fragments of shape (num_fragments, channels, fh, fw).
    """
    # Numpy input arrives channels-last; convert to a channels-first tensor
    if not torch.is_tensor(image):
        image = torch.from_numpy(image).permute(2, 0, 1)
    # unfold expects a leading batch dimension
    if image.dim() < 4:
        image = image.unsqueeze(0)

    b, c, h, w = image.shape

    # Normalise scalar arguments into (height, width) pairs
    if isinstance(fragment_size, int):
        fragment_size = [fragment_size] * 2
    if isinstance(overlap, int):
        overlap = [overlap] * 2

    # The stride between fragment origins is the fragment size minus the overlap
    stride = [size - ov for size, ov in zip(fragment_size, overlap)]

    unfolded = torch.nn.functional.unfold(
        image, fragment_size, dilation=1, padding=0, stride=stride
    )

    # (b, c*fh*fw, L) -> (b, c, fh, fw, L) -> (L, c, fh, fw) for the first batch item
    return unfolded.view(b, c, *fragment_size, -1).permute(0, 4, 1, 2, 3)[0]
78
+
79
def fragment_fn(img,
                fragment_size,
                target_overlap,
                border_shift=True, # determines whether the outer border is shifted to ensure full coverage
                return_indices=False,
                verbose=False
                ):
    """
    Fragment an image into smaller patches with a specified fragment size and overlap.

    This function handles different scenarios based on image size, fragment size, and overlap,
    and creates fragments from the input image accordingly. It also supports shifting the outer
    border of fragments to ensure full coverage of the image.

    Args:
        img (np.ndarray or torch.Tensor): The input image, channels-last (height, width, channels).
        fragment_size (int or list): The size of the fragments. Can be a single integer (square) or a list of two integers (non-square).
        target_overlap (float): The target overlap between adjacent fragments, as a fraction of the fragment size.
        border_shift (bool): Whether to shift the border of fragments to ensure full coverage of the image. Default is True.
        return_indices (bool): If True, the function will also return the indices (offsets) for each fragment. Default is False.
        verbose (bool): If True, the function will print additional details about the overlap. Default is False.

    Returns:
        torch.Tensor or tuple:
            - If `return_indices` is False, a tensor containing the image fragments,
              shaped (rows, cols, channels, fh, fw).
            - If `return_indices` is True, a tuple of the fragments and their (row, col) offsets.
    """

    h,w,c=img.shape

    assert h==w # SQUARE IMAGES SUPPORT ONLY

    # Fix: work on a tensor throughout — the .permute() calls below are
    # tensor-only, so numpy inputs previously crashed with AttributeError.
    if not torch.is_tensor(img):
        img = torch.from_numpy(img)

    hf, wf = fragment_size, fragment_size
    ho, wo = target_overlap*hf, target_overlap*wf

    assert h >= hf and w >= wf # reject Scenario 1

    # Scenario 2: a single fragment covers the whole image
    if h == hf or w == wf:
        # Fix: convert channels-last to channels-first before shaping the output
        # (the original reinterpreted (H, W, C) memory as (C, H, W) via .view,
        # scrambling channels for C > 1). reshape, not view, since the permuted
        # tensor is non-contiguous.
        full = img.permute(2, 0, 1).reshape(1, 1, c, h, w)
        if not return_indices:
            return full
        # Fix: the original ignored return_indices here, so callers unpacking
        # (fragments, offsets) crashed. The single fragment starts at (0, 0).
        return full, torch.zeros(1, 1, 2)

    # Scenario 3 & 4

    # determine number of segments between the centers of outermost fragments
    h_n = max(1, int(np.round((h-hf)/(hf-ho))))
    w_n = max(1, int(np.round((w-wf)/(wf-wo))))

    # adjust practical overlap (divide the distance between the centers of outermost fragments by the true number of segments)
    aho = int(np.ceil(hf-(h-hf)/(h_n)))
    awo = int(np.ceil(wf-(w-wf)/(w_n)))

    # compute fragments (might not exactly fill the outermost border)
    topleft = fragment_unfold(img.permute(2,0,1),fragment_size=(hf,wf), overlap=(aho,awo)).view(1+h_n, 1+w_n, c, hf, wf)

    full = topleft

    if border_shift:

        if h > hf+h_n*(hf-aho) or w > wf+w_n*(wf-awo):
            # Re-fragment the last row/column flush with the image border.
            # Fix: the slices are channels-last, so they must be permuted to
            # channels-first before fragment_unfold (tensor inputs are taken
            # as channels-first there), matching the top-left call above.
            bottomleft = fragment_unfold(img[-hf:,:,:].permute(2,0,1),fragment_size=(hf,wf), overlap=(aho,awo)).view(1,1+w_n,c,hf,wf)
            topright = fragment_unfold(img[:,-wf:,:].permute(2,0,1),fragment_size=(hf,wf), overlap=(aho,awo)).view(1+h_n,1,c,hf,wf)

            # Shift last row and col to the border of the original
            full[:,-1,None] = topright
            full[-1] = bottomleft

    if verbose:
        print('Target Overlap: {} pixels. Feasible Overlap: {} pixels.'.format(ho,aho))

    if not return_indices:
        return full
    else:
        # Per-fragment (row, col) pixel offsets of the fragment origin
        offset=-1*torch.ones(*full.shape[:2],2)
        for ridx in range(full.shape[0]):
            for cidx in range(full.shape[1]):
                offset[ridx,cidx,1] = cidx * (hf-aho)
                offset[ridx,cidx,0] = ridx * (wf-awo)

                if border_shift:
                    # the outermost row/col sits flush with the image border
                    offset[ridx,-1,1] = h-hf
                    offset[-1,cidx,0] = w-wf

        return full,offset
MajorTOM/embedder/models/DINOv2_S2RGB.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoImageProcessor, AutoModel
3
+
4
class DINOv2_S2RGB_Embedder(torch.nn.Module):
    """
    Embedding wrapper pairing DINOv2 with Sentinel-2 RGB data.

    Raw Sentinel-2 reflectances are scaled to true-colour values in [0, 1]
    (divide by 10,000, boost by 2.5, clip), run through the DINOv2 image
    processor and backbone ('facebook/dinov2-base'), and the resulting token
    sequence is mean-pooled into a single embedding vector per image.

    Attributes:
        processor (AutoImageProcessor): DINOv2 preprocessing pipeline.
        model (AutoModel): Pre-trained DINOv2 backbone used for feature extraction.
        bands (list): Sentinel-2 bands forming the RGB input (B04, B03, B02).
        size (tuple): Expected model input size as (height, width).
    """

    def __init__(self):
        """
        Loads the pre-trained 'facebook/dinov2-base' processor and backbone,
        and records the Sentinel-2 RGB band order and expected input size.
        """
        super().__init__()

        # Hugging Face processor + backbone
        self.processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
        self.model = AutoModel.from_pretrained('facebook/dinov2-base')

        # Sentinel-2 true-colour bands: red, green, blue
        self.bands = ['B04', 'B03', 'B02']

        # Input resolution taken from the processor's centre-crop settings
        crop = self.processor.crop_size
        self.size = crop['height'], crop['width']

    def normalize(self, input):
        """
        Maps raw Sentinel-2 reflectance to true-colour values in [0, 1].

        Reflectances are divided by 10,000 and boosted by a factor of 2.5,
        then clipped to the valid range.

        Args:
            input (torch.Tensor): The raw Sentinel-2 image tensor to be normalized.

        Returns:
            torch.Tensor: The normalized true-color image.
        """
        return (2.5 * (input / 1e4)).clip(0,1)

    def forward(self, input):
        """
        Generates an embedding for the input Sentinel-2 RGB image.

        The image is normalized, converted to pixel values by the DINOv2
        processor, passed through the backbone, and the token sequence is
        averaged into a fixed-size vector.

        Args:
            input (torch.Tensor): The input Sentinel-2 image tensor with shape [C, H, W], where C=3.

        Returns:
            torch.Tensor: The mean-pooled embedding, moved to the CPU.
        """
        pixels = self.processor(self.normalize(input), return_tensors="pt")['pixel_values']
        hidden = self.model(pixels.to(self.model.device)).last_hidden_state
        return hidden.mean(dim=1).cpu()
MajorTOM/embedder/models/SSL4EO_S1RTC.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torchgeo.models import ResNet50_Weights
3
+ import timm
4
+ import numpy as np
5
+
6
class SSL4EO_S1RTC_Embedder(torch.nn.Module):
    """
    SSL4EO Embedder for Sentinel-1 data using a pre-trained model.

    Uses a ResNet50 backbone pre-trained with SSL4EO (Self-Supervised Learning
    for Earth Observation) on Sentinel-1 radar (SAR) data; the classification
    head is replaced with an identity so the network emits embeddings.

    Project Code:
        https://github.com/zhu-xlab/SSL4EO-S12

    Publication:
        https://arxiv.org/abs/2211.07044
    """

    def __init__(self, s1_mean=[-12.54847273, -20.19237134], s1_std=[5.25697717,5.91150917]):
        """
        Initializes the embedder: stores the normalization statistics and loads
        the pre-trained backbone.

        Args:
            s1_mean (list, optional): Per-band (VV, VH) mean of the dB-scaled
                Sentinel-1 data. Default is SSL4EO's values.
            s1_std (list, optional): Per-band (VV, VH) standard deviation of the
                dB-scaled Sentinel-1 data. Default is SSL4EO's values.

        Attributes:
            s1_mean (torch.FloatTensor): Mean values for normalization.
            s1_std (torch.FloatTensor): Standard deviation values for normalization.
            model (torch.nn.Module): The ResNet50 model initialized with pre-trained weights.
            bands (list): Sentinel-1 bands used for input data (VV, VH).
            size (tuple): The input size expected by the model (224x224 pixels).
        """
        super().__init__()

        self.s1_mean = torch.FloatTensor(s1_mean)
        self.s1_std = torch.FloatTensor(s1_std)

        # load model
        self.model = self.init_model()
        self.bands = ['vv','vh']
        self.size = 224,224

    def init_model(self):
        """
        Initializes the ResNet50 model with SSL4EO pre-trained Sentinel-1 weights
        (`ResNet50_Weights.SENTINEL1_ALL_MOCO`) and replaces the fully connected
        head with an identity so the forward pass returns the pooled embedding.

        Returns:
            torch.nn.Module: The initialized ResNet50 model.
        """
        weights = ResNet50_Weights.SENTINEL1_ALL_MOCO
        model = timm.create_model('resnet50', in_chans=weights.meta['in_chans'])
        model.load_state_dict(weights.get_state_dict(progress=True), strict=False)
        model.fc=torch.nn.Identity()

        return model

    def normalize(self, img,scale=1.0):
        """
        Normalizes dB-scaled Sentinel-1 data to the range [0, scale].

        Values are mapped linearly from each band's [mean - 2*std, mean + 2*std]
        interval onto [0, scale] and clipped.

        Args:
            img (torch.Tensor): dB-scaled Sentinel-1 image, channels-first, with
                the channel count matching `s1_mean`/`s1_std`.
            scale (float, optional): Upper bound of the normalized range. Default is 1.0.

        Returns:
            torch.Tensor: The normalized and clipped image (float).
        """
        min_value = (self.s1_mean - 2 * self.s1_std).to(img.device)
        max_value = (self.s1_mean + 2 * self.s1_std).to(img.device)
        # broadcast the per-band statistics over the spatial dimensions
        img = (img - min_value[:,None,None]) / (max_value - min_value)[:,None,None] * scale
        img = img.clip(0,scale).float()

        return img

    def preprocess(self, input):
        """
        Converts linear-scale Sentinel-1 backscatter to decibels and normalizes it.

        Args:
            input (torch.Tensor): The input Sentinel-1 image (e.g., VV or VH polarization)
                in linear scale.

        Returns:
            torch.Tensor: The preprocessed and normalized image in dB scale.
        """
        # Convert the input from linear scale to decibel (dB) scale.
        # Fix: Tensor.log10() takes no arguments, so the original
        # `input.log10(input.clip(...))` raised a TypeError at runtime.
        dB_input = 10 * torch.log10(input.clip(min=1e-10)) # Clip to prevent log(0)

        # Normalize the dB-scaled image
        return self.normalize(dB_input)

    def forward(self, input):
        """
        Forward pass: preprocesses the input and runs it through the backbone.

        Args:
            input (torch.Tensor): Linear-scale Sentinel-1 image (e.g., shape: [C, H, W]).

        Returns:
            torch.Tensor: The output embedding from the model.
        """
        return self.model(self.preprocess(input))
MajorTOM/embedder/models/SSL4EO_S2L1C.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torchgeo.models import ResNet50_Weights
3
+ import timm
4
+
5
class SSL4EO_S2L1C_Embedder(torch.nn.Module):
    """
    SSL4EO embedder for Sentinel-2 L1C data.

    Wraps a ResNet50 backbone pre-trained with the SSL4EO (Self-Supervised
    Learning for Earth Observation) scheme — DINO weights for all Sentinel-2
    bands, distributed via torchgeo — and exposes it as a feature extractor.

    Project Code:
        https://github.com/zhu-xlab/SSL4EO-S12

    Publication:
        https://arxiv.org/abs/2211.07044
    """

    def __init__(self):
        """
        Load the pre-trained backbone and record the expected input layout.

        Attributes:
            model (torch.nn.Module): ResNet50 with SSL4EO Sentinel-2 weights
                (`ResNet50_Weights.SENTINEL2_ALL_DINO` from torchgeo).
            bands (list): The 13 Sentinel-2 L1C bands the model consumes.
            size (tuple): Expected spatial input size, (224, 224) pixels.
        """
        super().__init__()

        # Pre-trained SSL4EO ResNet50 backbone
        self.model = self.init_model()

        # All 13 Sentinel-2 L1C spectral bands, in order
        self.bands = [
            'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07',
            'B08', 'B8A', 'B09', 'B10', 'B11', 'B12'
        ]

        # Spatial input size expected by the backbone
        self.size = 224, 224

    def init_model(self):
        """
        Build a ResNet50 via timm and load the SSL4EO Sentinel-2 DINO weights.

        The classification head (`fc`) is replaced by an identity so the
        network returns the pooled backbone features directly as embeddings.

        Returns:
            torch.nn.Module: Ready-to-use feature extractor.
        """
        pretrained = ResNet50_Weights.SENTINEL2_ALL_DINO
        backbone = timm.create_model('resnet50', in_chans=pretrained.meta['in_chans'])
        backbone.load_state_dict(pretrained.get_state_dict(progress=True), strict=False)

        # Strip the classifier; embeddings come straight from the pooled features
        backbone.fc = torch.nn.Identity()
        return backbone

    def preprocess(self, input):
        """
        Scale Sentinel-2 values into the range the model expects.

        Divides the pixel values by 10,000 (the Sentinel-2 reflectance
        scaling factor).

        Args:
            input (torch.Tensor): Raw Sentinel-2 values (e.g., shape: [C, H, W]).

        Returns:
            torch.Tensor: Input divided by 10,000.
        """
        return input / 1e4

    def forward(self, input):
        """
        Embed a Sentinel-2 L1C image.

        Args:
            input (torch.Tensor): Raw Sentinel-2 image (e.g., shape: [C, H, W]).

        Returns:
            torch.Tensor: The backbone embedding.
        """
        scaled = self.preprocess(input)
        return self.model(scaled)
MajorTOM/embedder/models/SigLIP_S2RGB.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from open_clip import create_model_from_pretrained, get_tokenizer
2
+ import torch
3
+
4
class SigLIP_S2RGB_Embedder(torch.nn.Module):
    """
    Embedding wrapper for SigLIP applied to Sentinel-2 RGB data.

    Sentinel-2 reflectances are first converted to a True-Colour image and
    then passed through the pre-trained SigLIP vision tower
    (ViT-SO400M-14-SigLIP-384, loaded via open_clip), which produces one
    embedding vector per image.

    Preprocessing:
        - Sentinel-2 bands are divided by 10,000 to scale reflectance values.
        - Values are multiplied by 2.5 and clipped to [0, 1] (True-Colour).
        - The checkpoint's final transform (tensor normalization) is applied.

    Model:
        - Takes a 384x384 RGB input and produces an embedding vector.
    """

    def __init__(self):
        super().__init__()

        # Pre-trained SigLIP checkpoint plus its companion transform pipeline
        self.model, self.preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')
        # Sentinel-2 RGB bands (B04 - Red, B03 - Green, B02 - Blue)
        self.bands = ['B04', 'B03', 'B02']
        # Input resolution taken from the checkpoint's first (resize) transform
        self.size = self.preprocess.transforms[0].size

    def normalize(self, input):
        """
        Convert raw Sentinel-2 reflectances into a True-Colour image.

        Divides by 10,000 (reflectance scaling) and multiplies by 2.5
        (brightness boost), clipping the result to [0, 1].

        Args:
            input (torch.Tensor or np.ndarray): Raw Sentinel-2 RGB values.

        Returns:
            torch.Tensor: True-Colour image clipped to [0, 1].
        """
        scaled = input / 1e4
        return (2.5 * scaled).clip(0, 1)

    def forward(self, input):
        """
        Embed a Sentinel-2 RGB image with the SigLIP vision tower.

        Args:
            input (torch.Tensor): A Sentinel-2 image, typically (C, H, W)
                with C=3 (RGB) at the model's native 384x384 resolution.

        Returns:
            torch.Tensor: The image embedding.
        """
        true_colour = self.normalize(input)

        # Apply only the final transform of the pipeline (tensor
        # normalization); resizing is not re-applied here.
        model_input = self.preprocess.transforms[-1](true_colour)

        return self.model.encode_image(model_input)
MajorTOM/embedder/models/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .SigLIP_S2RGB import *
2
+ from .DINOv2_S2RGB import *
3
+ from .SSL4EO_S2L1C import *
4
+ from .SSL4EO_S1RTC import *
MajorTOM/embedder/models/__pycache__/DINOv2_S2RGB.cpython-311.pyc ADDED
Binary file (5.58 kB). View file
 
MajorTOM/embedder/models/__pycache__/SSL4EO_S1RTC.cpython-311.pyc ADDED
Binary file (7.02 kB). View file
 
MajorTOM/embedder/models/__pycache__/SSL4EO_S2L1C.cpython-311.pyc ADDED
Binary file (4.75 kB). View file
 
MajorTOM/embedder/models/__pycache__/SigLIP_S2RGB.cpython-311.pyc ADDED
Binary file (3.72 kB). View file
 
MajorTOM/embedder/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (308 Bytes). View file
 
MajorTOM/extras/coverage-example.png ADDED

Git LFS Details

  • SHA256: a2ed4c9e1b6516b07b803cdced733213d3db3692665c119814fb495089231627
  • Pointer size: 132 Bytes
  • Size of remote file: 2.97 MB
MajorTOM/extras/coverage_vis.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from mpl_toolkits.basemap import Basemap
5
+ import PIL
6
+
7
def get_mask(df):
    """
    Build a binary coverage mask from a Major TOM metadata dataframe.

    Cells present in the dataframe (and whose 'nodata' fraction is not above
    0.5) are set to 255 on a fixed 2004x4008 canvas; everything else stays 0.

    Args:
        df: Major TOM metadata dataframe with 'nodata', 'grid_row_u' and
            'grid_col_r' columns.

    Returns:
        PIL.Image.Image: 8-bit grayscale mask of shape (2004, 4008).
    """
    height, width = 2004, 4008
    mask = np.zeros((height, width), dtype=np.uint8)
    row_offset = -1002
    col_offset = -2004

    # keep only cells that are not mostly empty
    valid = ~(df['nodata'].values > 0.5)

    # grid rows count from the south; image rows count from the top
    rows = height - (np.array(df['grid_row_u']) - row_offset) - 1
    cols = np.array(df['grid_col_r']) - col_offset

    mask[rows[valid], cols[valid]] = 255

    return PIL.Image.fromarray(mask)
27
+
28
def fig2img(fig):
    """Render a Matplotlib figure into an in-memory PIL Image."""
    import io
    # The buffer is intentionally left open: PIL loads lazily from it.
    buffer = io.BytesIO()
    fig.savefig(buffer)
    buffer.seek(0)
    return PIL.Image.open(buffer)
36
+
37
def light_basemap():
    """
    Render the global sinusoidal basemap with bright colours.

    Returns:
        PIL.Image.Image: 48x24-inch, 167-dpi map with green land,
        light-blue water and grey country/coast outlines.
    """
    land, water, outline = "#9eba9b", "#CCDDFF", "#666666"

    with plt.ioff():
        fig, ax = plt.subplots(figsize=(48, 24), dpi=167)

        m = Basemap(projection='sinu', lat_0=0, lon_0=0, resolution='l', ax=ax)
        m.fillcontinents(color=land, lake_color=water)
        m.drawmapboundary(fill_color=water)
        m.drawcountries(color=outline, linewidth=1)
        m.drawcoastlines(color=outline, linewidth=1)

        # strip axes and margins so the map fills the whole canvas
        plt.gca().set_axis_off()
        plt.subplots_adjust(top=1, bottom=0, right=1, left=0,
                            hspace=0, wspace=0)
        plt.margins(0, 0)

    return fig2img(fig)
57
+
58
def dark_basemap():
    """
    Render the global sinusoidal basemap with dark colours.

    Returns:
        PIL.Image.Image: 48x24-inch, 167-dpi map with uniform dark-grey
        land/water fill and black country/coast outlines.
    """
    fill, outline = "#242424", "#000000"

    with plt.ioff():
        fig, ax = plt.subplots(figsize=(48, 24), dpi=167)

        m = Basemap(projection='sinu', lat_0=0, lon_0=0, resolution='l', ax=ax)
        m.fillcontinents(color=fill, lake_color=fill)
        m.drawmapboundary(fill_color=fill)
        m.drawcountries(color=outline, linewidth=1)
        m.drawcoastlines(color=outline, linewidth=1)

        # strip axes and margins so the map fills the whole canvas
        plt.gca().set_axis_off()
        plt.subplots_adjust(top=1, bottom=0, right=1, left=0,
                            hspace=0, wspace=0)
        plt.margins(0, 0)

    return fig2img(fig)
78
+
79
def get_coveragemap(input, input2=None):
    """
    Creates a complete coloured Major TOM coverage figure in the same style as in the official documentation

    Optionally, input2 can be provided and then, the map plots a map with extra colours indicating cells available only in input (green) or only input2 (blue)

    Args:
        input: Major TOM metadata dataframe (or precomputed mask image).
        input2: Optional second dataframe/mask to overlay for comparison.

    Returns:
        PIL.Image.Image: The rendered coverage map.
    """

    if input2 is None:
        return single_coveragemap(input)
    else:
        cmap1 = single_coveragemap(input)
        cmap2 = single_coveragemap(input2)

        # arrays for mixing (drop the alpha channel)
        inp1_arr = np.array(cmap1)[...,:3]
        inp2_arr = np.array(cmap2)[...,:3]

        # NOTE: order matters below — common_arr must be computed from the
        # unmodified arrays before inp1_arr/inp2_arr have channels zeroed.
        # Pixels where both maps agree keep only the red channel.
        common_arr = inp1_arr*(inp1_arr.sum(-1) == inp2_arr.sum(-1))[:,:,None]
        common_arr[:,:,(1,2)] = 0
        inp1_arr[:,:,(0,2)] = 0 # Green - indicates presence of S2 only
        inp2_arr[:,:,(0,1)] = 0 # Blue - indicates presence of DEM only

        # additive mix of the three single-channel layers
        return PIL.Image.fromarray(((common_arr + inp1_arr + inp2_arr)).astype(np.uint8))
102
+
103
+
104
def single_coveragemap(input):
    """
    Creates a complete coloured Major TOM coverage figure in the same style as in the official documentation

    Args:
        input: Major TOM metadata dataframe, or an already-computed PIL mask.

    Returns:
        PIL.Image.Image: Dark basemap with covered cells shown in bright colours.
    """

    # compute mask if df is provided
    if isinstance(input, pd.DataFrame):
        mask = get_mask(input)
    else:
        mask = input

    basemap = light_basemap()
    basemap_d = dark_basemap()

    # pixels whose red channel is 255 lie outside the sinusoidal projection
    outside_earth = np.array(basemap.convert('RGBA'))[:, :, 0] == 255
    outside_earth = PIL.Image.fromarray(outside_earth)

    # nearest-neighbour keeps the mask strictly binary after resizing
    mask = mask.resize(basemap.size, PIL.Image.NEAREST)

    # covered cells become opaque on the light basemap
    basemap.putalpha(mask)

    # Mask outside of earth
    basemap.paste(outside_earth, (0,0), outside_earth)

    # composite the bright (covered) layer onto the dark basemap
    basemap_d.paste(basemap, (0,0), basemap)

    return basemap_d
131
+
132
if __name__ == '__main__':
    # Example: build a coverage figure for the hosted Core-S2L2A metadata.
    DATASET_NAME = 'Major-TOM/Core-S2L2A'
    meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)
    df = pd.read_parquet(meta_path)

    # This is how you make a coverage figure!
    coverage_img = get_coveragemap(df)

    coverage_img.save('coverage-example.png', format='PNG')

    # and this is how you can create an overlap figure for 2 datasets!
    DATASET_NAME = 'Major-TOM/Core-DEM'
    meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)
    dem_df = pd.read_parquet(meta_path)

    coverage_img = get_coveragemap(df,dem_df)

    coverage_img.save('overlap-coverage-example.png', format='PNG')
+ coverage_img.save('overlap-coverage-example.png', format='PNG')
MajorTOM/extras/extract-sample-from-raw-S2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
MajorTOM/extras/thumbnail_dem.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NOTE: Major TOM standard does not require any specific type of thumbnail to be computed.
3
+
4
+ Instead these are shared as optional help since this is how the Core dataset thumbnails have been computed.
5
+ """
6
+
7
+ from rasterio.io import MemoryFile
8
+ from PIL import Image
9
+ import numpy as np
10
+ import os
11
+ from pathlib import Path
12
+ import rasterio as rio
13
+ from matplotlib.colors import LightSource
14
+
15
def get_grayscale(x):
    """
    Normalized grayscale visualisation.

    Linearly rescales `x` to the full 0-255 uint8 range. A constant input
    (zero dynamic range, e.g. a perfectly flat DEM tile) is returned as an
    all-zero image instead of dividing by zero.

    Args:
        x (np.ndarray): Input array (e.g. a DEM tile).

    Returns:
        np.ndarray: uint8 array with the same shape as `x`.
    """
    # shift to zero minimum
    x_n = x - x.min()

    # guard against division by zero for flat (constant) inputs,
    # which previously produced NaNs cast to uint8
    peak = x_n.max()
    if peak == 0:
        return np.zeros(x.shape, dtype=np.uint8)

    return np.uint8(x_n / peak * 255)
25
+
26
def get_hillshade(x, azdeg=315, altdeg=45, ve=1):
    """
    Hillshade visualisation for a DEM array.

    Illuminates the surface from the given azimuth/altitude using
    matplotlib's LightSource and returns the result as a uint8 image.

    Args:
        x (np.ndarray): Elevation array.
        azdeg (float): Azimuth of the light source in degrees.
        altdeg (float): Altitude of the light source in degrees.
        ve (float): Vertical exaggeration applied before shading.

    Returns:
        np.ndarray: uint8 hillshade image.
    """
    light = LightSource(azdeg=azdeg, altdeg=altdeg)
    shaded = light.hillshade(x, vert_exag=ve)
    return np.uint8(255 * shaded)
33
+
34
def dem_thumbnail(dem, dem_NODATA = -32768.0, hillshade=True):
    """
    Create a thumbnail visualisation for a DEM array.

    Renders either a hillshade (default) or a normalized grayscale image.

    Args:
        dem (np.ndarray): Elevation array.
        dem_NODATA (float): NODATA value of the DEM (default is -32768.0).
            NOTE(review): currently accepted but not used by either
            rendering path — confirm whether nodata masking was intended.
        hillshade (bool): If True render a hillshade, otherwise grayscale.

    Returns:
        np.ndarray: uint8 thumbnail array.
    """
    render = get_hillshade if hillshade else get_grayscale
    return render(dem)
44
+
45
+
46
def dem_thumbnail_from_datarow(datarow):
    """
    Create a thumbnail PIL Image from a raw parquet datarow.

    Decodes the GeoTIFF bytes stored in the 'DEM' column (and its nodata
    value) and renders a single-channel ('L' mode) thumbnail.

    Args:
        datarow: A row read from one of the Major TOM data parquet files.

    Returns:
        PIL.Image.Image: Grayscale thumbnail.
    """
    with MemoryFile(datarow['DEM'][0].as_py()) as mem_f, mem_f.open(driver='GTiff') as f:
        dem = f.read().squeeze()
        dem_NODATA = f.nodata

    img = dem_thumbnail(dem, dem_NODATA)

    return Image.fromarray(img, 'L')
61
+
62
if __name__ == '__main__':
    from fsspec.parquet import open_parquet_file
    import pyarrow.parquet as pq

    # Example: stream one row group from the hosted Core-DEM dataset
    # and render its thumbnail.
    print('[example run] reading file from HuggingFace...')
    url = "https://huggingface.co/datasets/Major-TOM/Core-DEM/resolve/main/images/part_01001.parquet"
    with open_parquet_file(url) as f:
        with pq.ParquetFile(f) as pf:
            first_row_group = pf.read_row_group(1)

    print('[example run] computing the thumbnail...')
    thumbnail = dem_thumbnail_from_datarow(first_row_group)

    thumbnail_fname = 'example_thumbnail.png'
    thumbnail.save(thumbnail_fname, format = 'PNG')
    print('[example run] saved as "{}"'.format(thumbnail_fname))
MajorTOM/extras/thumbnail_s1rtc.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NOTE: Major TOM standard does not require any specific type of thumbnail to be computed.
3
+
4
+ Instead these are shared as optional help since this is how the Core dataset thumbnails have been computed.
5
+ """
6
+
7
+ from rasterio.io import MemoryFile
8
+ from PIL import Image
9
+ import numpy as np
10
+
11
def s1rtc_thumbnail(vv, vh, vv_NODATA = -32768.0, vh_NODATA = -32768.0):
    """
    Build a false-colour RGB thumbnail from Sentinel-1 RTC vv/vh arrays.

    Each polarization is converted to dB, rescaled to 0-255 using only its
    valid (non-NODATA) pixels, and composed as R=vv, G=normalized vv+vh,
    B=vh. NODATA pixels are rendered as 0.

    Args:
        vv (np.ndarray): VV backscatter array (linear scale).
        vh (np.ndarray): VH backscatter array (linear scale).
        vv_NODATA (float): NODATA value of vv (default is -32768.0).
        vh_NODATA (float): NODATA value of vh (default is -32768.0).

    Returns:
        np.ndarray: HxWx3 uint8 thumbnail.
    """
    # BUGFIX: work on copies — the original implementation overwrote
    # negative/NODATA pixels in the caller's arrays in place.
    vv = np.asarray(vv).copy()
    vh = np.asarray(vh).copy()

    # valid data masks
    vv_mask = vv != vv_NODATA
    vh_mask = vh != vh_NODATA

    # remove invalid (negative) values before the log op
    vv[vv<0] = vv[vv>=0].min()
    vh[vh<0] = vh[vh>=0].min()

    # linear backscatter -> decibels
    vv_dB = 10*np.log10(vv)
    vh_dB = 10*np.log10(vh)

    # scale to 0-255 using min/max of valid pixels only
    vv_dB = (vv_dB - vv_dB[vv_mask].min()) / (vv_dB[vv_mask].max() - vv_dB[vv_mask].min()) * 255
    vh_dB = (vh_dB - vh_dB[vh_mask].min()) / (vh_dB[vh_mask].max() - vh_dB[vh_mask].min()) * 255

    # represent nodata as 0
    vv_dB[vv_mask==0] = 0
    vh_dB[vh_mask==0] = 0

    # false colour composite
    return np.stack([vv_dB,
                     255*(vv_dB+vh_dB)/np.max(vv_dB+vh_dB),
                     vh_dB
                    ],-1).astype(np.uint8)
43
+
44
def s1rtc_thumbnail_from_datarow(datarow):
    """
    Create a false-colour thumbnail PIL Image from a raw parquet datarow.

    Decodes the GeoTIFF bytes in the 'vv' and 'vh' columns (together with
    their nodata values) and renders them with `s1rtc_thumbnail`.

    Args:
        datarow: A row read from one of the Major TOM data parquet files.

    Returns:
        PIL.Image.Image: RGB thumbnail.
    """
    def _read_band(column):
        # decode one GeoTIFF-encoded band and its nodata value
        with MemoryFile(datarow[column][0].as_py()) as mem_f:
            with mem_f.open(driver='GTiff') as f:
                return f.read().squeeze(), f.nodata

    vv, vv_NODATA = _read_band('vv')
    vh, vh_NODATA = _read_band('vh')

    img = s1rtc_thumbnail(vv, vh, vv_NODATA=vv_NODATA, vh_NODATA=vh_NODATA)

    return Image.fromarray(img)
64
+
65
if __name__ == '__main__':
    from fsspec.parquet import open_parquet_file
    import pyarrow.parquet as pq

    # Example: stream one row group from the hosted Core-S1RTC dataset
    # and render its false-colour thumbnail.
    print('[example run] reading file from HuggingFace...')
    url = "https://huggingface.co/datasets/Major-TOM/Core-S1RTC/resolve/main/images/part_00001.parquet"
    with open_parquet_file(url) as f:
        with pq.ParquetFile(f) as pf:
            first_row_group = pf.read_row_group(1)

    print('[example run] computing the thumbnail...')
    thumbnail = s1rtc_thumbnail_from_datarow(first_row_group)

    thumbnail_fname = 'example_thumbnail.png'
    thumbnail.save(thumbnail_fname, format = 'PNG')
    print('[example run] saved as "{}"'.format(thumbnail_fname))
+ print('[example run] saved as "{}"'.format(thumbnail_fname))
MajorTOM/extras/thumbnail_s2.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NOTE: Major TOM standard does not require any specific type of thumbnail to be computed.
3
+
4
+ Instead these are shared as optional help since this is how the Core dataset thumbnails have been computed.
5
+ """
6
+
7
+ from rasterio.io import MemoryFile
8
+ from PIL import Image
9
+ import numpy as np
10
+
11
def s2l2a_thumbnail(B04, B03, B02, gain=1.3, gamma=0.6):
    """
    Create a true-colour thumbnail from Sentinel-2 L2A band arrays.

    Reflectances are scaled by 1/10,000, gamma-corrected, gain-boosted,
    clipped to [0, 1] and quantized to uint8.

    Args:
        B04 (np.ndarray): Red band array.
        B03 (np.ndarray): Green band array.
        B02 (np.ndarray): Blue band array.
        gain (float): Brightness multiplier.
        gamma (float): Gamma exponent applied to scaled reflectance.

    Returns:
        np.ndarray: HxWx3 uint8 thumbnail.
    """
    # stack bands into an RGB cube
    rgb = np.stack([B04, B03, B02], -1)

    # apply gain & gamma
    rgb = gain * ((rgb / 10_000) ** gamma)

    return (rgb.clip(0, 1) * 255).astype(np.uint8)
25
+
26
def s2l2a_thumbnail_from_datarow(datarow):
    """
    Create a true-colour thumbnail PIL Image from a raw parquet datarow.

    Decodes the GeoTIFF bytes stored in the 'B04' (red), 'B03' (green) and
    'B02' (blue) columns and renders them with `s2l2a_thumbnail`.

    Args:
        datarow: A row read from one of the Major TOM data parquet files.

    Returns:
        PIL.Image.Image: RGB thumbnail.
    """
    def _read_band(column):
        # decode one GeoTIFF-encoded band
        with MemoryFile(datarow[column][0].as_py()) as mem_f:
            with mem_f.open(driver='GTiff') as f:
                return f.read().squeeze()

    B04 = _read_band('B04')  # red
    B03 = _read_band('B03')  # green
    B02 = _read_band('B02')  # blue

    return Image.fromarray(s2l2a_thumbnail(B04, B03, B02))
54
+
55
if __name__ == '__main__':
    from fsspec.parquet import open_parquet_file
    import pyarrow.parquet as pq

    # Example: stream only the RGB band columns of one row group from the
    # hosted Core-S2L2A dataset and render its thumbnail.
    print('[example run] reading file from HuggingFace...')
    url = "https://huggingface.co/datasets/Major-TOM/Core-S2L2A/resolve/main/images/part_01000.parquet"
    with open_parquet_file(url, columns = ["B04", "B03", "B02"]) as f:
        with pq.ParquetFile(f) as pf:
            first_row_group = pf.read_row_group(1, columns = ["B04", "B03", "B02"])

    print('[example run] computing the thumbnail...')
    thumbnail = s2l2a_thumbnail_from_datarow(first_row_group)

    thumbnail.save('example_thumbnail.png', format = 'PNG')
MajorTOM/grid.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import math
3
+ import pandas as pd
4
+ import geopandas as gpd
5
+ from shapely.geometry import LineString, Polygon
6
+ from tqdm import tqdm
7
+ import re
8
+
9
+
10
+
11
class Grid():
    """
    Major TOM global sampling grid.

    Latitude rows are spaced ~`dist` km apart from pole to pole; each row is
    subdivided along its circle of latitude into points ~`dist` km apart.
    Rows are labelled '0U','1U',... going north from the equator and
    '1D','2D',... going south; columns '0R','1R',... going east from the
    Greenwich meridian and '1L','2L',... going west. A grid point marks the
    bottom-left corner of its cell.
    """

    # Earth's equatorial radius in km
    RADIUS_EQUATOR = 6378.137 # km

    def __init__(self,dist,latitude_range=(-85,85),longitude_range=(-180,180),utm_definition='bottomleft'):
        """
        Args:
            dist (float): Approximate grid spacing in km.
            latitude_range (tuple): Inclusive latitude bounds of the grid.
            longitude_range (tuple): Inclusive longitude bounds of the grid.
            utm_definition (str): Where a cell's UTM zone is evaluated:
                'bottomleft' (the grid point itself) or 'center'.
        """
        self.dist = dist
        self.latitude_range = latitude_range
        self.longitude_range = longitude_range
        self.utm_definition = utm_definition
        self.rows,self.lats = self.get_rows()
        self.points, self.points_by_row = self.get_points()

    def get_rows(self):
        """
        Compute the row labels and latitudes of the grid.

        Returns:
            tuple: (rows, latitudes) as numpy arrays, sorted south to north
            and bounded to `latitude_range`.
        """

        # Define set of latitudes to use, based on the grid distance
        arc_pole_to_pole = math.pi * self.RADIUS_EQUATOR
        num_divisions_in_hemisphere = math.ceil(arc_pole_to_pole / self.dist)

        latitudes = np.linspace(-90, 90, num_divisions_in_hemisphere+1)[:-1]
        latitudes = np.mod(latitudes, 180) - 90

        # order should be from south to north
        latitudes = np.sort(latitudes)

        # index of the first non-negative latitude (the equator row)
        zeroth_row = np.searchsorted(latitudes,0)

        # From 0U-NU and 1D-ND
        rows = [None] * len(latitudes)
        rows[zeroth_row:] = [f'{i}U' for i in range(len(latitudes)-zeroth_row)]
        rows[:zeroth_row] = [f'{abs(i-zeroth_row)}D' for i in range(zeroth_row)]

        # bound to range
        idxs = (latitudes>=self.latitude_range[0]) * (latitudes<=self.latitude_range[1])
        rows,latitudes = np.array(rows), np.array(latitudes)
        rows,latitudes = rows[idxs],latitudes[idxs]

        return rows,latitudes

    def get_circumference_at_latitude(self,lat):
        """
        Circumference (km) of the Earth's cross-section at latitude `lat`.
        """

        # Circumference of the cross-section of a sphere at a given latitude

        radius_at_lat = self.RADIUS_EQUATOR * math.cos(lat * math.pi / 180)
        circumference = 2 * math.pi * radius_at_lat

        return circumference

    def subdivide_circumference(self,lat,return_cols=False):
        """
        Longitudes subdividing the circle of latitude `lat` into ~`dist`-km
        arcs; optionally also returns the corresponding column labels.
        """
        # Provide a list of longitudes that subdivide the circumference of the earth at a given latitude
        # into equal parts as close as possible to dist

        circumference = self.get_circumference_at_latitude(lat)
        num_divisions = math.ceil(circumference / self.dist)
        longitudes = np.linspace(-180,180, num_divisions+1)[:-1]
        longitudes = np.mod(longitudes, 360) - 180
        longitudes = np.sort(longitudes)


        if return_cols:
            # label columns 0R..NR east of longitude 0 and 1L..NL west of it
            cols = [None] * len(longitudes)
            zeroth_idx = np.where(longitudes==0)[0][0]
            cols[zeroth_idx:] = [f'{i}R' for i in range(len(longitudes)-zeroth_idx)]
            cols[:zeroth_idx] = [f'{abs(i-zeroth_idx)}L' for i in range(zeroth_idx)]
            return np.array(cols),np.array(longitudes)

        return np.array(longitudes)

    def get_points(self):
        """
        Materialize every grid point as a GeoDataFrame row.

        Returns:
            tuple: (points, points_by_row) where `points` is one combined
            GeoDataFrame and `points_by_row` is a list with one
            GeoDataFrame per latitude row.
        """

        r_idx = 0
        points_by_row = [None]*len(self.rows)
        for r,lat in zip(self.rows,self.lats):
            point_names,grid_row_names,grid_col_names,grid_row_idx,grid_col_idx,grid_lats,grid_lons,utm_zones,epsgs = [],[],[],[],[],[],[],[],[]
            cols,lons = self.subdivide_circumference(lat,return_cols=True)

            cols,lons = self.filter_longitude(cols,lons)
            c_idx = 0
            for c,lon in zip(cols,lons):
                point_names.append(f'{r}_{c}')
                grid_row_names.append(r)
                grid_col_names.append(c)
                grid_row_idx.append(r_idx)
                grid_col_idx.append(c_idx)
                grid_lats.append(lat)
                grid_lons.append(lon)
                if self.utm_definition == 'bottomleft':
                    utm_zones.append(get_utm_zone_from_latlng([lat,lon]))
                elif self.utm_definition == 'center':
                    # shift by half a cell (dist is km; ~111,120 m per degree)
                    center_lat = lat + (1000*self.dist/2)/111_120
                    center_lon = lon + (1000*self.dist/2)/(111_120*math.cos(center_lat*math.pi/180))
                    utm_zones.append(get_utm_zone_from_latlng([center_lat,center_lon]))
                else:
                    raise ValueError(f'Invalid utm_definition {self.utm_definition}')
                epsgs.append(f'EPSG:{utm_zones[-1]}')

                c_idx += 1
            points_by_row[r_idx] = gpd.GeoDataFrame({
                'name':point_names,
                'row':grid_row_names,
                'col':grid_col_names,
                'row_idx':grid_row_idx,
                'col_idx':grid_col_idx,
                'utm_zone':utm_zones,
                'epsg':epsgs
            },geometry=gpd.points_from_xy(grid_lons,grid_lats))
            r_idx += 1
        points = gpd.GeoDataFrame(pd.concat(points_by_row))
        # points.reset_index(inplace=True,drop=True)
        return points, points_by_row

    def group_points_by_row(self):
        """
        Split `self.points` into one GeoDataFrame per grid row.
        """
        # Make list of different gdfs for each row
        points_by_row = [None]*len(self.rows)
        for i,row in enumerate(self.rows):
            points_by_row[i] = self.points[self.points.row==row]
        return points_by_row

    def filter_longitude(self,cols,lons):
        """
        Keep only columns/longitudes inside `self.longitude_range`.
        """
        idxs = (lons>=self.longitude_range[0]) * (lons<=self.longitude_range[1])
        cols,lons = cols[idxs],lons[idxs]
        return cols,lons

    def latlon2rowcol(self,lats,lons,return_idx=False,integer=False):
        """
        Convert latitude and longitude to row and column number from the grid

        Args:
            lats, lons: Sequences of coordinates to look up.
            return_idx (bool): Also return the `self.points` table index
                for each (row, col) pair.
            integer (bool): Return signed integer row/col numbers
                (U/R positive, D/L negative) instead of string labels.

        Returns:
            list: [rows, cols] plus optionally the index list.
        """
        # Always take bottom left corner of grid cell
        rows = np.searchsorted(self.lats,lats)-1

        # Get the possible points of the grid cells at the given latitude
        possible_points = [self.points_by_row[row] for row in rows]

        # For each point, find the rightmost point that is still to the left of the given longitude
        cols = [poss_points.iloc[np.searchsorted(poss_points.geometry.x,lon)-1].col for poss_points,lon in zip(possible_points,lons)]
        rows = self.rows[rows].tolist()

        outputs = [rows, cols]
        if return_idx:
            # Get the table index for self.points with each row,col pair in rows, cols
            idx = [self.points[(self.points.row==row) & (self.points.col==col)].index.values[0] for row,col in zip(rows,cols)]
            outputs.append(idx)

        # return raw numbers
        if integer:
            outputs[0] = [int(el[:-1]) if el[-1] == 'U' else -int(el[:-1]) for el in outputs[0]]
            outputs[1] = [int(el[:-1]) if el[-1] == 'R' else -int(el[:-1]) for el in outputs[1]]

        return outputs

    def rowcol2latlon(self,rows,cols):
        """
        Convert row/col labels back to the (lat, lon) of each grid point.
        """
        point_geoms = [self.points.loc[(self.points.row==row) & (self.points.col==col),'geometry'].values[0] for row,col in zip(rows,cols)]
        lats = [point.y for point in point_geoms]
        lons = [point.x for point in point_geoms]
        return lats,lons

    def get_bounded_footprint(self,point,buffer_ratio=0):
        """
        Polygon footprint of the grid cell for `point`, bounded by the
        neighbouring grid points and optionally buffered.

        Args:
            point: A row of `self.points` (grid point = bottom-left corner).
            buffer_ratio (float): Fraction of the cell's width/height to
                expand the bounding box by on each side.

        Returns:
            shapely.geometry.Polygon: The (possibly buffered) cell footprint.
        """
        # Gets the polygon footprint of the grid cell for a given point, bounded by the other grid points' cells.
        # Grid point defined as bottom-left corner of polygon. Buffer ratio is the ratio of the grid cell's width/height to buffer by.

        bottom,left = point.geometry.y,point.geometry.x
        row_idx = point.row_idx
        col_idx = point.col_idx
        next_row_idx = row_idx+1
        next_col_idx = col_idx+1

        if next_row_idx >= len(self.lats): # If at top row, use difference between top and second-to-top row for height
            height = (self.lats[row_idx] - self.lats[row_idx-1])
            top = self.lats[row_idx] + height
        else:
            top = self.lats[next_row_idx]

        max_col = len(self.points_by_row[row_idx].col_idx)-1
        if next_col_idx > max_col: # If at rightmost column, use difference between rightmost and second-to-rightmost column for width
            width = (self.points_by_row[row_idx].iloc[col_idx].geometry.x - self.points_by_row[row_idx].iloc[col_idx-1].geometry.x)
            right = self.points_by_row[row_idx].iloc[col_idx].geometry.x + width
        else:
            right = self.points_by_row[row_idx].iloc[next_col_idx].geometry.x

        # Buffer the polygon by the ratio of the grid cell's width/height
        width = right - left
        height = top - bottom

        buffer_horizontal = width * buffer_ratio
        buffer_vertical = height * buffer_ratio

        new_left = left - buffer_horizontal
        new_right = right + buffer_horizontal

        new_bottom = bottom - buffer_vertical
        new_top = top + buffer_vertical

        bbox = Polygon([(new_left,new_bottom),(new_left,new_top),(new_right,new_top),(new_right,new_bottom)])

        return bbox
+
206
def get_utm_zone_from_latlng(latlng):
    """
    Get the UTM zone from a latlng list and return the corresponding EPSG code.

    Parameters
    ----------
    latlng : List[Union[int, float]]
        The latlng list to get the UTM zone from, as [latitude, longitude].

    Returns
    -------
    str
        The EPSG code for the UTM zone ("326xx" north, "327xx" south).
    """
    assert isinstance(latlng, (list, tuple)), "latlng must be in the form of a list or tuple."

    latitude = latlng[0]
    longitude = latlng[1]

    # standard 6-degree UTM band, wrapping at the antimeridian
    zone_number = (math.floor((longitude + 180) / 6)) % 60 + 1

    # Special zones for Svalbard and Norway
    if 56.0 <= latitude < 64.0 and 3.0 <= longitude < 12.0:
        zone_number = 32
    elif 72.0 <= latitude < 84.0:
        if 0.0 <= longitude < 9.0:
            zone_number = 31
        elif 9.0 <= longitude < 21.0:
            zone_number = 33
        elif 21.0 <= longitude < 33.0:
            zone_number = 35
        elif 33.0 <= longitude < 42.0:
            zone_number = 37

    # Determine the hemisphere and construct the EPSG code
    prefix = "327" if latitude < 0 else "326"
    epsg_code = f"{prefix}{zone_number:02d}"

    # sanity-check the resulting code is a valid UTM EPSG (zones 01-60)
    if not re.match(r"32[6-7](0[1-9]|[1-5][0-9]|60)",epsg_code):
        print(f"latlng: {latlng}, epsg_code: {epsg_code}")
        raise ValueError(f"out of bound latlng resulted in incorrect EPSG code for the point")

    return epsg_code
250
+
251
+
252
if __name__ == '__main__':

    # sanity checks for the UTM zone helper (incl. Norway/Svalbard cases)
    assert get_utm_zone_from_latlng([-1,-174.34]) == "32701"
    assert get_utm_zone_from_latlng([48,-4]) == "32630"
    assert get_utm_zone_from_latlng([78,13]) == "32633"
    assert get_utm_zone_from_latlng([-34,19.7]) == "32734"
    assert get_utm_zone_from_latlng([-36,175.7]) == "32760"


    # build a 100 km grid and round-trip random points through it
    dist = 100
    grid = Grid(dist)

    np.random.seed(0)
    test_lons = np.random.uniform(-20,20,size=(1000)) % 180 # Checks edge-case of crossing 180th meridian
    test_lats = np.random.uniform(-20,68,size=(1000))

    test_rows,test_cols = grid.latlon2rowcol(test_lats,test_lons)
    test_lats2,test_lons2 = grid.rowcol2latlon(test_rows,test_cols)

    print(test_lons[:10])
    print(test_lats[:10])
    print(test_rows[:10])
    print(test_cols[:10])

    # Make line segments from the points to their corresponding grid points
    lines = []
    for i in range(len(test_lats)):
        lines.append([(test_lons[i],test_lats[i]),(test_lons2[i],test_lats2[i])])

    lines = gpd.GeoDataFrame(geometry=gpd.GeoSeries([LineString(line) for line in lines]))

    # export the sample->gridpoint segments and the grid for inspection
    lines.to_file(f'testlines_{dist}km.geojson',driver='GeoJSON')
    grid.points.to_file(f'testgrid_{dist}km.geojson',driver='GeoJSON')
+ grid.points.to_file(f'testgrid_{dist}km.geojson',driver='GeoJSON')
MajorTOM/metadata_helpers.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pyarrow.parquet as pq
2
+ import pandas as pd
3
+ import geopandas as gpd
4
+ from pathlib import Path
5
+ import urllib.request
6
+ import fsspec
7
+ from fsspec.parquet import open_parquet_file
8
+ from io import BytesIO
9
+ from PIL import Image
10
+ from rasterio.io import MemoryFile
11
+ from tqdm.notebook import tqdm
12
+ import os
13
+
14
+ from .sample_helpers import *
15
+
16
def metadata_from_url(access_url, local_url):
    """Download a Major-TOM metadata parquet and load it as a GeoDataFrame.

    Args:
        access_url (str): remote URL of the metadata parquet file
        local_url (str or Path): local path the file is downloaded to

    Returns:
        geopandas.GeoDataFrame: metadata with a point geometry at each sample centre
    """
    local_url, _ = urllib.request.urlretrieve(access_url, local_url)
    frame = pq.read_table(local_url).to_pandas()
    frame['timestamp'] = pd.to_datetime(frame.timestamp)
    centres = gpd.points_from_xy(frame.centre_lon, frame.centre_lat)
    return gpd.GeoDataFrame(frame, geometry=centres, crs=frame.crs.iloc[0])
24
+
25
def filter_metadata(df,
                    region=None,
                    daterange=None,
                    cloud_cover=(0,100),
                    nodata=(0, 1.0)
                    ):
    """Filters the Major-TOM dataframe based on several parameters

    Args:
        df (geopandas dataframe): Parent dataframe
        region (shapely geometry object) : Region of interest
        daterange (tuple) : Inclusive range of dates (example format: '2020-01-01')
        cloud_cover (tuple) : Inclusive percentage range (0-100) of cloud cover
        nodata (tuple) : Inclusive fraction (0.0-1.0) of no data allowed in a sample

    Returns:
        df: a filtered dataframe
    """
    # temporal filtering
    if daterange is not None:
        assert isinstance(daterange, (list, tuple)) and len(daterange) == 2
        df = df[df.timestamp >= daterange[0]]
        df = df[df.timestamp <= daterange[1]]

    # spatial filtering (bounding-box query via the spatial index)
    if region is not None:
        idxs = df.sindex.query(region)
        df = df.take(idxs)

    # cloud filtering
    if cloud_cover is not None:
        df = df[df.cloud_cover >= cloud_cover[0]]
        df = df[df.cloud_cover <= cloud_cover[1]]

    # nodata filtering (comment previously mislabelled this as "spatial filtering")
    if nodata is not None:
        df = df[df.nodata >= nodata[0]]
        df = df[df.nodata <= nodata[1]]

    return df
64
+
65
def read_row(row, columns=["thumbnail"]):
    """Reads a row from a Major-TOM dataframe

    Args:
        row (row from geopandas dataframe): The row of metadata
        columns (list): columns to be read from the file

    Returns:
        data (dict): dictionary with returned data from requested columns
        (a PIL.Image is returned directly when only the thumbnail is requested)
    """
    # Fetch only the relevant row group over HTTP rather than the whole parquet.
    with open_parquet_file(row.parquet_url, columns=columns, footer_sample_size=2000000) as f:
        with pq.ParquetFile(f) as pf:
            row_group = pf.read_row_group(row.parquet_row, columns=columns)

    if columns == ["thumbnail"]:
        stream = BytesIO(row_group['thumbnail'][0].as_py())
        return Image.open(stream)
    else:
        row_output = {}
        for col in columns:
            # renamed from 'bytes', which shadowed the builtin type
            payload = row_group[col][0].as_py()

            if col != 'thumbnail':
                # band columns are GeoTIFF-encoded arrays
                row_output[col] = read_tif_bytes(payload)
            else:
                stream = BytesIO(payload)
                row_output[col] = Image.open(stream)

        return row_output
94
+
95
def filter_download(df, local_dir, source_name, by_row = False, verbose = False, tif_columns=None):
    """Downloads and unpacks the data of Major-TOM based on a metadata dataframe

    Args:
        df (geopandas dataframe): Metadata dataframe
        local_dir (str or Path) : Path to the where the data is to be stored locally
        source_name (str) : Name alias of the resulting dataset
        by_row (bool): If True, it will access individual rows of parquet via http - otherwise entire parquets are downloaded temporarily
        verbose (bool) : option for potential internal state printing
        tif_columns (list of str) : Optionally specified columns to be downloaded as .tifs, e.g. ['B04', 'B03', 'B02']

    Returns:
        None
    """
    if isinstance(local_dir, str):
        local_dir = Path(local_dir)

    temp_file = local_dir / 'temp.parquet'

    # identify all parquets that need to be downloaded (group them)
    urls = df.parquet_url.unique()
    if verbose:
        print('Starting download of {} parquet files.'.format(len(urls)))

    for url in tqdm(urls, desc='Downloading and unpacking...', disable=not verbose):
        # identify all relevant rows
        rows = df[df.parquet_url == url].parquet_row.unique()

        remote_file = None
        if not by_row: # (downloads entire parquet)
            # download a temporary file
            temp_path, http_resp = urllib.request.urlretrieve(url, temp_file)
        else:
            # stream the parquet over HTTP without a full download
            remote_file = fsspec.open(url)
            temp_path = remote_file.open()

        # populate the bands
        with pq.ParquetFile(temp_path) as pf:
            for row_idx in rows:
                table = pf.read_row_group(row_idx)

                product_id = table['product_id'][0].as_py()
                grid_cell = table['grid_cell'][0].as_py()
                row = grid_cell.split('_')[0]

                dest = local_dir / Path("{}/{}/{}/{}".format(source_name, row, grid_cell, product_id))
                dest.mkdir(exist_ok=True, parents=True)

                # default: every band column (names starting with 'B') plus the cloud mask
                columns = [col for col in table.column_names if col[0] == 'B'] + ['cloud_mask'] if tif_columns is None else tif_columns
                # tifs
                for col in columns:
                    with open(dest / "{}.tif".format(col), "wb") as out_f:
                        # Write bytes to file
                        out_f.write(table[col][0].as_py())

                # thumbnail (png)
                col = 'thumbnail'
                with open(dest / "{}.png".format(col), "wb") as out_f:
                    # Write bytes to file
                    out_f.write(table[col][0].as_py())
        if not by_row:
            # remove downloaded file
            os.remove(temp_path)
        else:
            # BUGFIX: the original rebound 'f' inside the write loops above, so
            # this close() targeted an already-closed local file and leaked the
            # fsspec handle; distinct variable names avoid the shadowing.
            remote_file.close()
MajorTOM/sample_helpers.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rasterio.io import MemoryFile
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ from PIL import Image
5
+ from io import BytesIO
6
+
7
def plot(sample, bands = ['B04', 'B03', 'B02'], scaling=2e3):
    """Display an RGB composite of the given bands from a Major-TOM sample.

    Args:
        sample (dict): mapping of band name -> GeoTIFF bytes
        bands (list of str): band names stacked as image channels (default RGB order)
        scaling (float): divisor used to normalise the pixel values for display

    Returns:
        None (draws onto the current matplotlib axes)
    """
    img = []
    for b in bands:
        img.append(read_tif_bytes(sample[b]))
    # BUGFIX: the divisor was hard-coded to 2e3, silently ignoring 'scaling'
    plt.imshow(np.stack(img, -1)/scaling)
12
+
13
def read_tif_bytes(tif_bytes):
    """Decode in-memory GeoTIFF bytes into a numpy array (singleton dims squeezed)."""
    with MemoryFile(tif_bytes) as mem_f:
        with mem_f.open(driver='GTiff') as f:
            return f.read().squeeze()
17
+
18
+ def read_png_bytes(png_bytes):
19
+ stream = BytesIO(png_bytes)
20
+ return Image.open(stream)
app.py ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import time
4
+ import os
5
+ import tempfile
6
+ import zipfile
7
+ import numpy as np
8
+ import pandas as pd
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+
11
+ # Import custom modules
12
+ from models.siglip_model import SigLIPModel
13
+ from models.satclip_model import SatCLIPModel
14
+ from models.farslip_model import FarSLIPModel
15
+ from models.dinov2_model import DINOv2Model
16
+ from models.load_config import load_and_process_config
17
+ from visualize import format_results_for_gallery, plot_top5_overview, plot_location_distribution, plot_global_map_static, plot_geographic_distribution
18
+ from data_utils import download_and_process_image, get_esri_satellite_image, get_placeholder_image
19
+ from PIL import Image as PILImage
20
+ from PIL import ImageDraw, ImageFont
21
+
22
+ # Configuration
23
+ device = "cuda" if torch.cuda.is_available() else "cpu"
24
+ print(f"Running on device: {device}")
25
+
26
+ # Load and process configuration
27
+ config = load_and_process_config()
28
+ print(config)
29
+
30
+ # Initialize Models
31
+ print("Initializing models...")
32
+ models = {}
33
+
34
+ # DINOv2
35
+ try:
36
+ if config and 'dinov2' in config:
37
+ models['DINOv2'] = DINOv2Model(
38
+ ckpt_path=config['dinov2'].get('ckpt_path'),
39
+ embedding_path=config['dinov2'].get('embedding_path'),
40
+ device=device
41
+ )
42
+ else:
43
+ models['DINOv2'] = DINOv2Model(device=device)
44
+ except Exception as e:
45
+ print(f"Failed to load DINOv2: {e}")
46
+
47
+ # SigLIP
48
+ try:
49
+ if config and 'siglip' in config:
50
+ models['SigLIP'] = SigLIPModel(
51
+ ckpt_path=config['siglip'].get('ckpt_path'),
52
+ tokenizer_path=config['siglip'].get('tokenizer_path'),
53
+ embedding_path=config['siglip'].get('embedding_path'),
54
+ device=device
55
+ )
56
+ else:
57
+ models['SigLIP'] = SigLIPModel(device=device)
58
+ except Exception as e:
59
+ print(f"Failed to load SigLIP: {e}")
60
+
61
+ # SatCLIP
62
+ try:
63
+ if config and 'satclip' in config:
64
+ models['SatCLIP'] = SatCLIPModel(
65
+ ckpt_path=config['satclip'].get('ckpt_path'),
66
+ embedding_path=config['satclip'].get('embedding_path'),
67
+ device=device
68
+ )
69
+ else:
70
+ models['SatCLIP'] = SatCLIPModel(device=device)
71
+ except Exception as e:
72
+ print(f"Failed to load SatCLIP: {e}")
73
+
74
+ # FarSLIP
75
+ try:
76
+ if config and 'farslip' in config:
77
+ models['FarSLIP'] = FarSLIPModel(
78
+ ckpt_path=config['farslip'].get('ckpt_path'),
79
+ model_name=config['farslip'].get('model_name'),
80
+ embedding_path=config['farslip'].get('embedding_path'),
81
+ device=device
82
+ )
83
+ else:
84
+ models['FarSLIP'] = FarSLIPModel(device=device)
85
+ except Exception as e:
86
+ print(f"Failed to load FarSLIP: {e}")
87
+
88
def get_active_model(model_name):
    """Look up a loaded model by name; returns (model, None) or (None, error message)."""
    try:
        return models[model_name], None
    except KeyError:
        return None, f"Model {model_name} not loaded."
92
+
93
def combine_images(img1, img2):
    """Stack two PIL images vertically, widening the narrower one to match.

    Either argument may be None, in which case the other is returned unchanged.
    """
    if img1 is None:
        return img2
    if img2 is None:
        return img1

    width = max(img1.size[0], img2.size[0])
    # scale each image's height so both share the common width
    top_h = int(img1.size[1] * width / img1.size[0])
    bottom_h = int(img2.size[1] * width / img2.size[0])
    top = img1.resize((width, top_h))
    bottom = img2.resize((width, bottom_h))

    canvas = PILImage.new('RGB', (width, top_h + bottom_h), (255, 255, 255))
    canvas.paste(top, (0, 0))
    canvas.paste(bottom, (0, top_h))
    return canvas
112
+
113
def create_text_image(text, size=(384, 384)):
    """Render a comma-separated query string onto a light-grey placeholder image.

    Args:
        text (str): query text; each comma-separated part is drawn on its own line
        size (tuple): output image size in pixels

    Returns:
        PIL.Image: the rendered placeholder
    """
    img = PILImage.new('RGB', size, color=(240, 240, 240))
    d = ImageDraw.Draw(img)

    # Try to load a font, fallback to default
    try:
        # Try to find a font that supports larger size
        font = ImageFont.truetype("DejaVuSans.ttf", 40)
    except OSError:
        # BUGFIX: was a bare 'except:', which also swallowed KeyboardInterrupt
        # and SystemExit; a missing/unreadable font file raises OSError.
        font = ImageFont.load_default()

    # Wrap text simply: one line per comma-separated fragment
    margin = 20
    offset = 100
    for line in text.split(','):
        d.text((margin, offset), line.strip(), font=font, fill=(0, 0, 0))
        offset += 50

    d.text((margin, offset + 50), "Text Query", font=font, fill=(0, 0, 255))
    return img
133
+
134
def fetch_top_k_images(top_indices, probs, df_embed, query_text=None):
    """
    Fetches top-k images using actual dataset download (ModelScope) via download_and_process_image.

    Args:
        top_indices (sequence of int): positional indices into df_embed, best first
        probs (array-like): similarity score per dataset row
        df_embed (DataFrame): embedding metadata (product_id, centre_lat, centre_lon)
        query_text (str, optional): query label passed to the Esri fallback renderer

    Returns:
        list of dict: one entry per successfully fetched image with keys
        image_384, image_full, score, lat, lon, id — sorted by score descending.
    """
    results = []

    # We can run this in parallel
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_idx = {}
        for i, idx in enumerate(top_indices):
            row = df_embed.iloc[idx]
            pid = row['product_id']

            # Use download_and_process_image to get real data
            future = executor.submit(download_and_process_image, pid, df_source=df_embed, verbose=False)
            future_to_idx[future] = idx

        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                img_384, img_full = future.result()

                if img_384 is None:
                    # Fallback to Esri if download fails
                    print(f"Download failed for idx {idx}, falling back to Esri...")
                    row = df_embed.iloc[idx]
                    img_384 = get_esri_satellite_image(row['centre_lat'], row['centre_lon'], score=probs[idx], rank=0, query=query_text)
                    img_full = img_384

                row = df_embed.iloc[idx]
                results.append({
                    'image_384': img_384,
                    'image_full': img_full,
                    'score': probs[idx],
                    'lat': row['centre_lat'],
                    'lon': row['centre_lon'],
                    'id': row['product_id']
                })
            except Exception as e:
                # a failed future only drops that single image from the results
                print(f"Error fetching image for idx {idx}: {e}")

    # Sort results by score descending (since futures complete in random order)
    results.sort(key=lambda x: x['score'], reverse=True)
    return results
178
+
179
def get_all_results_metadata(model, filtered_indices, probs):
    """Return metadata dicts (id, lat, lon, score) for every filtered hit,
    sorted by similarity score descending."""
    if len(filtered_indices) == 0:
        return []

    # Rank the filtered positions by their score, best first
    order = np.argsort(probs[filtered_indices])[::-1]
    ranked = filtered_indices[order]

    records = model.df_embed.iloc[ranked]
    out = []
    for pos, (_, row) in zip(ranked, records.iterrows()):
        out.append({
            'id': row['product_id'],
            'lat': row['centre_lat'],
            'lon': row['centre_lon'],
            'score': probs[pos],
        })
    return out
197
+
198
def search_text(query, threshold, model_name):
    """Generator backing the text-search tab.

    Yields progressive UI updates as a 7-tuple of
    (map_visibility, gallery, status_text, results_figure, download_payload,
    filtered_df, geo_distribution_map); intermediate yields report progress
    and the final yield carries the full results.

    Args:
        query (str): free-text search query
        threshold (float): top fraction to keep, expressed in per-mille
            (divided by 1000 before being passed to model.search)
        model_name (str): key into the global `models` dict
    """
    model, error = get_active_model(model_name)
    if error:
        yield None, None, error, None, None, None, None
        return

    if not query:
        yield None, None, "Please enter a query.", None, None, None, None
        return

    try:
        timings = {}

        # 1. Encode Text
        yield None, None, "Encoding text...", None, None, None, None
        t0 = time.time()
        text_features = model.encode_text(query)
        timings['Encoding'] = time.time() - t0

        if text_features is None:
            yield None, None, "Model does not support text encoding or is not initialized.", None, None, None, None
            return

        # 2. Search (threshold is per-mille, hence /1000 to get a fraction)
        yield None, None, "Encoding text... ✓\nRetrieving similar images...", None, None, None, None
        t0 = time.time()
        probs, filtered_indices, top_indices = model.search(text_features, top_percent=threshold/1000.0)
        timings['Retrieval'] = time.time() - t0

        if probs is None:
            yield None, None, "Search failed (embeddings missing?).", None, None, None, None
            return

        # Show geographic distribution (not timed)
        df_embed = model.df_embed
        geo_dist_map, df_filtered = plot_geographic_distribution(df_embed, probs, threshold/1000.0, title=f'Similarity to "{query}" ({model_name})')

        # 3. Download Images (only the top 10 hits are fetched)
        yield gr.update(visible=False), None, "Encoding text... ✓\nRetrieving similar images... ✓\nDownloading images...", None, None, df_filtered, gr.update(value=geo_dist_map, visible=True)
        t0 = time.time()
        top_indices = top_indices[:10]
        results = fetch_top_k_images(top_indices, probs, df_embed, query_text=query)
        timings['Download'] = time.time() - t0

        # 4. Visualize - keep geo_dist_map visible
        yield gr.update(visible=False), None, "Encoding text... ✓\nRetrieving similar images... ✓\nDownloading images... ✓\nGenerating visualizations...", None, None, df_filtered, gr.update(value=geo_dist_map, visible=True)
        t0 = time.time()
        fig_results = plot_top5_overview(None, results, query_info=query)
        gallery_items = format_results_for_gallery(results)
        timings['Visualization'] = time.time() - t0

        # 5. Generate Final Status
        # (threshold/100 combined with the *100 inside generate_status_msg
        # displays the raw threshold value in per-mille, matching the /1000 above)
        timing_str = f"Encoding {timings['Encoding']:.1f}s, Retrieval {timings['Retrieval']:.1f}s, Download {timings['Download']:.1f}s, Visualization {timings['Visualization']:.1f}s\n\n"
        status_msg = timing_str + generate_status_msg(len(filtered_indices), threshold/100.0, results)

        all_results = get_all_results_metadata(model, filtered_indices, probs)
        results_txt = format_results_to_text(all_results)

        yield gr.update(visible=False), gallery_items, status_msg, fig_results, [geo_dist_map, fig_results, results_txt], df_filtered, gr.update(value=geo_dist_map, visible=True)

    except Exception as e:
        import traceback
        traceback.print_exc()
        yield None, None, f"Error: {str(e)}", None, None, None, None
262
+
263
def search_image(image_input, threshold, model_name):
    """Generator backing the image-search tab.

    Mirrors search_text but encodes an uploaded image instead of text and
    fetches only the top 6 hits; yields the same 7-tuple of progressive UI
    updates.

    Args:
        image_input (PIL.Image): the query image
        threshold (float): top fraction to keep, expressed in per-mille
            (divided by 1000 before being passed to model.search)
        model_name (str): key into the global `models` dict
    """
    model, error = get_active_model(model_name)
    if error:
        yield None, None, error, None, None, None, None
        return

    if image_input is None:
        yield None, None, "Please upload an image.", None, None, None, None
        return

    try:
        timings = {}

        # 1. Encode Image
        yield None, None, "Encoding image...", None, None, None, None
        t0 = time.time()
        image_features = model.encode_image(image_input)
        timings['Encoding'] = time.time() - t0

        if image_features is None:
            yield None, None, "Model does not support image encoding.", None, None, None, None
            return

        # 2. Search (threshold is per-mille, hence /1000)
        yield None, None, "Encoding image... ✓\nRetrieving similar images...", None, None, None, None
        t0 = time.time()
        probs, filtered_indices, top_indices = model.search(image_features, top_percent=threshold/1000.0)
        timings['Retrieval'] = time.time() - t0

        # Show geographic distribution (not timed)
        df_embed = model.df_embed
        geo_dist_map, df_filtered = plot_geographic_distribution(df_embed, probs, threshold/1000.0, title=f'Similarity to Input Image ({model_name})')

        # 3. Download Images (top 6 hits only)
        yield gr.update(visible=False), None, "Encoding image... ✓\nRetrieving similar images... ✓\nDownloading images...", None, None, df_filtered, gr.update(value=geo_dist_map, visible=True)
        t0 = time.time()
        top_indices = top_indices[:6]
        results = fetch_top_k_images(top_indices, probs, df_embed, query_text="Image Query")
        timings['Download'] = time.time() - t0

        # 4. Visualize - keep geo_dist_map visible
        yield gr.update(visible=False), None, "Encoding image... ✓\nRetrieving similar images... ✓\nDownloading images... ✓\nGenerating visualizations...", None, None, df_filtered, gr.update(value=geo_dist_map, visible=True)
        t0 = time.time()
        fig_results = plot_top5_overview(image_input, results, query_info="Image Query")
        gallery_items = format_results_for_gallery(results)
        timings['Visualization'] = time.time() - t0

        # 5. Generate Final Status
        timing_str = f"Encoding {timings['Encoding']:.1f}s, Retrieval {timings['Retrieval']:.1f}s, Download {timings['Download']:.1f}s, Visualization {timings['Visualization']:.1f}s\n\n"
        status_msg = timing_str + generate_status_msg(len(filtered_indices), threshold/100.0, results)

        all_results = get_all_results_metadata(model, filtered_indices, probs)
        # only the top 50 entries go into the downloadable text report here
        results_txt = format_results_to_text(all_results[:50])

        yield gr.update(visible=False), gallery_items, status_msg, fig_results, [geo_dist_map, fig_results, results_txt], df_filtered, gr.update(value=geo_dist_map, visible=True)

    except Exception as e:
        import traceback
        traceback.print_exc()
        yield None, None, f"Error: {str(e)}", None, None, None, None
323
+
324
def search_location(lat, lon, threshold):
    """Generator backing the location-search tab (always uses SatCLIP).

    Encodes (lat, lon) with SatCLIP, retrieves similar tiles, and also
    fetches the Major-TOM tile nearest the query point for display.
    Yields the same 7-tuple of progressive UI updates as the other searches.

    NOTE(review): model.search here uses top_percent=threshold/100.0 while
    plot_geographic_distribution below uses threshold/1000.0; the text and
    image searches use /1000 for both — confirm which scale is intended.
    """
    model_name = "SatCLIP"
    model, error = get_active_model(model_name)
    if error:
        yield None, None, error, None, None, None, None
        return

    try:
        timings = {}

        # 1. Encode Location
        yield None, None, "Encoding location...", None, None, None, None
        t0 = time.time()
        loc_features = model.encode_location(float(lat), float(lon))
        timings['Encoding'] = time.time() - t0

        if loc_features is None:
            yield None, None, "Location encoding failed.", None, None, None, None
            return

        # 2. Search
        yield None, None, "Encoding location... ✓\nRetrieving similar images...", None, None, None, None
        t0 = time.time()
        probs, filtered_indices, top_indices = model.search(loc_features, top_percent=threshold/100.0)
        timings['Retrieval'] = time.time() - t0

        # 3. Generate Distribution Map (not timed for location distribution)
        yield None, None, "Encoding location... ✓\nRetrieving similar images... ✓\nGenerating distribution map...", None, None, None, None
        df_embed = model.df_embed
        top_10_indices = top_indices[:10]
        top_10_results = []
        for idx in top_10_indices:
            row = df_embed.iloc[idx]
            top_10_results.append({'lat': row['centre_lat'], 'lon': row['centre_lon']})

        # Show geographic distribution (not timed)
        geo_dist_map, df_filtered = plot_geographic_distribution(df_embed, probs, threshold/1000.0, title=f'Similarity to Location ({lat}, {lon})')

        # 4. Download Images (top 6 hits)
        yield gr.update(visible=False), None, "Encoding location... ✓\nRetrieving similar images... ✓\nGenerating distribution map... ✓\nDownloading images...", None, None, df_filtered, gr.update(value=geo_dist_map, visible=True)
        t0 = time.time()
        top_6_indices = top_indices[:6]
        results = fetch_top_k_images(top_6_indices, probs, df_embed, query_text=f"Loc: {lat},{lon}")

        # Get query tile: the Major-TOM tile nearest the requested coordinates
        query_tile = None
        try:
            lats = pd.to_numeric(df_embed['centre_lat'], errors='coerce')
            lons = pd.to_numeric(df_embed['centre_lon'], errors='coerce')
            # squared-degree distance is sufficient for nearest-neighbour here
            dists = (lats - float(lat))**2 + (lons - float(lon))**2
            nearest_idx = dists.idxmin()
            pid = df_embed.loc[nearest_idx, 'product_id']
            query_tile, _ = download_and_process_image(pid, df_source=df_embed, verbose=False)
        except Exception as e:
            print(f"Error fetching nearest MajorTOM image: {e}")
        if query_tile is None:
            query_tile = get_placeholder_image(f"Query Location\n({lat}, {lon})")
        timings['Download'] = time.time() - t0

        # 5. Visualize - keep geo_dist_map visible
        yield gr.update(visible=False), None, "Encoding location... ✓\nRetrieving similar images... ✓\nGenerating distribution map... ✓\nDownloading images... ✓\nGenerating visualizations...", None, None, df_filtered, gr.update(value=geo_dist_map, visible=True)
        t0 = time.time()
        fig_results = plot_top5_overview(query_tile, results, query_info=f"Loc: {lat},{lon}")
        gallery_items = format_results_for_gallery(results)
        timings['Visualization'] = time.time() - t0

        # 6. Generate Final Status
        timing_str = f"Encoding {timings['Encoding']:.1f}s, Retrieval {timings['Retrieval']:.1f}s, Download {timings['Download']:.1f}s, Visualization {timings['Visualization']:.1f}s\n\n"
        status_msg = timing_str + generate_status_msg(len(filtered_indices), threshold/100.0, results)

        all_results = get_all_results_metadata(model, filtered_indices, probs)
        results_txt = format_results_to_text(all_results)

        yield gr.update(visible=False), gallery_items, status_msg, fig_results, [geo_dist_map, fig_results, results_txt], df_filtered, gr.update(value=geo_dist_map, visible=True)

    except Exception as e:
        import traceback
        traceback.print_exc()
        yield None, None, f"Error: {str(e)}", None, None, None, None
403
+
404
def generate_status_msg(count, threshold, results):
    """Summarise the match count and the top few hits as a status string."""
    header = f"Found {count} matches in top {threshold*100:.0f}‰.\n\nTop {len(results)} similar images:\n"
    lines = [
        f"{rank}. Product ID: {entry['id']}, Location: ({entry['lat']:.4f}, {entry['lon']:.4f}), Score: {entry['score']:.4f}\n"
        for rank, entry in enumerate(results[:5], start=1)
    ]
    return header + "".join(lines)
409
+
410
def get_initial_plot():
    """Build the initial global-distribution map shown on app load.

    Prefers DINOv2's embedding metadata and falls back to SigLIP's.
    NOTE(review): the fallback assumes 'SigLIP' loaded successfully — if it
    also failed this raises KeyError; confirm whether that is acceptable.
    """
    df_vis = None
    img = None
    if 'DINOv2' in models and models['DINOv2'].df_embed is not None:
        img, df_vis = plot_global_map_static(models['DINOv2'].df_embed)
        # fig = plot_global_map(models['FarSLIP'].df_embed)
    else:
        img, df_vis = plot_global_map_static(models['SigLIP'].df_embed)
    return gr.update(value=img, visible=True), [img], df_vis, gr.update(visible=False)
420
+
421
def handle_map_click(evt: gr.SelectData, df_vis):
    """Convert a pixel click on the static world-map image to lat/lon.

    Inverts the fixed matplotlib layout of the 3000x1500 map image (margins
    scaled down from a 4000x2000 original), then snaps to the nearest dataset
    point in df_vis when one is close enough.

    Returns:
        (lat, lon, product_id, status_message); Nones with a message on failure.
    """
    if evt is None:
        return None, None, None, "No point selected."

    try:
        x, y = evt.index[0], evt.index[1]

        # Image dimensions (New)
        img_width = 3000
        img_height = 1500

        # Scaled Margins (Proportional to 4000x2000)
        left_margin = 110 * 0.75
        right_margin = 110 * 0.75
        top_margin = 100 * 0.75
        bottom_margin = 67 * 0.75

        plot_width = img_width - left_margin - right_margin
        plot_height = img_height - top_margin - bottom_margin

        # Adjust for aspect ratio preservation
        map_aspect = 360.0 / 180.0 # 2.0
        plot_aspect = plot_width / plot_height

        # the map is letterboxed inside the plot area; compute its drawn
        # extent and the offset of the unused border
        if plot_aspect > map_aspect:
            actual_map_width = plot_height * map_aspect
            actual_map_height = plot_height
            h_offset = (plot_width - actual_map_width) / 2
            v_offset = 0
        else:
            actual_map_width = plot_width
            actual_map_height = plot_width / map_aspect
            h_offset = 0
            v_offset = (plot_height - actual_map_height) / 2

        # Calculate relative position within the plot area
        x_in_plot = x - left_margin
        y_in_plot = y - top_margin

        # Check if click is within the actual map bounds
        if (x_in_plot < h_offset or x_in_plot > h_offset + actual_map_width or
            y_in_plot < v_offset or y_in_plot > v_offset + actual_map_height):
            return None, None, None, "Click outside map area. Please click on the map."

        # Calculate relative position within the map (0 to 1)
        x_rel = (x_in_plot - h_offset) / actual_map_width
        y_rel = (y_in_plot - v_offset) / actual_map_height

        # Clamp to [0, 1]
        x_rel = max(0, min(1, x_rel))
        y_rel = max(0, min(1, y_rel))

        # Convert to geographic coordinates (full-globe equirectangular)
        lon = x_rel * 360 - 180
        lat = 90 - y_rel * 180

        # Find nearest point in df_vis if available
        pid = ""
        if df_vis is not None:
            dists = (df_vis['centre_lat'] - lat)**2 + (df_vis['centre_lon'] - lon)**2
            min_idx = dists.idxmin()
            nearest_row = df_vis.loc[min_idx]

            # snap only when within 5 degrees (squared distance < 25)
            if dists[min_idx] < 25:
                lat = nearest_row['centre_lat']
                lon = nearest_row['centre_lon']
                pid = nearest_row['product_id']

    except Exception as e:
        print(f"Error handling click: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None, f"Error: {e}"

    return lat, lon, pid, f"Selected Point: ({lat:.4f}, {lon:.4f})"
496
+
497
def download_image_by_location(lat, lon, pid, model_name):
    """Download and return the image at the specified location.

    Args:
        lat, lon: target coordinates (anything convertible to float)
        pid: optional product_id; when falsy, the nearest dataset point is used
        model_name (str): key into the global `models` dict (supplies df_embed)

    Returns:
        (image or None, status message)
    """
    if lat is None or lon is None:
        return None, "Please specify coordinates first."

    model, error = get_active_model(model_name)
    if error:
        return None, error

    try:
        # Convert to float to ensure proper formatting
        lat = float(lat)
        lon = float(lon)

        # Find Product ID if not provided:
        # nearest-neighbour in squared-degree space over the embedding table
        if not pid:
            df = model.df_embed
            lats = pd.to_numeric(df['centre_lat'], errors='coerce')
            lons = pd.to_numeric(df['centre_lon'], errors='coerce')
            dists = (lats - lat)**2 + (lons - lon)**2
            nearest_idx = dists.idxmin()
            pid = df.loc[nearest_idx, 'product_id']

        # Download image
        img_384, _ = download_and_process_image(pid, df_source=model.df_embed, verbose=True)

        if img_384 is None:
            return None, f"Failed to download image for location ({lat:.4f}, {lon:.4f})"

        return img_384, f"Downloaded image at ({lat:.4f}, {lon:.4f})"

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error: {str(e)}"
532
+
533
def reset_to_global_map():
    """Reset the map to the initial global distribution view.

    Uses the same model preference as get_initial_plot: DINOv2 first,
    otherwise SigLIP (assumed loaded).
    """
    img = None
    df_vis = None
    if 'DINOv2' in models and models['DINOv2'].df_embed is not None:
        img, df_vis = plot_global_map_static(models['DINOv2'].df_embed)
    else:
        img, df_vis = plot_global_map_static(models['SigLIP'].df_embed)

    return gr.update(value=img, visible=True), [img], df_vis
543
+
544
def format_results_to_text(results):
    """Render the ranked result metadata as a plain-text report."""
    if not results:
        return "No results found."

    divider = "-" * 30 + "\n"
    parts = [f"Top {len(results)} Retrieval Results\n", "=" * 30 + "\n\n"]
    for rank, entry in enumerate(results, start=1):
        parts.append(f"Rank: {rank}\n")
        parts.append(f"Product ID: {entry['id']}\n")
        parts.append(f"Location: Latitude {entry['lat']:.6f}, Longitude {entry['lon']:.6f}\n")
        parts.append(f"Similarity Score: {entry['score']:.6f}\n")
        parts.append(divider)
    return "".join(parts)
557
+
558
def save_plot(figs):
    """Persist the current results for download.

    Accepts either a single PIL image (saved as .png), a list/tuple of
    [map_image, results_image, results_text] (bundled into a .zip), or an
    object with a write_html method, e.g. a Plotly figure (saved as .html).

    Returns:
        str: path to the written temp file, or None when there is nothing
        to save or an error occurred.
    """
    if figs is None:
        return None
    try:
        # If it's a single image (initial state), save as png
        if isinstance(figs, PILImage.Image):
            fd, path = tempfile.mkstemp(suffix='.png', prefix='earth_explorer_map_')
            os.close(fd)
            figs.save(path)
            return path

        # If it's a list/tuple of images [map_img, results_img]
        if isinstance(figs, (list, tuple)):
            # If only one image in list, save as PNG
            if len(figs) == 1 and isinstance(figs[0], PILImage.Image):
                fd, path = tempfile.mkstemp(suffix='.png', prefix='earth_explorer_map_')
                os.close(fd)
                figs[0].save(path)
                return path

            fd, zip_path = tempfile.mkstemp(suffix='.zip', prefix='earth_explorer_results_')
            os.close(fd)

            with zipfile.ZipFile(zip_path, 'w') as zipf:
                # Save Map
                if figs[0] is not None:
                    map_path = os.path.join(tempfile.gettempdir(), 'map_distribution.png')
                    figs[0].save(map_path)
                    zipf.write(map_path, arcname='map_distribution.png')

                # Save Results
                if len(figs) > 1 and figs[1] is not None:
                    res_path = os.path.join(tempfile.gettempdir(), 'retrieval_results.png')
                    figs[1].save(res_path)
                    zipf.write(res_path, arcname='retrieval_results.png')

                # Save Results Text
                if len(figs) > 2 and figs[2] is not None:
                    txt_path = os.path.join(tempfile.gettempdir(), 'results.txt')
                    with open(txt_path, 'w', encoding='utf-8') as f:
                        f.write(figs[2])
                    zipf.write(txt_path, arcname='results.txt')

            return zip_path

        # Fallback for Plotly figure (if any)
        # Create a temporary file
        fd, path = tempfile.mkstemp(suffix='.html', prefix='earth_explorer_plot_')
        os.close(fd)

        # Write to the temporary file
        figs.write_html(path)
        return path
    except Exception as e:
        print(f"Error saving: {e}")
        return None
614
+
615
# Gradio Blocks Interface
# Three search modes (text / image / location) share one map, one result
# gallery and one download button; the wiring below connects the widgets to
# the search_* / map helpers defined earlier in this file.
with gr.Blocks(title="EarthEmbeddingExplorer") as demo:
    gr.Markdown("# EarthEmbeddingExplorer")
    gr.HTML("""
    <div style="font-size: 1.2em;">
    EarthEmbeddingExplorer is a tool that allows you to search for satellite images of the Earth using natural language descriptions, images, geolocations, or a simple a click on the map. For example, you can type "tropical rainforest" or "coastline with a city," and the system will find locations on Earth that match your description. It then visualizes these locations on a world map and displays the top matching images.
    </div>

    """)

    with gr.Row():
        with gr.Column(scale=4):
            with gr.Tabs():
                with gr.TabItem("Text Search") as tab_text:
                    model_selector_text = gr.Dropdown(choices=["SigLIP", "FarSLIP"], value="FarSLIP", label="Model")
                    query_input = gr.Textbox(label="Query", placeholder="e.g., rainforest, glacier")

                    gr.Examples(
                        examples=[
                            ["a satellite image of a river around a city"],
                            ["a satellite image of a rainforest"],
                            ["a satellite image of a slum"],
                            ["a satellite image of a glacier"],
                            ["a satellite image of snow covered mountains"]
                        ],
                        inputs=[query_input],
                        label="Text Examples"
                    )

                    search_btn = gr.Button("Search by Text", variant="primary")

                with gr.TabItem("Image Search") as tab_image:
                    model_selector_img = gr.Dropdown(choices=["SigLIP", "FarSLIP", "SatCLIP", "DINOv2"], value="FarSLIP", label="Model")

                    gr.Markdown("### Option 1: Upload or Select Image")
                    image_input = gr.Image(type="pil", label="Upload Image")

                    gr.Examples(
                        examples=[
                            ["./examples/example1.png"],
                            ["./examples/example2.png"],
                            ["./examples/example3.png"]
                        ],
                        inputs=[image_input],
                        label="Image Examples"
                    )

                    gr.Markdown("### Option 2: Click Map or Enter Coordinates")
                    btn_reset_map_img = gr.Button("🔄 Reset Map to Global View", variant="secondary", size="sm")

                    with gr.Row():
                        img_lat = gr.Number(label="Latitude", interactive=True)
                        img_lon = gr.Number(label="Longitude", interactive=True)

                    # Hidden field: filled by map clicks, consumed by the
                    # geolocation image download.
                    img_pid = gr.Textbox(label="Product ID (auto-filled)", visible=False)
                    img_click_status = gr.Markdown("")

                    btn_download_img = gr.Button("Download Image by Geolocation", variant="secondary")

                    search_img_btn = gr.Button("Search by Image", variant="primary")

                with gr.TabItem("Location Search") as tab_location:
                    gr.Markdown("Search using **SatCLIP** location encoder.")

                    gr.Markdown("### Click Map or Enter Coordinates")
                    btn_reset_map_loc = gr.Button("🔄 Reset Map to Global View", variant="secondary", size="sm")

                    with gr.Row():
                        lat_input = gr.Number(label="Latitude", value=30.0, interactive=True)
                        lon_input = gr.Number(label="Longitude", value=120.0, interactive=True)

                    loc_pid = gr.Textbox(label="Product ID (auto-filled)", visible=False)
                    loc_click_status = gr.Markdown("")

                    gr.Examples(
                        examples=[
                            [30.32, 120.15],
                            [40.7128, -74.0060],
                            [24.65, 46.71],
                            [-3.4653, -62.2159],
                            [64.4, 16.8]
                        ],
                        inputs=[lat_input, lon_input],
                        label="Location Examples"
                    )

                    search_loc_btn = gr.Button("Search by Location", variant="primary")

            # Shared controls for all three search tabs.
            threshold_slider = gr.Slider(minimum=1, maximum=30, value=7, step=1, label="Top Percentage (‰)")
            status_output = gr.Textbox(label="Status", lines=10)
            save_btn = gr.Button("Download Result")
            download_file = gr.File(label="Zipped Results", height=40)

        with gr.Column(scale=6):
            # Static map (clickable) shown by default; the interactive Plot
            # variant is only revealed by search handlers.
            plot_map = gr.Image(
                label="Geographical Distribution",
                type="pil",
                interactive=False,
                height=400,
                width=800,
                visible=True
            )
            plot_map_interactive = gr.Plot(
                label="Geographical Distribution (Interactive)",
                visible=False
            )
            results_plot = gr.Image(label="Top 5 Matched Images", type="pil")
            gallery_images = gr.Gallery(label="Top Retrieved Images (Zoom)", columns=3, height="auto")

    # current_fig feeds save_plot; map_data_state maps click pixels to
    # geographic coordinates.
    current_fig = gr.State()
    map_data_state = gr.State()

    # Initial Load
    demo.load(fn=get_initial_plot, outputs=[plot_map, current_fig, map_data_state, plot_map_interactive])

    # Reset Map Buttons
    btn_reset_map_img.click(
        fn=reset_to_global_map,
        outputs=[plot_map, current_fig, map_data_state]
    )

    btn_reset_map_loc.click(
        fn=reset_to_global_map,
        outputs=[plot_map, current_fig, map_data_state]
    )

    # Map Click Event - updates Image Search coordinates.
    # NOTE: both .select handlers fire on every click, so one click keeps
    # the Image Search and Location Search coordinate fields in sync.
    plot_map.select(
        fn=handle_map_click,
        inputs=[map_data_state],
        outputs=[img_lat, img_lon, img_pid, img_click_status]
    )

    # Map Click Event - also updates Location Search coordinates
    plot_map.select(
        fn=handle_map_click,
        inputs=[map_data_state],
        outputs=[lat_input, lon_input, loc_pid, loc_click_status]
    )

    # Download Image by Geolocation
    btn_download_img.click(
        fn=download_image_by_location,
        inputs=[img_lat, img_lon, img_pid, model_selector_img],
        outputs=[image_input, img_click_status]
    )

    # Search Event (Text)
    search_btn.click(
        fn=search_text,
        inputs=[query_input, threshold_slider, model_selector_text],
        outputs=[plot_map_interactive, gallery_images, status_output, results_plot, current_fig, map_data_state, plot_map]
    )

    # Search Event (Image)
    search_img_btn.click(
        fn=search_image,
        inputs=[image_input, threshold_slider, model_selector_img],
        outputs=[plot_map_interactive, gallery_images, status_output, results_plot, current_fig, map_data_state, plot_map]
    )

    # Search Event (Location)
    search_loc_btn.click(
        fn=search_location,
        inputs=[lat_input, lon_input, threshold_slider],
        outputs=[plot_map_interactive, gallery_images, status_output, results_plot, current_fig, map_data_state, plot_map]
    )

    # Save Event
    save_btn.click(
        fn=save_plot,
        inputs=[current_fig],
        outputs=[download_file]
    )

    # Tab Selection Events
    def show_static_map():
        """Switch back to the clickable static map when changing tabs."""
        return gr.update(visible=True), gr.update(visible=False)

    tab_text.select(fn=show_static_map, outputs=[plot_map, plot_map_interactive])
    tab_image.select(fn=show_static_map, outputs=[plot_map, plot_map_interactive])
    tab_location.select(fn=show_static_map, outputs=[plot_map, plot_map_interactive])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
compute_embeddings.py ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #!/usr/bin/env python3
3
+ """
4
+ Compute Embeddings for Major-TOM Sentinel-2 Images
5
+
6
+ This script generates embeddings for Sentinel-2 imagery using various models:
7
+ - DINOv2: Vision Transformer trained with self-supervised learning
8
+ - SigLIP: Vision-Language model with sigmoid loss
9
+ - FarSLIP: Remote sensing fine-tuned CLIP
10
+ - SatCLIP: Satellite imagery CLIP with location awareness
11
+
12
+ Usage:
13
+ python compute_embeddings.py --model dinov2 --device cuda:1
14
+ python compute_embeddings.py --model siglip --device cuda:5
15
+ python compute_embeddings.py --model satclip --device cuda:3
16
+ python compute_embeddings.py --model farslip --device cuda:4
17
+
18
+ Author: Generated by Copilot
19
+ """
20
+
21
+ import os
22
+ import sys
23
+ import argparse
24
+ import logging
25
+ from pathlib import Path
26
+ from datetime import datetime
27
+
28
+ import numpy as np
29
+ import pandas as pd
30
+ import torch
31
+ from PIL import Image
32
+ from tqdm.auto import tqdm
33
+
34
+ # Add project root to path
35
+ PROJECT_ROOT = Path(__file__).parent.absolute()
36
+ if str(PROJECT_ROOT) not in sys.path:
37
+ sys.path.insert(0, str(PROJECT_ROOT))
38
+
39
+ from models.load_config import load_and_process_config
40
+
41
+
42
# =============================================================================
# Configuration
# =============================================================================
# NOTE(review): absolute /data1 paths are machine-specific; adjust per host.
METADATA_PATH = Path("/data1/zyj/Core-S2L2A-249k/Core_S2L2A_249k_crop_384x384_metadata.parquet")
IMAGE_PARQUET_DIR = Path("/data1/zyj/Core-S2L2A-249k/images")
OUTPUT_BASE_DIR = Path("/data1/zyj/EarthEmbeddings/Core-S2L2A-249k")

# Columns to remove from output (bulky or redundant per-image metadata)
COLUMNS_TO_REMOVE = ['cloud_cover', 'nodata', 'geometry_wkt', 'bands', 'image_shape', 'image_dtype']

# Columns to rename in the output parquet
COLUMNS_RENAME = {'crs': 'utm_crs'}

# Pixel bbox for center 384x384 crop from 1068x1068 original
# (1068 - 384) / 2 = 342
PIXEL_BBOX = [342, 342, 726, 726]  # [x_min, y_min, x_max, y_max], pixel units

# Output parquet file per model
MODEL_OUTPUT_PATHS = {
    'dinov2': OUTPUT_BASE_DIR / 'dinov2' / 'DINOv2_crop_384x384.parquet',
    'siglip': OUTPUT_BASE_DIR / 'siglip' / 'SigLIP_crop_384x384.parquet',
    'farslip': OUTPUT_BASE_DIR / 'farslip' / 'FarSLIP_crop_384x384.parquet',
    'satclip': OUTPUT_BASE_DIR / 'satclip' / 'SatCLIP_crop_384x384.parquet',
}

# Default batch sizes per model (overridable via --batch-size)
BATCH_SIZES = {
    'dinov2': 64,
    'siglip': 64,
    'farslip': 64,
    'satclip': 128,
}
74
+
75
+
76
+ # =============================================================================
77
+ # Setup Logging
78
+ # =============================================================================
79
def setup_logging(model_name: str) -> logging.Logger:
    """Configure root logging to both a per-model file and stdout.

    Args:
        model_name: Model identifier; used to name the log file
            (logs/compute_embeddings_<model>.log under the project root).

    Returns:
        Logger for this module.
    """
    log_dir = PROJECT_ROOT / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"compute_embeddings_{model_name}.log"

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(sys.stdout)
        ],
        # basicConfig is a silent no-op when the root logger already has
        # handlers (e.g. an imported library configured logging first, or
        # this is called twice); force=True replaces any existing handlers
        # so the file handler is guaranteed to be attached.
        force=True,
    )
    return logging.getLogger(__name__)
94
+
95
+
96
+ # =============================================================================
97
+ # Image Preprocessing Functions
98
+ # =============================================================================
99
def decode_image_bytes(row) -> np.ndarray:
    """Reconstruct the raw image array stored in a parquet row.

    Args:
        row: pandas Series exposing 'image_bytes' (raw buffer),
            'image_shape' (sequence of ints) and 'image_dtype' (dtype name).

    Returns:
        np.ndarray restored to the recorded shape/dtype — for this dataset
        an (H, W, 12) uint16 array.
    """
    target_shape = tuple(int(dim) for dim in row['image_shape'])
    raw = np.frombuffer(row['image_bytes'], dtype=np.dtype(row['image_dtype']))
    return raw.reshape(target_shape)
113
+
114
+
115
def extract_rgb_image(img_array: np.ndarray, clip_max: float = 4000.0) -> Image.Image:
    """Build an 8-bit RGB preview from a 12-band Sentinel-2 array.

    Band layout: [B01, B02, B03, B04, B05, B06, B07, B08, B8A, B09, B11, B12];
    true colour uses R=B04 (index 3), G=B03 (index 2), B=B02 (index 1).

    Args:
        img_array: (H, W, 12) reflectance array.
        clip_max: Reflectance value mapped to full brightness.

    Returns:
        PIL.Image: RGB rendering of the scene.
    """
    # Reorder channels to R, G, B and work in float for the scaling step.
    true_color = img_array[:, :, [3, 2, 1]].astype(np.float32)

    # Scale reflectance into [0, 1], clipping bright outliers at clip_max.
    scaled = np.clip(true_color / clip_max, 0, 1)

    # Quantize to 8 bit and hand off to PIL.
    return Image.fromarray((scaled * 255).astype(np.uint8))
139
+
140
+
141
+ # =============================================================================
142
+ # Model Loading Functions
143
+ # =============================================================================
144
def load_model(model_name: str, device: str, config: dict):
    """
    Load the specified model.

    Args:
        model_name: One of 'dinov2', 'siglip', 'farslip', 'satclip'
        device: Device string like 'cuda:0' or 'cpu'
        config: Configuration dictionary from local.yaml

    Returns:
        Model instance

    Raises:
        ValueError: If model_name is not one of the supported identifiers.
    """
    logger = logging.getLogger(__name__)

    # Imports are kept inside each branch so only the selected backend's
    # dependencies need to be installed.
    if model_name == 'dinov2':
        from models.dinov2_model import DINOv2Model
        model_config = config.get('dinov2', {})
        model = DINOv2Model(
            ckpt_path=model_config.get('ckpt_path', '/data1/zyj/checkpoints/dinov2-large'),
            model_name='facebook/dinov2-large',
            embedding_path=None,  # We're generating, not loading
            device=device
        )
        logger.info(f"DINOv2 model loaded on {device}")
        return model

    elif model_name == 'siglip':
        from models.siglip_model import SigLIPModel
        model_config = config.get('siglip', {})
        model = SigLIPModel(
            ckpt_path=model_config.get('ckpt_path', './checkpoints/ViT-SO400M-14-SigLIP-384/open_clip_pytorch_model.bin'),
            model_name='ViT-SO400M-14-SigLIP-384',
            tokenizer_path=model_config.get('tokenizer_path', './checkpoints/ViT-SO400M-14-SigLIP-384'),
            embedding_path=None,
            device=device
        )
        # Disable embedding loading since we set path to None
        # NOTE(review): only SigLIPModel needs this explicit reset — presumably
        # its constructor eagerly loads embeddings; confirm in models/siglip_model.
        model.df_embed = None
        model.image_embeddings = None
        logger.info(f"SigLIP model loaded on {device}")
        return model

    elif model_name == 'farslip':
        from models.farslip_model import FarSLIPModel
        model_config = config.get('farslip', {})
        model = FarSLIPModel(
            ckpt_path=model_config.get('ckpt_path', './checkpoints/FarSLIP/FarSLIP2_ViT-B-16.pt'),
            model_name='ViT-B-16',
            embedding_path=None,
            device=device
        )
        logger.info(f"FarSLIP model loaded on {device}")
        return model

    elif model_name == 'satclip':
        from models.satclip_ms_model import SatCLIPMSModel
        model_config = config.get('satclip', {})
        model = SatCLIPMSModel(
            ckpt_path=model_config.get('ckpt_path', './checkpoints/SatCLIP/satclip-vit16-l40.ckpt'),
            embedding_path=None,
            device=device
        )
        logger.info(f"SatCLIP-MS model loaded on {device}")
        return model

    else:
        raise ValueError(f"Unknown model: {model_name}")
211
+
212
+
213
+ # =============================================================================
214
+ # Embedding Computation Functions
215
+ # =============================================================================
216
def compute_embedding_single(model, model_name: str, img_array: np.ndarray) -> np.ndarray:
    """Embed one (H, W, 12) image with the given model.

    RGB models (dinov2/siglip/farslip) receive an 8-bit RGB rendering of
    the array; satclip consumes the multi-spectral array directly.

    Args:
        model: Model instance exposing encode_image.
        model_name: Model identifier.
        img_array: numpy array of shape (H, W, 12).

    Returns:
        1-D numpy embedding vector, or None when the model produced nothing
        or *model_name* is unrecognised.
    """
    if model_name in ('dinov2', 'siglip', 'farslip'):
        # RGB-input models: render true colour first.
        feature = model.encode_image(extract_rgb_image(img_array))
    elif model_name == 'satclip':
        # SatCLIP takes the raw multi-spectral stack.
        feature = model.encode_image(img_array, is_multispectral=True)
    else:
        return None

    if feature is None:
        return None
    return feature.cpu().numpy().flatten()
244
+
245
+
246
def compute_embedding_batch(model, model_name: str, img_arrays: list) -> list:
    """
    Compute embeddings for a batch of images.
    Falls back to single-image processing if batch method unavailable.

    Args:
        model: Model instance
        model_name: Model identifier
        img_arrays: List of numpy arrays of shape (H, W, 12)

    Returns:
        List of 1D embedding vectors (numpy arrays), None for failed items
    """
    count = len(img_arrays)

    # Pick per-backend preprocessing and encode kwargs up front so the
    # batch-then-fallback logic below is shared by all models.
    if model_name in ('dinov2', 'siglip', 'farslip'):
        inputs = [extract_rgb_image(arr) for arr in img_arrays]
        encode_kwargs = {}
    elif model_name == 'satclip':
        inputs = img_arrays
        encode_kwargs = {'is_multispectral': True}
    else:
        return [None] * count

    # Preferred path: one batched forward pass.
    if hasattr(model, 'encode_images'):
        try:
            features = model.encode_images(inputs, **encode_kwargs)
            if features is not None:
                return [features[j].cpu().numpy().flatten() for j in range(len(features))]
        except Exception:
            pass  # degrade gracefully to one-by-one encoding

    # Fallback path: encode each item independently; a failure only
    # blanks that single slot.
    embeddings = []
    for item in inputs:
        try:
            feature = model.encode_image(item, **encode_kwargs)
            embeddings.append(feature.cpu().numpy().flatten() if feature is not None else None)
        except Exception:
            embeddings.append(None)
    return embeddings
311
+
312
+ # def process_parquet_file(
313
+ # file_path: Path,
314
+ # model,
315
+ # model_name: str,
316
+ # batch_size: int = 64
317
+ # ) -> pd.DataFrame:
318
+ # """
319
+ # Process a single parquet file and generate embeddings.
320
+
321
+ # Args:
322
+ # file_path: Path to input parquet file
323
+ # model: Model instance
324
+ # model_name: Model identifier
325
+ # batch_size: Batch size for processing
326
+
327
+ # Returns:
328
+ # DataFrame with embeddings
329
+ # """
330
+ # logger = logging.getLogger(__name__)
331
+
332
+ # # Load data
333
+ # df = pd.read_parquet(file_path)
334
+
335
+ # embeddings_list = []
336
+ # valid_indices = []
337
+
338
+ # # Process in batches (for future batch optimization)
339
+ # for idx, row in df.iterrows():
340
+ # try:
341
+ # # Decode image
342
+ # img_array = decode_image_bytes(row)
343
+
344
+ # # Compute embedding
345
+ # embedding = compute_embedding_single(model, model_name, img_array)
346
+
347
+ # if embedding is not None:
348
+ # embeddings_list.append(embedding)
349
+ # valid_indices.append(idx)
350
+
351
+ # except Exception as e:
352
+ # logger.warning(f"Error processing row {idx}: {e}")
353
+ # continue
354
+
355
+ # if not embeddings_list:
356
+ # logger.warning(f"No valid embeddings for {file_path.name}")
357
+ # return None
358
+
359
+ # # Build result DataFrame
360
+ # result_df = df.loc[valid_indices].copy()
361
+
362
+ # # Remove unwanted columns
363
+ # cols_to_drop = [c for c in COLUMNS_TO_REMOVE if c in result_df.columns]
364
+ # if cols_to_drop:
365
+ # result_df = result_df.drop(columns=cols_to_drop)
366
+
367
+ # # Remove image_bytes (large binary data)
368
+ # if 'image_bytes' in result_df.columns:
369
+ # result_df = result_df.drop(columns=['image_bytes'])
370
+
371
+ # # Remove geometry column (binary)
372
+ # if 'geometry' in result_df.columns:
373
+ # result_df = result_df.drop(columns=['geometry'])
374
+
375
+ # # Rename columns
376
+ # result_df = result_df.rename(columns=COLUMNS_RENAME)
377
+
378
+ # # Add pixel_bbox
379
+ # result_df['pixel_bbox'] = [PIXEL_BBOX] * len(result_df)
380
+
381
+ # # Add embedding
382
+ # result_df['embedding'] = embeddings_list
383
+
384
+ # return result_df
385
+
386
def process_parquet_file(
    file_path: Path,
    model,
    model_name: str,
    batch_size: int = 64
) -> pd.DataFrame:
    """
    Process a single parquet file and generate embeddings using batch processing.

    Rows that fail to decode or embed are dropped from the output silently
    (after a logged warning), so the returned frame may be shorter than the
    input file.

    Args:
        file_path: Path to input parquet file
        model: Model instance
        model_name: Model identifier
        batch_size: Batch size for processing

    Returns:
        DataFrame with embeddings, or None if no row produced an embedding.
    """
    logger = logging.getLogger(__name__)

    # Load data
    df = pd.read_parquet(file_path)
    n_rows = len(df)

    # Positional slots so batch results can be written back by index.
    embeddings_list = [None] * n_rows
    valid_mask = [False] * n_rows

    # Process in batches
    for batch_start in range(0, n_rows, batch_size):
        batch_end = min(batch_start + batch_size, n_rows)
        batch_indices = list(range(batch_start, batch_end))

        # Decode images for this batch
        batch_arrays = []
        batch_valid_indices = []

        for idx in batch_indices:
            try:
                row = df.iloc[idx]
                img_array = decode_image_bytes(row)
                batch_arrays.append(img_array)
                batch_valid_indices.append(idx)
            except Exception as e:
                logger.warning(f"Error decoding row {idx}: {e}")
                continue

        if not batch_arrays:
            continue

        # Compute embeddings for this batch
        try:
            batch_embeddings = compute_embedding_batch(model, model_name, batch_arrays)

            # Store results
            for i, idx in enumerate(batch_valid_indices):
                if batch_embeddings[i] is not None:
                    embeddings_list[idx] = batch_embeddings[i]
                    valid_mask[idx] = True

        except Exception as e:
            logger.warning(f"Error computing batch embeddings: {e}")
            # Fall back to single image processing for this batch
            for i, idx in enumerate(batch_valid_indices):
                try:
                    embedding = compute_embedding_single(model, model_name, batch_arrays[i])
                    if embedding is not None:
                        embeddings_list[idx] = embedding
                        valid_mask[idx] = True
                except Exception as inner_e:
                    logger.warning(f"Error processing row {idx}: {inner_e}")
                    continue

    # Filter to valid rows only
    valid_indices = [i for i, v in enumerate(valid_mask) if v]

    if not valid_indices:
        logger.warning(f"No valid embeddings for {file_path.name}")
        return None

    # Build result DataFrame (positional indexing matches the loop above)
    result_df = df.iloc[valid_indices].copy()
    valid_embeddings = [embeddings_list[i] for i in valid_indices]

    # Remove unwanted columns
    cols_to_drop = [c for c in COLUMNS_TO_REMOVE if c in result_df.columns]
    if cols_to_drop:
        result_df = result_df.drop(columns=cols_to_drop)

    # Remove image_bytes (large binary data)
    if 'image_bytes' in result_df.columns:
        result_df = result_df.drop(columns=['image_bytes'])

    # Remove geometry column (binary)
    if 'geometry' in result_df.columns:
        result_df = result_df.drop(columns=['geometry'])

    # Rename columns
    result_df = result_df.rename(columns=COLUMNS_RENAME)

    # Add pixel_bbox (same center-crop window for every row)
    result_df['pixel_bbox'] = [PIXEL_BBOX] * len(result_df)

    # Add embedding (object column of 1-D numpy arrays)
    result_df['embedding'] = valid_embeddings

    return result_df
492
+
493
+ # =============================================================================
494
+ # Main Processing Pipeline
495
+ # =============================================================================
496
def main():
    """Entry point: parse CLI args, load the chosen model, embed every
    batch_*.parquet shard, then merge and save a single output parquet.

    Returns:
        0 on success, 1 if no shard produced any data (used as exit code).
    """
    parser = argparse.ArgumentParser(description='Compute embeddings for Major-TOM images')
    parser.add_argument('--model', type=str, required=True,
                        choices=['dinov2', 'siglip', 'farslip', 'satclip'],
                        help='Model to use for embedding computation')
    parser.add_argument('--device', type=str, default='cuda:0',
                        help='Device to run on (e.g., cuda:0, cuda:1, cpu)')
    parser.add_argument('--batch-size', type=int, default=None,
                        help='Batch size for processing (default: model-specific)')
    parser.add_argument('--max-files', type=int, default=None,
                        help='Maximum number of files to process (for testing)')

    args = parser.parse_args()

    # Setup logging
    logger = setup_logging(args.model)

    logger.info("=" * 80)
    logger.info(f"Computing {args.model.upper()} embeddings")
    logger.info(f"Timestamp: {datetime.now().isoformat()}")
    logger.info(f"Device: {args.device}")
    logger.info("=" * 80)

    # Load configuration (model checkpoint/tokenizer paths)
    config = load_and_process_config()
    if config is None:
        logger.warning("No config file found, using default paths")
        config = {}

    # Determine batch size (CLI override wins over per-model default)
    batch_size = args.batch_size or BATCH_SIZES.get(args.model, 64)
    logger.info(f"Batch size: {batch_size}")

    # Get output path
    output_path = MODEL_OUTPUT_PATHS[args.model]
    output_path.parent.mkdir(parents=True, exist_ok=True)
    logger.info(f"Output path: {output_path}")

    # Load model
    logger.info(f"Loading {args.model} model...")
    model = load_model(args.model, args.device, config)

    # Get input files (sorted for deterministic shard order)
    parquet_files = sorted(IMAGE_PARQUET_DIR.glob("batch_*.parquet"))
    if args.max_files:
        parquet_files = parquet_files[:args.max_files]

    logger.info(f"Found {len(parquet_files)} input files")

    # Process files; a failing shard is logged and skipped, not fatal.
    all_results = []
    total_rows = 0

    for file_path in tqdm(parquet_files, desc=f"Processing {args.model}"):
        try:
            result_df = process_parquet_file(file_path, model, args.model, batch_size)

            if result_df is not None:
                all_results.append(result_df)
                total_rows += len(result_df)
                logger.info(f"[{file_path.name}] Processed {len(result_df)} rows")

        except Exception as e:
            logger.error(f"Error processing {file_path.name}: {e}")
            import traceback
            traceback.print_exc()
            continue

    # Merge and save
    if all_results:
        logger.info("Merging all results...")
        final_df = pd.concat(all_results, ignore_index=True)

        # Validate columns
        logger.info(f"Final columns: {list(final_df.columns)}")

        # Check for removed columns
        removed = [c for c in COLUMNS_TO_REMOVE if c in final_df.columns]
        if removed:
            logger.warning(f"Columns still present that should be removed: {removed}")
        else:
            logger.info("✓ All unwanted columns removed")

        # Check for renamed columns
        if 'utm_crs' in final_df.columns and 'crs' not in final_df.columns:
            logger.info("✓ Column 'crs' renamed to 'utm_crs'")

        # Check for pixel_bbox
        if 'pixel_bbox' in final_df.columns:
            logger.info("✓ Column 'pixel_bbox' added")

        # Save
        logger.info(f"Saving to {output_path}...")
        final_df.to_parquet(output_path, index=False)

        logger.info(f"=" * 80)
        logger.info(f"Processing complete!")
        logger.info(f"Total rows: {len(final_df):,}")
        logger.info(f"Embedding dimension: {len(final_df['embedding'].iloc[0])}")
        logger.info(f"Output file: {output_path}")
        logger.info(f"=" * 80)

    else:
        logger.error("No data processed!")
        return 1

    return 0
603
+
604
+
605
if __name__ == "__main__":
    # Propagate main()'s 0/1 status as the process exit code.
    sys.exit(main())
configs/huggingface.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ siglip:
2
+ ckpt_path: "hf"
3
+ model_name: "ViT-SO400M-14-SigLIP-384"
4
+ tokenizer_path: "hf"
5
  embedding_path: "hf://ML4RS-Anonymous/EarthEmbeddings/Core-S2L2A-249k/siglip/SigLIP_crop_384x384.parquet.parquet"  # NOTE(review): doubled ".parquet.parquet" extension — compute_embeddings.py writes "..._384x384.parquet"; confirm the actual repo filename (same doubling appears in the farslip/satclip/dinov2 entries below)
6
+ farslip:
7
+ ckpt_path: "hf"
8
+ model_name: "ViT-B-16"
9
+ embedding_path: "hf://ML4RS-Anonymous/EarthEmbeddings/Core-S2L2A-249k/farslip/FarSLIP_crop_384x384.parquet.parquet"
10
+ satclip:
11
+ ckpt_path: "hf"
12
+ embedding_path: "hf://ML4RS-Anonymous/EarthEmbeddings/Core-S2L2A-249k/satclip/SatCLIP_crop_384x384.parquet.parquet"
13
+ dinov2:
14
+ ckpt_path: "hf"
15
+ embedding_path: "hf://ML4RS-Anonymous/EarthEmbeddings/Core-S2L2A-249k/dinov2/DINOv2_crop_384x384.parquet.parquet"
countries.geo.json ADDED
The diff for this file is too large to render. See raw diff
 
data_utils.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fsspec
2
+ import pyarrow.parquet as pq
3
+ import numpy as np
4
+ from PIL import Image
5
+ from io import BytesIO
6
+ from rasterio.io import MemoryFile
7
+ import matplotlib.pyplot as plt
8
+ import cartopy.crs as ccrs
9
+ import cartopy.io.img_tiles as cimgt
10
+ from matplotlib.patches import Rectangle
11
+ import math
12
+ from matplotlib.figure import Figure
13
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
14
+
15
+
16
def crop_center(img_array, cropx, cropy):
    """Return the centred ``cropy`` x ``cropx`` window of an (H, W, C) array.

    Uses integer-centre arithmetic, so for odd differences the extra row or
    column falls on the bottom/right side.
    """
    height, width, _channels = img_array.shape
    left = width // 2 - cropx // 2
    top = height // 2 - cropy // 2
    return img_array[top:top + cropy, left:left + cropx]
21
+
22
def read_tif_bytes(tif_bytes):
    """Decode an in-memory GeoTIFF byte string into a squeezed numpy array."""
    # rasterio's MemoryFile lets us open the bytes as a dataset without
    # touching disk; squeeze() drops a singleton band axis for 1-band rasters.
    with MemoryFile(tif_bytes) as memfile:
        with memfile.open(driver='GTiff') as dataset:
            return dataset.read().squeeze()
26
+
27
def read_row_memory(row_dict, columns=None):
    """Fetch one row group of a remote Parquet file and decode its columns.

    Parameters
    ----------
    row_dict : mapping with 'parquet_url' (an fsspec-readable URL) and
        'parquet_row' (the row-group index inside that file).
    columns : list of column names to read; defaults to ``["thumbnail"]``.

    Returns
    -------
    dict mapping each requested column to its decoded value: the
    'thumbnail' column is opened as a PIL image, every other column is
    treated as GeoTIFF bytes and decoded via read_tif_bytes().
    """
    # None sentinel instead of a mutable default list (shared-default pitfall).
    if columns is None:
        columns = ["thumbnail"]

    url = row_dict['parquet_url']
    row_idx = row_dict['parquet_row']

    # Readahead caching reduces the number of small HTTP range requests.
    fs_options = {
        "cache_type": "readahead",
        "block_size": 5 * 1024 * 1024
    }

    with fsspec.open(url, mode='rb', **fs_options) as f:
        with pq.ParquetFile(f) as pf:
            # NOTE(review): 'parquet_row' is used as a *row-group* index,
            # which assumes one row per row group in these files — confirm.
            table = pf.read_row_group(row_idx, columns=columns)

    # The Arrow table is fully materialized, so decoding can happen after
    # the remote file handles are closed.
    row_output = {}
    for col in columns:
        col_data = table[col][0].as_py()

        if col != 'thumbnail':
            row_output[col] = read_tif_bytes(col_data)
        else:
            stream = BytesIO(col_data)
            row_output[col] = Image.open(stream)

    return row_output
51
+
52
def download_and_process_image(product_id, df_source=None, verbose=True):
    """Download Sentinel-2 RGB bands for *product_id* and build display images.

    Looks up the product's metadata row in *df_source*, rewrites the parquet
    URL to the ModelScope mirror, fetches bands B04/B03/B02, and converts
    them to an 8-bit RGB image.

    Parameters
    ----------
    product_id : value matched against ``df_source['product_id']``.
    df_source : pandas DataFrame with at least 'product_id' plus the
        'parquet_url'/'parquet_row' columns read_row_memory() needs.
    verbose : print progress/diagnostic messages when True.

    Returns
    -------
    (img_384, img_full) : a 384x384 centre crop (or a resize when the scene
        is smaller) and the full-size PIL image, or (None, None) on failure.
    """
    if df_source is None:
        if verbose: print("❌ Error: No DataFrame provided.")
        return None, None

    row_subset = df_source[df_source['product_id'] == product_id]
    if len(row_subset) == 0:
        if verbose: print(f"❌ Error: Product ID {product_id} not found in DataFrame.")
        return None, None

    row_dict = row_subset.iloc[0].to_dict()

    # Redirect Hugging Face / hf-mirror hosted parquet files to the
    # ModelScope mirror (same content, different path convention).
    if 'parquet_url' in row_dict:
        url = row_dict['parquet_url']
        if 'huggingface.co' in url:
            row_dict['parquet_url'] = url.replace('https://huggingface.co', 'https://modelscope.cn').replace('resolve/main', 'resolve/master')
        elif 'hf-mirror.com' in url:
            row_dict['parquet_url'] = url.replace('https://hf-mirror.com', 'https://modelscope.cn').replace('resolve/main', 'resolve/master')
    else:
        if verbose: print("❌ Error: 'parquet_url' missing in metadata.")
        return None, None

    if verbose: print(f"⬇️ Fetching data for {product_id} from {row_dict['parquet_url']}...")

    try:
        bands_data = read_row_memory(row_dict, columns=['B04', 'B03', 'B02'])

        if not all(b in bands_data for b in ['B04', 'B03', 'B02']):
            if verbose: print(f"❌ Error: Missing bands in fetched data for {product_id}")
            return None, None

        # Stack to (H, W, 3) in R, G, B order.
        rgb_img = np.stack([bands_data['B04'], bands_data['B03'], bands_data['B02']], axis=-1)

        if verbose:
            print(f"Raw RGB stats: Min={rgb_img.min()}, Max={rgb_img.max()}, Mean={rgb_img.mean()}, Dtype={rgb_img.dtype}")

        # Reflectance scaling: divide by 10000, apply a 2.5x brightness
        # gain, clip to [0, 1], then quantize to 8-bit.
        # (Removed a dead `if rgb_img.max() <= 255: pass` branch here.)
        rgb_norm = (2.5 * (rgb_img.astype(float) / 10000.0)).clip(0, 1)
        rgb_uint8 = (rgb_norm * 255).astype(np.uint8)

        if verbose:
            print(f"Processed RGB stats: Min={rgb_uint8.min()}, Max={rgb_uint8.max()}, Mean={rgb_uint8.mean()}")

        img_full = Image.fromarray(rgb_uint8)

        # Centre-crop to the 384x384 window the embedding models expect;
        # fall back to a resize when the scene is smaller than that.
        if rgb_uint8.shape[0] >= 384 and rgb_uint8.shape[1] >= 384:
            cropped_array = crop_center(rgb_uint8, 384, 384)
            img_384 = Image.fromarray(cropped_array)
        else:
            if verbose: print(f"⚠️ Image too small {rgb_uint8.shape}, resizing to 384x384.")
            img_384 = img_full.resize((384, 384))

        if verbose: print(f"✅ Successfully processed {product_id}")
        return img_384, img_full

    except Exception as e:
        if verbose: print(f"❌ Error processing {product_id}: {e}")
        import traceback
        traceback.print_exc()
        return None, None
116
+
117
# Define Esri Imagery Class
class EsriImagery(cimgt.GoogleTiles):
    """Cartopy tile source that serves Esri World Imagery (satellite) tiles."""

    def _image_url(self, tile):
        # Cartopy hands us (x, y, zoom); Esri's REST endpoint orders the
        # path segments as /{z}/{y}/{x}.
        x, y, z = tile
        return f'https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}'
122
+
123
+ from PIL import Image, ImageDraw, ImageFont
124
+
125
def get_placeholder_image(text="Image Unavailable", size=(384, 384)):
    """Return a grey placeholder PIL image with *text* drawn on it.

    Used when a satellite map or product image cannot be fetched so the UI
    always has something to display.
    """
    img = Image.new('RGB', size, color=(200, 200, 200))
    d = ImageDraw.Draw(img)
    try:
        # PIL's bundled bitmap font; no external font files required.
        font = ImageFont.load_default()
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C isn't swallowed;
        # ImageDraw.text() falls back to an internal default when font=None.
        font = None

    # Rough vertical centring only; exact centring would need font metrics,
    # which is overkill for a placeholder.
    d.text((20, size[1]//2), text, fill=(0, 0, 0), font=font)
    return img
138
+
139
def get_esri_satellite_image(lat, lon, score=None, rank=None, query=None):
    """
    Generates a satellite image visualization using Esri World Imagery via Cartopy.
    Matches the style of the provided notebook.
    Uses OO Matplotlib API for thread safety.

    Parameters:
        lat, lon: centre coordinates in degrees (WGS84).
        score: optional similarity score shown in the title.
        rank: optional result rank shown in the title.
        query: optional query string shown in the title.

    Returns:
        A PIL image of the rendered map, or a grey placeholder image when
        tile fetching / rendering fails.
    """
    try:
        imagery = EsriImagery()

        # Create figure using OO API (avoids pyplot's global state, which
        # is not safe across threads).
        fig = Figure(figsize=(5, 5), dpi=100)
        # Attaching the Agg canvas registers it on `fig`; the variable
        # itself is intentionally unused afterwards.
        canvas = FigureCanvasAgg(fig)
        ax = fig.add_subplot(1, 1, 1, projection=imagery.crs)

        # Set extent to approx 10km x 10km around the point
        extent_deg = 0.05
        ax.set_extent([lon - extent_deg, lon + extent_deg, lat - extent_deg, lat + extent_deg], crs=ccrs.PlateCarree())

        # Add the imagery (zoom level 14 tiles)
        ax.add_image(imagery, 14)

        # Add a marker for the center
        ax.plot(lon, lat, marker='+', color='yellow', markersize=12, markeredgewidth=2, transform=ccrs.PlateCarree())

        # Add Bounding Box (3840m x 3840m) — the 384px x 10m/px footprint
        # of the embedded Sentinel-2 crop.
        box_size_m = 384 * 10 # 3840m

        # Convert meters to degrees (approx)
        # 1 deg lat = 111320m
        # 1 deg lon = 111320m * cos(lat)
        dlat = (box_size_m / 111320)
        dlon = (box_size_m / (111320 * math.cos(math.radians(lat))))

        # Bottom-Left corner
        rect_lon = lon - dlon / 2
        rect_lat = lat - dlat / 2

        # Add Rectangle
        rect = Rectangle((rect_lon, rect_lat), dlon, dlat,
                         linewidth=2, edgecolor='red', facecolor='none', transform=ccrs.PlateCarree())
        ax.add_patch(rect)

        # Title: only the parts that were supplied, one per line.
        title_parts = []
        if query: title_parts.append(f"{query}")
        if rank is not None: title_parts.append(f"Rank {rank}")
        if score is not None: title_parts.append(f"Score: {score:.4f}")

        ax.set_title("\n".join(title_parts), fontsize=10)

        # Save to an in-memory PNG buffer and hand it back as a PIL image.
        buf = BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        buf.seek(0)

        return Image.open(buf)

    except Exception as e:
        # Suppress full traceback for network errors to avoid log spam
        error_msg = str(e)
        if "Connection reset by peer" in error_msg or "Network is unreachable" in error_msg or "urlopen error" in error_msg:
            print(f"⚠️ Network warning: Could not fetch Esri satellite map for ({lat:.4f}, {lon:.4f}). Server might be offline.")
        else:
            print(f"Error generating Esri image for {lat}, {lon}: {e}")
            # Only print traceback for non-network errors
            # import traceback
            # traceback.print_exc()

        # Return a placeholder image with text
        return get_placeholder_image(f"Map Unavailable\n({lat:.2f}, {lon:.2f})")
209
+
210
def get_esri_satellite_image_url(lat, lon, zoom=14):
    """Placeholder: direct tile-URL computation is not implemented.

    The original body only instantiated EsriImagery inside a try/except and
    fell through, so it always returned None; this version keeps that exact
    external behaviour without the pointless work or the bare except.

    Parameters:
        lat, lon: centre coordinates in degrees (currently unused).
        zoom: slippy-map zoom level (currently unused).

    Returns:
        None, always. Callers should use get_esri_satellite_image() for a
        rendered static map instead.
    """
    # TODO: compute slippy-map tile indices from (lat, lon, zoom) and
    # format the Esri World_Imagery tile URL directly.
    return None
examples/example1.png ADDED

Git LFS Details

  • SHA256: 07dd836c4dfe700657f163afdae9ebf2685f83dca1417078b3147c8c31f598a9
  • Pointer size: 131 Bytes
  • Size of remote file: 225 kB
examples/example2.png ADDED

Git LFS Details

  • SHA256: e52a44517c028cb6b9828c37c974991fb20122f6cdba951e809ac66b7c591552
  • Pointer size: 132 Bytes
  • Size of remote file: 1.27 MB
examples/example3.png ADDED

Git LFS Details

  • SHA256: d63b587c17943eb1e60f511def466696c1a12a323f0f67dff99da7631e2e48aa
  • Pointer size: 131 Bytes
  • Size of remote file: 507 kB
logs/compute_embeddings_dinov2.log ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-01 09:07:55,115 [INFO] ================================================================================
2
+ 2026-02-01 09:07:55,115 [INFO] Computing DINOV2 embeddings
3
+ 2026-02-01 09:07:55,115 [INFO] Timestamp: 2026-02-01T09:07:55.115269
4
+ 2026-02-01 09:07:55,115 [INFO] Device: cuda:0
5
+ 2026-02-01 09:07:55,115 [INFO] ================================================================================
6
+ 2026-02-01 09:07:55,116 [INFO] Batch size: 64
7
+ 2026-02-01 09:07:55,116 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/dinov2/DINOv2_crop_384x384.parquet
8
+ 2026-02-01 09:07:55,116 [INFO] Loading dinov2 model...
9
+ 2026-02-01 09:07:58,665 [INFO] DINOv2 model loaded on cuda:0
10
+ 2026-02-01 09:07:58,666 [INFO] Found 1 input files
11
+ 2026-02-01 09:08:48,122 [INFO] [batch_0001_384x384.parquet] Processed 1996 rows
12
+ 2026-02-01 09:08:48,122 [INFO] Merging all results...
13
+ 2026-02-01 09:08:48,122 [INFO] Final columns: ['grid_cell', 'grid_row_u', 'grid_col_r', 'product_id', 'timestamp', 'centre_lat', 'centre_lon', 'utm_crs', 'parquet_url', 'parquet_row', 'pixel_bbox', 'embedding']
14
+ 2026-02-01 09:08:48,122 [INFO] ✓ All unwanted columns removed
15
+ 2026-02-01 09:08:48,122 [INFO] ✓ Column 'crs' renamed to 'utm_crs'
16
+ 2026-02-01 09:08:48,122 [INFO] ✓ Column 'pixel_bbox' added
17
+ 2026-02-01 09:08:48,122 [INFO] Saving to /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/dinov2/DINOv2_crop_384x384.parquet...
18
+ 2026-02-01 09:08:48,228 [INFO] ================================================================================
19
+ 2026-02-01 09:08:48,228 [INFO] Processing complete!
20
+ 2026-02-01 09:08:48,228 [INFO] Total rows: 1,996
21
+ 2026-02-01 09:08:48,228 [INFO] Embedding dimension: 1024
22
+ 2026-02-01 09:08:48,228 [INFO] Output file: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/dinov2/DINOv2_crop_384x384.parquet
23
+ 2026-02-01 09:08:48,228 [INFO] ================================================================================
24
+ 2026-02-01 09:43:06,596 [INFO] ================================================================================
25
+ 2026-02-01 09:43:06,596 [INFO] Computing DINOV2 embeddings
26
+ 2026-02-01 09:43:06,596 [INFO] Timestamp: 2026-02-01T09:43:06.596521
27
+ 2026-02-01 09:43:06,596 [INFO] Device: cuda:1
28
+ 2026-02-01 09:43:06,596 [INFO] ================================================================================
29
+ 2026-02-01 09:43:06,597 [INFO] Batch size: 64
30
+ 2026-02-01 09:43:06,597 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/dinov2/DINOv2_crop_384x384.parquet
31
+ 2026-02-01 09:43:06,597 [INFO] Loading dinov2 model...
32
+ 2026-02-01 09:43:08,665 [INFO] DINOv2 model loaded on cuda:1
33
+ 2026-02-01 09:43:08,666 [INFO] Found 125 input files
34
+ 2026-02-01 09:43:59,600 [INFO] [batch_0001_384x384.parquet] Processed 1996 rows
35
+ 2026-02-01 09:44:50,531 [INFO] [batch_0002_384x384.parquet] Processed 1990 rows
36
+ 2026-02-01 09:45:40,104 [INFO] [batch_0003_384x384.parquet] Processed 1997 rows
37
+ 2026-02-01 09:46:31,203 [INFO] [batch_0004_384x384.parquet] Processed 1992 rows
38
+ 2026-02-01 09:47:22,240 [INFO] [batch_0005_384x384.parquet] Processed 1992 rows
39
+ 2026-02-01 09:48:17,789 [INFO] [batch_0006_384x384.parquet] Processed 1992 rows
40
+ 2026-02-01 09:49:12,206 [INFO] [batch_0007_384x384.parquet] Processed 1998 rows
41
+ 2026-02-01 09:50:04,633 [INFO] [batch_0008_384x384.parquet] Processed 1994 rows
42
+ 2026-02-01 09:51:01,688 [INFO] [batch_0009_384x384.parquet] Processed 1993 rows
43
+ 2026-02-01 09:51:52,258 [INFO] [batch_0010_384x384.parquet] Processed 1993 rows
44
+ 2026-02-01 09:52:43,385 [INFO] [batch_0011_384x384.parquet] Processed 1990 rows
45
+ 2026-02-01 09:53:33,664 [INFO] [batch_0012_384x384.parquet] Processed 1993 rows
46
+ 2026-02-01 09:54:23,450 [INFO] [batch_0013_384x384.parquet] Processed 1993 rows
47
+ 2026-02-01 09:55:14,741 [INFO] [batch_0014_384x384.parquet] Processed 1991 rows
48
+ 2026-02-01 09:56:05,637 [INFO] [batch_0015_384x384.parquet] Processed 1993 rows
49
+ 2026-02-01 09:57:02,579 [INFO] [batch_0016_384x384.parquet] Processed 1990 rows
50
+ 2026-02-01 09:57:59,164 [INFO] [batch_0017_384x384.parquet] Processed 1991 rows
51
+ 2026-02-01 09:58:54,668 [INFO] [batch_0018_384x384.parquet] Processed 1991 rows
52
+ 2026-02-01 09:59:50,748 [INFO] [batch_0019_384x384.parquet] Processed 1996 rows
53
+ 2026-02-01 10:00:44,987 [INFO] [batch_0020_384x384.parquet] Processed 1994 rows
54
+ 2026-02-01 10:01:41,422 [INFO] [batch_0021_384x384.parquet] Processed 1996 rows
55
+ 2026-02-01 10:02:39,884 [INFO] [batch_0022_384x384.parquet] Processed 1995 rows
56
+ 2026-02-01 10:03:41,408 [INFO] [batch_0023_384x384.parquet] Processed 1992 rows
57
+ 2026-02-01 10:04:44,392 [INFO] [batch_0024_384x384.parquet] Processed 1989 rows
58
+ 2026-02-01 10:05:47,970 [INFO] [batch_0025_384x384.parquet] Processed 1993 rows
59
+ 2026-02-01 10:06:47,594 [INFO] [batch_0026_384x384.parquet] Processed 1995 rows
60
+ 2026-02-01 10:07:46,292 [INFO] [batch_0027_384x384.parquet] Processed 1997 rows
61
+ 2026-02-01 10:08:43,976 [INFO] [batch_0028_384x384.parquet] Processed 1991 rows
62
+ 2026-02-01 10:09:43,099 [INFO] [batch_0029_384x384.parquet] Processed 1992 rows
63
+ 2026-02-01 10:10:40,183 [INFO] [batch_0030_384x384.parquet] Processed 1993 rows
64
+ 2026-02-01 10:11:44,485 [INFO] [batch_0031_384x384.parquet] Processed 1988 rows
65
+ 2026-02-01 10:12:39,796 [INFO] [batch_0032_384x384.parquet] Processed 1994 rows
66
+ 2026-02-01 10:13:45,836 [INFO] [batch_0033_384x384.parquet] Processed 1992 rows
67
+ 2026-02-01 10:14:44,908 [INFO] [batch_0034_384x384.parquet] Processed 1994 rows
68
+ 2026-02-01 10:15:44,326 [INFO] [batch_0035_384x384.parquet] Processed 1994 rows
69
+ 2026-02-01 10:16:43,931 [INFO] [batch_0036_384x384.parquet] Processed 1996 rows
70
+ 2026-02-01 10:17:41,513 [INFO] [batch_0037_384x384.parquet] Processed 1995 rows
71
+ 2026-02-01 10:18:39,810 [INFO] [batch_0038_384x384.parquet] Processed 1993 rows
72
+ 2026-02-01 10:19:36,710 [INFO] [batch_0039_384x384.parquet] Processed 1989 rows
73
+ 2026-02-01 10:20:31,841 [INFO] [batch_0040_384x384.parquet] Processed 1990 rows
74
+ 2026-02-01 10:21:29,236 [INFO] [batch_0041_384x384.parquet] Processed 1998 rows
75
+ 2026-02-01 10:22:32,483 [INFO] [batch_0042_384x384.parquet] Processed 1997 rows
76
+ 2026-02-01 10:23:28,852 [INFO] [batch_0043_384x384.parquet] Processed 1988 rows
77
+ 2026-02-01 10:24:24,324 [INFO] [batch_0044_384x384.parquet] Processed 1991 rows
78
+ 2026-02-01 10:25:22,097 [INFO] [batch_0045_384x384.parquet] Processed 1993 rows
79
+ 2026-02-01 10:26:18,196 [INFO] [batch_0046_384x384.parquet] Processed 1994 rows
80
+ 2026-02-01 10:27:34,649 [INFO] [batch_0047_384x384.parquet] Processed 1995 rows
81
+ 2026-02-01 10:28:30,976 [INFO] [batch_0048_384x384.parquet] Processed 1992 rows
82
+ 2026-02-01 10:29:41,715 [INFO] [batch_0049_384x384.parquet] Processed 1996 rows
83
+ 2026-02-01 10:30:45,082 [INFO] [batch_0050_384x384.parquet] Processed 1991 rows
84
+ 2026-02-01 10:31:46,711 [INFO] [batch_0051_384x384.parquet] Processed 1997 rows
85
+ 2026-02-01 10:32:45,127 [INFO] [batch_0052_384x384.parquet] Processed 1993 rows
86
+ 2026-02-01 10:33:48,960 [INFO] [batch_0053_384x384.parquet] Processed 1995 rows
87
+ 2026-02-01 10:35:01,705 [INFO] [batch_0054_384x384.parquet] Processed 1997 rows
88
+ 2026-02-01 10:36:11,677 [INFO] [batch_0055_384x384.parquet] Processed 1995 rows
89
+ 2026-02-01 10:37:17,746 [INFO] [batch_0056_384x384.parquet] Processed 1997 rows
90
+ 2026-02-01 10:38:28,458 [INFO] [batch_0057_384x384.parquet] Processed 1991 rows
91
+ 2026-02-01 10:39:38,673 [INFO] [batch_0058_384x384.parquet] Processed 1994 rows
92
+ 2026-02-01 10:40:48,784 [INFO] [batch_0059_384x384.parquet] Processed 1993 rows
93
+ 2026-02-01 10:41:47,477 [INFO] [batch_0060_384x384.parquet] Processed 1995 rows
94
+ 2026-02-01 10:42:55,595 [INFO] [batch_0061_384x384.parquet] Processed 1995 rows
95
+ 2026-02-01 10:44:08,413 [INFO] [batch_0062_384x384.parquet] Processed 1998 rows
96
+ 2026-02-01 10:45:27,616 [INFO] [batch_0063_384x384.parquet] Processed 1997 rows
97
+ 2026-02-01 10:46:40,936 [INFO] [batch_0064_384x384.parquet] Processed 1992 rows
98
+ 2026-02-01 10:47:38,737 [INFO] [batch_0065_384x384.parquet] Processed 1994 rows
99
+ 2026-02-01 10:48:46,233 [INFO] [batch_0066_384x384.parquet] Processed 1992 rows
100
+ 2026-02-01 10:49:56,228 [INFO] [batch_0067_384x384.parquet] Processed 1993 rows
101
+ 2026-02-01 10:51:12,380 [INFO] [batch_0068_384x384.parquet] Processed 1994 rows
102
+ 2026-02-01 10:52:27,369 [INFO] [batch_0069_384x384.parquet] Processed 1992 rows
103
+ 2026-02-01 10:53:42,056 [INFO] [batch_0070_384x384.parquet] Processed 1997 rows
104
+ 2026-02-01 10:54:50,573 [INFO] [batch_0071_384x384.parquet] Processed 1996 rows
105
+ 2026-02-01 10:56:03,974 [INFO] [batch_0072_384x384.parquet] Processed 1992 rows
106
+ 2026-02-01 10:57:09,742 [INFO] [batch_0073_384x384.parquet] Processed 1995 rows
107
+ 2026-02-01 10:58:22,365 [INFO] [batch_0074_384x384.parquet] Processed 1992 rows
108
+ 2026-02-01 10:59:33,712 [INFO] [batch_0075_384x384.parquet] Processed 1991 rows
109
+ 2026-02-01 11:00:48,387 [INFO] [batch_0076_384x384.parquet] Processed 1998 rows
110
+ 2026-02-01 11:01:47,919 [INFO] [batch_0077_384x384.parquet] Processed 1996 rows
111
+ 2026-02-01 11:03:01,336 [INFO] [batch_0078_384x384.parquet] Processed 1992 rows
112
+ 2026-02-01 11:04:04,437 [INFO] [batch_0079_384x384.parquet] Processed 1995 rows
113
+ 2026-02-01 11:05:15,344 [INFO] [batch_0080_384x384.parquet] Processed 1993 rows
114
+ 2026-02-01 11:06:26,434 [INFO] [batch_0081_384x384.parquet] Processed 1995 rows
115
+ 2026-02-01 11:07:29,500 [INFO] [batch_0082_384x384.parquet] Processed 1989 rows
116
+ 2026-02-01 11:08:41,452 [INFO] [batch_0083_384x384.parquet] Processed 1995 rows
117
+ 2026-02-01 11:09:52,372 [INFO] [batch_0084_384x384.parquet] Processed 1996 rows
118
+ 2026-02-01 11:10:54,102 [INFO] [batch_0085_384x384.parquet] Processed 1997 rows
119
+ 2026-02-01 11:12:05,011 [INFO] [batch_0086_384x384.parquet] Processed 1996 rows
120
+ 2026-02-01 11:13:18,046 [INFO] [batch_0087_384x384.parquet] Processed 1994 rows
121
+ 2026-02-01 11:14:28,554 [INFO] [batch_0088_384x384.parquet] Processed 1992 rows
122
+ 2026-02-01 11:15:30,371 [INFO] [batch_0089_384x384.parquet] Processed 1993 rows
123
+ 2026-02-01 11:16:36,098 [INFO] [batch_0090_384x384.parquet] Processed 1993 rows
124
+ 2026-02-01 11:17:47,559 [INFO] [batch_0091_384x384.parquet] Processed 1995 rows
125
+ 2026-02-01 11:18:59,181 [INFO] [batch_0092_384x384.parquet] Processed 1994 rows
126
+ 2026-02-01 11:20:10,040 [INFO] [batch_0093_384x384.parquet] Processed 1998 rows
127
+ 2026-02-01 11:21:11,780 [INFO] [batch_0094_384x384.parquet] Processed 1993 rows
128
+ 2026-02-01 11:22:13,323 [INFO] [batch_0095_384x384.parquet] Processed 1995 rows
129
+ 2026-02-01 11:23:13,963 [INFO] [batch_0096_384x384.parquet] Processed 1997 rows
130
+ 2026-02-01 11:24:11,380 [INFO] [batch_0097_384x384.parquet] Processed 1990 rows
131
+ 2026-02-01 11:25:16,113 [INFO] [batch_0098_384x384.parquet] Processed 1995 rows
132
+ 2026-02-01 11:26:15,319 [INFO] [batch_0099_384x384.parquet] Processed 1992 rows
133
+ 2026-02-01 11:27:09,846 [INFO] [batch_0100_384x384.parquet] Processed 1993 rows
134
+ 2026-02-01 11:28:13,634 [INFO] [batch_0101_384x384.parquet] Processed 1994 rows
135
+ 2026-02-01 11:29:19,508 [INFO] [batch_0102_384x384.parquet] Processed 1991 rows
136
+ 2026-02-01 11:30:27,321 [INFO] [batch_0103_384x384.parquet] Processed 1990 rows
137
+ 2026-02-01 11:31:38,038 [INFO] [batch_0104_384x384.parquet] Processed 1995 rows
138
+ 2026-02-01 11:32:55,342 [INFO] [batch_0105_384x384.parquet] Processed 1993 rows
139
+ 2026-02-01 11:34:02,868 [INFO] [batch_0106_384x384.parquet] Processed 1988 rows
140
+ 2026-02-01 11:35:08,481 [INFO] [batch_0107_384x384.parquet] Processed 1996 rows
141
+ 2026-02-01 11:36:17,025 [INFO] [batch_0108_384x384.parquet] Processed 1992 rows
142
+ 2026-02-01 11:37:26,799 [INFO] [batch_0109_384x384.parquet] Processed 1993 rows
143
+ 2026-02-01 11:38:39,274 [INFO] [batch_0110_384x384.parquet] Processed 1996 rows
144
+ 2026-02-01 11:39:49,743 [INFO] [batch_0111_384x384.parquet] Processed 1991 rows
145
+ 2026-02-01 11:40:47,923 [INFO] [batch_0112_384x384.parquet] Processed 1994 rows
146
+ 2026-02-01 11:41:53,376 [INFO] [batch_0113_384x384.parquet] Processed 1996 rows
147
+ 2026-02-01 11:42:53,847 [INFO] [batch_0114_384x384.parquet] Processed 1997 rows
148
+ 2026-02-01 11:43:47,456 [INFO] [batch_0115_384x384.parquet] Processed 1997 rows
149
+ 2026-02-01 11:44:47,188 [INFO] [batch_0116_384x384.parquet] Processed 1992 rows
150
+ 2026-02-01 11:45:44,350 [INFO] [batch_0117_384x384.parquet] Processed 1992 rows
151
+ 2026-02-01 11:46:51,765 [INFO] [batch_0118_384x384.parquet] Processed 1993 rows
152
+ 2026-02-01 11:47:54,777 [INFO] [batch_0119_384x384.parquet] Processed 1998 rows
153
+ 2026-02-01 11:48:58,907 [INFO] [batch_0120_384x384.parquet] Processed 1995 rows
154
+ 2026-02-01 11:49:59,917 [INFO] [batch_0121_384x384.parquet] Processed 1995 rows
155
+ 2026-02-01 11:51:00,476 [INFO] [batch_0122_384x384.parquet] Processed 1992 rows
156
+ 2026-02-01 11:52:05,414 [INFO] [batch_0123_384x384.parquet] Processed 1994 rows
157
+ 2026-02-01 11:53:06,075 [INFO] [batch_0124_384x384.parquet] Processed 1995 rows
158
+ 2026-02-01 11:53:54,915 [INFO] [batch_0125_384x384.parquet] Processed 1505 rows
159
+ 2026-02-01 11:53:54,915 [INFO] Merging all results...
160
+ 2026-02-01 11:53:54,970 [INFO] Final columns: ['grid_cell', 'grid_row_u', 'grid_col_r', 'product_id', 'timestamp', 'centre_lat', 'centre_lon', 'utm_crs', 'parquet_url', 'parquet_row', 'pixel_bbox', 'embedding']
161
+ 2026-02-01 11:53:54,971 [INFO] ✓ All unwanted columns removed
162
+ 2026-02-01 11:53:54,971 [INFO] ✓ Column 'crs' renamed to 'utm_crs'
163
+ 2026-02-01 11:53:54,971 [INFO] ✓ Column 'pixel_bbox' added
164
+ 2026-02-01 11:53:54,971 [INFO] Saving to /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/dinov2/DINOv2_crop_384x384.parquet...
165
+ 2026-02-01 11:54:03,559 [INFO] ================================================================================
166
+ 2026-02-01 11:54:03,559 [INFO] Processing complete!
167
+ 2026-02-01 11:54:03,559 [INFO] Total rows: 248,719
168
+ 2026-02-01 11:54:03,560 [INFO] Embedding dimension: 1024
169
+ 2026-02-01 11:54:03,560 [INFO] Output file: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/dinov2/DINOv2_crop_384x384.parquet
170
+ 2026-02-01 11:54:03,560 [INFO] ================================================================================
logs/compute_embeddings_farslip.log ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-01 09:54:48,604 [INFO] ================================================================================
2
+ 2026-02-01 09:54:48,605 [INFO] Computing FARSLIP embeddings
3
+ 2026-02-01 09:54:48,605 [INFO] Timestamp: 2026-02-01T09:54:48.605134
4
+ 2026-02-01 09:54:48,605 [INFO] Device: cuda:4
5
+ 2026-02-01 09:54:48,605 [INFO] ================================================================================
6
+ 2026-02-01 09:54:48,606 [INFO] Batch size: 64
7
+ 2026-02-01 09:54:48,607 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/farslip/FarSLIP_crop_384x384.parquet
8
+ 2026-02-01 09:54:48,607 [INFO] Loading farslip model...
9
+ 2026-02-01 09:54:48,613 [INFO] Loaded ViT-B-16 model config.
10
+ 2026-02-01 09:54:50,536 [INFO] Loading pretrained ViT-B-16 weights (/data1/zyj/checkpoints/FarSLIP/FarSLIP2_ViT-B-16.pt).
11
+ 2026-02-01 09:54:51,666 [INFO] Missing keys: []
12
+ 2026-02-01 09:54:51,745 [INFO] FarSLIP model loaded on cuda:4
13
+ 2026-02-01 09:54:51,745 [INFO] Found 125 input files
14
+ 2026-02-01 09:55:38,785 [INFO] [batch_0001_384x384.parquet] Processed 1996 rows
15
+ 2026-02-01 09:56:18,239 [INFO] [batch_0002_384x384.parquet] Processed 1990 rows
16
+ 2026-02-01 09:57:17,259 [INFO] [batch_0003_384x384.parquet] Processed 1997 rows
17
+ 2026-02-01 09:58:08,339 [INFO] [batch_0004_384x384.parquet] Processed 1992 rows
18
+ 2026-02-01 09:59:00,302 [INFO] [batch_0005_384x384.parquet] Processed 1992 rows
19
+ 2026-02-01 10:00:15,416 [INFO] [batch_0006_384x384.parquet] Processed 1992 rows
20
+ 2026-02-01 10:01:22,601 [INFO] [batch_0007_384x384.parquet] Processed 1998 rows
21
+ 2026-02-01 10:02:25,131 [INFO] [batch_0008_384x384.parquet] Processed 1994 rows
22
+ 2026-02-01 10:03:31,735 [INFO] [batch_0009_384x384.parquet] Processed 1993 rows
23
+ 2026-02-01 10:04:47,342 [INFO] [batch_0010_384x384.parquet] Processed 1993 rows
24
+ 2026-02-01 10:05:54,617 [INFO] [batch_0011_384x384.parquet] Processed 1990 rows
25
+ 2026-02-01 10:06:58,372 [INFO] [batch_0012_384x384.parquet] Processed 1993 rows
26
+ 2026-02-01 10:08:16,301 [INFO] [batch_0013_384x384.parquet] Processed 1993 rows
27
+ 2026-02-01 10:09:11,722 [INFO] [batch_0014_384x384.parquet] Processed 1991 rows
28
+ 2026-02-01 10:10:23,603 [INFO] [batch_0015_384x384.parquet] Processed 1993 rows
29
+ 2026-02-01 10:11:38,047 [INFO] [batch_0016_384x384.parquet] Processed 1990 rows
30
+ 2026-02-01 10:12:22,943 [INFO] [batch_0017_384x384.parquet] Processed 1991 rows
31
+ 2026-02-01 10:13:41,095 [INFO] [batch_0018_384x384.parquet] Processed 1991 rows
32
+ 2026-02-01 10:14:47,596 [INFO] [batch_0019_384x384.parquet] Processed 1996 rows
33
+ 2026-02-01 10:15:40,983 [INFO] [batch_0020_384x384.parquet] Processed 1994 rows
34
+ 2026-02-01 10:16:52,878 [INFO] [batch_0021_384x384.parquet] Processed 1996 rows
35
+ 2026-02-01 10:17:43,460 [INFO] [batch_0022_384x384.parquet] Processed 1995 rows
36
+ 2026-02-01 10:18:41,479 [INFO] [batch_0023_384x384.parquet] Processed 1992 rows
37
+ 2026-02-01 10:19:40,728 [INFO] [batch_0024_384x384.parquet] Processed 1989 rows
38
+ 2026-02-01 10:20:25,503 [INFO] [batch_0025_384x384.parquet] Processed 1993 rows
39
+ 2026-02-01 10:21:27,428 [INFO] [batch_0026_384x384.parquet] Processed 1995 rows
40
+ 2026-02-01 10:22:23,776 [INFO] [batch_0027_384x384.parquet] Processed 1997 rows
41
+ 2026-02-01 10:23:16,992 [INFO] [batch_0028_384x384.parquet] Processed 1991 rows
42
+ 2026-02-01 10:24:14,634 [INFO] [batch_0029_384x384.parquet] Processed 1992 rows
43
+ 2026-02-01 10:24:55,464 [INFO] [batch_0030_384x384.parquet] Processed 1993 rows
44
+ 2026-02-01 10:25:56,600 [INFO] [batch_0031_384x384.parquet] Processed 1988 rows
45
+ 2026-02-01 10:26:40,392 [INFO] [batch_0032_384x384.parquet] Processed 1994 rows
46
+ 2026-02-01 10:27:49,696 [INFO] [batch_0033_384x384.parquet] Processed 1992 rows
47
+ 2026-02-01 10:28:49,831 [INFO] [batch_0034_384x384.parquet] Processed 1994 rows
48
+ 2026-02-01 10:29:42,378 [INFO] [batch_0035_384x384.parquet] Processed 1994 rows
49
+ 2026-02-01 10:30:48,969 [INFO] [batch_0036_384x384.parquet] Processed 1996 rows
50
+ 2026-02-01 10:32:01,922 [INFO] [batch_0037_384x384.parquet] Processed 1995 rows
51
+ 2026-02-01 10:32:47,057 [INFO] [batch_0038_384x384.parquet] Processed 1993 rows
52
+ 2026-02-01 10:34:01,196 [INFO] [batch_0039_384x384.parquet] Processed 1989 rows
53
+ 2026-02-01 10:35:19,501 [INFO] [batch_0040_384x384.parquet] Processed 1990 rows
54
+ 2026-02-01 10:36:09,997 [INFO] [batch_0041_384x384.parquet] Processed 1998 rows
55
+ 2026-02-01 10:37:25,589 [INFO] [batch_0042_384x384.parquet] Processed 1997 rows
56
+ 2026-02-01 10:38:42,876 [INFO] [batch_0043_384x384.parquet] Processed 1988 rows
57
+ 2026-02-01 10:39:31,979 [INFO] [batch_0044_384x384.parquet] Processed 1991 rows
58
+ 2026-02-01 10:40:43,745 [INFO] [batch_0045_384x384.parquet] Processed 1993 rows
59
+ 2026-02-01 10:41:59,576 [INFO] [batch_0046_384x384.parquet] Processed 1994 rows
60
+ 2026-02-01 10:42:53,620 [INFO] [batch_0047_384x384.parquet] Processed 1995 rows
61
+ 2026-02-01 10:44:25,584 [INFO] [batch_0048_384x384.parquet] Processed 1992 rows
62
+ 2026-02-01 10:46:13,258 [INFO] [batch_0049_384x384.parquet] Processed 1996 rows
63
+ 2026-02-01 10:47:13,109 [INFO] [batch_0050_384x384.parquet] Processed 1991 rows
64
+ 2026-02-01 10:48:13,385 [INFO] [batch_0051_384x384.parquet] Processed 1997 rows
65
+ 2026-02-01 10:49:48,140 [INFO] [batch_0052_384x384.parquet] Processed 1993 rows
66
+ 2026-02-01 10:51:22,710 [INFO] [batch_0053_384x384.parquet] Processed 1995 rows
67
+ 2026-02-01 10:52:23,823 [INFO] [batch_0054_384x384.parquet] Processed 1997 rows
68
+ 2026-02-01 10:53:48,669 [INFO] [batch_0055_384x384.parquet] Processed 1995 rows
69
+ 2026-02-01 10:55:03,785 [INFO] [batch_0056_384x384.parquet] Processed 1997 rows
70
+ 2026-02-01 10:55:56,653 [INFO] [batch_0057_384x384.parquet] Processed 1991 rows
71
+ 2026-02-01 10:56:50,364 [INFO] [batch_0058_384x384.parquet] Processed 1994 rows
72
+ 2026-02-01 10:57:33,268 [INFO] [batch_0059_384x384.parquet] Processed 1993 rows
73
+ 2026-02-01 10:58:36,103 [INFO] [batch_0060_384x384.parquet] Processed 1995 rows
74
+ 2026-02-01 10:59:43,156 [INFO] [batch_0061_384x384.parquet] Processed 1995 rows
75
+ 2026-02-01 11:00:45,280 [INFO] [batch_0062_384x384.parquet] Processed 1998 rows
76
+ 2026-02-01 11:02:03,960 [INFO] [batch_0063_384x384.parquet] Processed 1997 rows
77
+ 2026-02-01 11:03:01,993 [INFO] [batch_0064_384x384.parquet] Processed 1992 rows
78
+ 2026-02-01 11:04:18,812 [INFO] [batch_0065_384x384.parquet] Processed 1994 rows
79
+ 2026-02-01 11:05:34,954 [INFO] [batch_0066_384x384.parquet] Processed 1992 rows
80
+ 2026-02-01 11:06:26,502 [INFO] [batch_0067_384x384.parquet] Processed 1993 rows
81
+ 2026-02-01 11:07:42,754 [INFO] [batch_0068_384x384.parquet] Processed 1994 rows
82
+ 2026-02-01 11:09:01,751 [INFO] [batch_0069_384x384.parquet] Processed 1992 rows
83
+ 2026-02-01 11:09:49,394 [INFO] [batch_0070_384x384.parquet] Processed 1997 rows
84
+ 2026-02-01 11:11:06,518 [INFO] [batch_0071_384x384.parquet] Processed 1996 rows
85
+ 2026-02-01 11:12:22,688 [INFO] [batch_0072_384x384.parquet] Processed 1992 rows
86
+ 2026-02-01 11:13:14,831 [INFO] [batch_0073_384x384.parquet] Processed 1995 rows
87
+ 2026-02-01 11:14:14,879 [INFO] [batch_0074_384x384.parquet] Processed 1992 rows
88
+ 2026-02-01 11:14:58,098 [INFO] [batch_0075_384x384.parquet] Processed 1991 rows
89
+ 2026-02-01 11:15:43,764 [INFO] [batch_0076_384x384.parquet] Processed 1998 rows
90
+ 2026-02-01 11:16:53,710 [INFO] [batch_0077_384x384.parquet] Processed 1996 rows
91
+ 2026-02-01 11:17:51,040 [INFO] [batch_0078_384x384.parquet] Processed 1992 rows
92
+ 2026-02-01 11:18:57,871 [INFO] [batch_0079_384x384.parquet] Processed 1995 rows
93
+ 2026-02-01 11:20:06,930 [INFO] [batch_0080_384x384.parquet] Processed 1993 rows
94
+ 2026-02-01 11:20:51,630 [INFO] [batch_0081_384x384.parquet] Processed 1995 rows
95
+ 2026-02-01 11:21:43,270 [INFO] [batch_0082_384x384.parquet] Processed 1989 rows
96
+ 2026-02-01 11:22:29,228 [INFO] [batch_0083_384x384.parquet] Processed 1995 rows
97
+ 2026-02-01 11:23:23,236 [INFO] [batch_0084_384x384.parquet] Processed 1996 rows
98
+ 2026-02-01 11:24:32,532 [INFO] [batch_0085_384x384.parquet] Processed 1997 rows
99
+ 2026-02-01 11:25:20,336 [INFO] [batch_0086_384x384.parquet] Processed 1996 rows
100
+ 2026-02-01 11:26:33,616 [INFO] [batch_0087_384x384.parquet] Processed 1994 rows
101
+ 2026-02-01 11:27:24,449 [INFO] [batch_0088_384x384.parquet] Processed 1992 rows
102
+ 2026-02-01 11:28:20,047 [INFO] [batch_0089_384x384.parquet] Processed 1993 rows
103
+ 2026-02-01 11:29:43,109 [INFO] [batch_0090_384x384.parquet] Processed 1993 rows
104
+ 2026-02-01 11:30:41,652 [INFO] [batch_0091_384x384.parquet] Processed 1995 rows
105
+ 2026-02-01 11:31:43,751 [INFO] [batch_0092_384x384.parquet] Processed 1994 rows
106
+ 2026-02-01 11:33:10,661 [INFO] [batch_0093_384x384.parquet] Processed 1998 rows
107
+ 2026-02-01 11:34:12,721 [INFO] [batch_0094_384x384.parquet] Processed 1993 rows
108
+ 2026-02-01 11:35:09,887 [INFO] [batch_0095_384x384.parquet] Processed 1995 rows
109
+ 2026-02-01 11:36:36,141 [INFO] [batch_0096_384x384.parquet] Processed 1997 rows
110
+ 2026-02-01 11:37:41,740 [INFO] [batch_0097_384x384.parquet] Processed 1990 rows
111
+ 2026-02-01 11:38:40,066 [INFO] [batch_0098_384x384.parquet] Processed 1995 rows
112
+ 2026-02-01 11:39:45,765 [INFO] [batch_0099_384x384.parquet] Processed 1992 rows
113
+ 2026-02-01 11:40:40,739 [INFO] [batch_0100_384x384.parquet] Processed 1993 rows
114
+ 2026-02-01 11:41:41,583 [INFO] [batch_0101_384x384.parquet] Processed 1994 rows
115
+ 2026-02-01 11:42:47,504 [INFO] [batch_0102_384x384.parquet] Processed 1991 rows
116
+ 2026-02-01 11:43:31,148 [INFO] [batch_0103_384x384.parquet] Processed 1990 rows
117
+ 2026-02-01 11:44:38,070 [INFO] [batch_0104_384x384.parquet] Processed 1995 rows
118
+ 2026-02-01 11:45:48,089 [INFO] [batch_0105_384x384.parquet] Processed 1993 rows
119
+ 2026-02-01 11:46:47,156 [INFO] [batch_0106_384x384.parquet] Processed 1988 rows
120
+ 2026-02-01 11:48:06,340 [INFO] [batch_0107_384x384.parquet] Processed 1996 rows
121
+ 2026-02-01 11:49:08,016 [INFO] [batch_0108_384x384.parquet] Processed 1992 rows
122
+ 2026-02-01 11:50:27,665 [INFO] [batch_0109_384x384.parquet] Processed 1993 rows
123
+ 2026-02-01 11:51:38,073 [INFO] [batch_0110_384x384.parquet] Processed 1996 rows
124
+ 2026-02-01 11:52:26,956 [INFO] [batch_0111_384x384.parquet] Processed 1991 rows
125
+ 2026-02-01 11:53:44,395 [INFO] [batch_0112_384x384.parquet] Processed 1994 rows
126
+ 2026-02-01 11:54:23,803 [INFO] [batch_0113_384x384.parquet] Processed 1996 rows
127
+ 2026-02-01 11:55:07,867 [INFO] [batch_0114_384x384.parquet] Processed 1997 rows
128
+ 2026-02-01 11:55:54,834 [INFO] [batch_0115_384x384.parquet] Processed 1997 rows
129
+ 2026-02-01 11:56:36,849 [INFO] [batch_0116_384x384.parquet] Processed 1992 rows
130
+ 2026-02-01 11:57:20,506 [INFO] [batch_0117_384x384.parquet] Processed 1992 rows
131
+ 2026-02-01 11:57:58,985 [INFO] [batch_0118_384x384.parquet] Processed 1993 rows
132
+ 2026-02-01 11:58:38,965 [INFO] [batch_0119_384x384.parquet] Processed 1998 rows
133
+ 2026-02-01 11:59:16,459 [INFO] [batch_0120_384x384.parquet] Processed 1995 rows
134
+ 2026-02-01 11:59:55,497 [INFO] [batch_0121_384x384.parquet] Processed 1995 rows
135
+ 2026-02-01 12:00:33,857 [INFO] [batch_0122_384x384.parquet] Processed 1992 rows
136
+ 2026-02-01 12:01:15,871 [INFO] [batch_0123_384x384.parquet] Processed 1994 rows
137
+ 2026-02-01 12:01:53,537 [INFO] [batch_0124_384x384.parquet] Processed 1995 rows
138
+ 2026-02-01 12:02:22,334 [INFO] [batch_0125_384x384.parquet] Processed 1505 rows
139
+ 2026-02-01 12:02:22,334 [INFO] Merging all results...
140
+ 2026-02-01 12:02:22,384 [INFO] Final columns: ['grid_cell', 'grid_row_u', 'grid_col_r', 'product_id', 'timestamp', 'centre_lat', 'centre_lon', 'utm_crs', 'parquet_url', 'parquet_row', 'pixel_bbox', 'embedding']
141
+ 2026-02-01 12:02:22,384 [INFO] ✓ All unwanted columns removed
142
+ 2026-02-01 12:02:22,384 [INFO] ✓ Column 'crs' renamed to 'utm_crs'
143
+ 2026-02-01 12:02:22,384 [INFO] ✓ Column 'pixel_bbox' added
144
+ 2026-02-01 12:02:22,384 [INFO] Saving to /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/farslip/FarSLIP_crop_384x384.parquet...
145
+ 2026-02-01 12:02:25,588 [INFO] ================================================================================
146
+ 2026-02-01 12:02:25,588 [INFO] Processing complete!
147
+ 2026-02-01 12:02:25,588 [INFO] Total rows: 248,719
148
+ 2026-02-01 12:02:25,589 [INFO] Embedding dimension: 512
149
+ 2026-02-01 12:02:25,589 [INFO] Output file: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/farslip/FarSLIP_crop_384x384.parquet
150
+ 2026-02-01 12:02:25,589 [INFO] ================================================================================
logs/compute_embeddings_satclip.log ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-01 09:09:57,720 [INFO] ================================================================================
2
+ 2026-02-01 09:09:57,720 [INFO] Computing SATCLIP embeddings
3
+ 2026-02-01 09:09:57,720 [INFO] Timestamp: 2026-02-01T09:09:57.720447
4
+ 2026-02-01 09:09:57,720 [INFO] Device: cuda:1
5
+ 2026-02-01 09:09:57,720 [INFO] ================================================================================
6
+ 2026-02-01 09:09:57,721 [INFO] Batch size: 128
7
+ 2026-02-01 09:09:57,721 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/satclip/SatCLIP_crop_384x384.parquet
8
+ 2026-02-01 09:09:57,721 [INFO] Loading satclip model...
9
+ 2026-02-01 09:09:57,727 [INFO] SatCLIP-MS model loaded on cuda:1
10
+ 2026-02-01 09:09:57,728 [INFO] Found 1 input files
11
+ 2026-02-01 09:10:21,830 [WARNING] No valid embeddings for batch_0001_384x384.parquet
12
+ 2026-02-01 09:10:22,107 [ERROR] No data processed!
13
+ 2026-02-01 09:39:17,993 [INFO] ================================================================================
14
+ 2026-02-01 09:39:17,993 [INFO] Computing SATCLIP embeddings
15
+ 2026-02-01 09:39:17,993 [INFO] Timestamp: 2026-02-01T09:39:17.993775
16
+ 2026-02-01 09:39:17,993 [INFO] Device: cuda:1
17
+ 2026-02-01 09:39:17,993 [INFO] ================================================================================
18
+ 2026-02-01 09:39:17,994 [INFO] Batch size: 128
19
+ 2026-02-01 09:39:17,994 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/satclip/SatCLIP_crop_384x384.parquet
20
+ 2026-02-01 09:39:17,994 [INFO] Loading satclip model...
21
+ 2026-02-01 09:39:20,179 [INFO] SatCLIP-MS model loaded on cuda:1
22
+ 2026-02-01 09:39:20,180 [INFO] Found 1 input files
23
+ 2026-02-01 09:40:01,084 [INFO] [batch_0001_384x384.parquet] Processed 1996 rows
24
+ 2026-02-01 09:40:01,084 [INFO] Merging all results...
25
+ 2026-02-01 09:40:01,085 [INFO] Final columns: ['grid_cell', 'grid_row_u', 'grid_col_r', 'product_id', 'timestamp', 'centre_lat', 'centre_lon', 'utm_crs', 'parquet_url', 'parquet_row', 'pixel_bbox', 'embedding']
26
+ 2026-02-01 09:40:01,085 [INFO] ✓ All unwanted columns removed
27
+ 2026-02-01 09:40:01,085 [INFO] ✓ Column 'crs' renamed to 'utm_crs'
28
+ 2026-02-01 09:40:01,085 [INFO] ✓ Column 'pixel_bbox' added
29
+ 2026-02-01 09:40:01,085 [INFO] Saving to /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/satclip/SatCLIP_crop_384x384.parquet...
30
+ 2026-02-01 09:40:01,134 [INFO] ================================================================================
31
+ 2026-02-01 09:40:01,134 [INFO] Processing complete!
32
+ 2026-02-01 09:40:01,134 [INFO] Total rows: 1,996
33
+ 2026-02-01 09:40:01,134 [INFO] Embedding dimension: 256
34
+ 2026-02-01 09:40:01,134 [INFO] Output file: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/satclip/SatCLIP_crop_384x384.parquet
35
+ 2026-02-01 09:40:01,134 [INFO] ================================================================================
36
+ 2026-02-01 09:43:19,666 [INFO] ================================================================================
37
+ 2026-02-01 09:43:19,666 [INFO] Computing SATCLIP embeddings
38
+ 2026-02-01 09:43:19,666 [INFO] Timestamp: 2026-02-01T09:43:19.666577
39
+ 2026-02-01 09:43:19,666 [INFO] Device: cuda:3
40
+ 2026-02-01 09:43:19,666 [INFO] ================================================================================
41
+ 2026-02-01 09:43:19,668 [INFO] Batch size: 128
42
+ 2026-02-01 09:43:19,668 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/satclip/SatCLIP_crop_384x384.parquet
43
+ 2026-02-01 09:43:19,668 [INFO] Loading satclip model...
44
+ 2026-02-01 09:43:21,344 [INFO] SatCLIP-MS model loaded on cuda:3
45
+ 2026-02-01 09:43:21,345 [INFO] Found 125 input files
46
+ 2026-02-01 09:44:03,000 [INFO] [batch_0001_384x384.parquet] Processed 1996 rows
47
+ 2026-02-01 09:44:46,041 [INFO] [batch_0002_384x384.parquet] Processed 1990 rows
48
+ 2026-02-01 09:45:27,652 [INFO] [batch_0003_384x384.parquet] Processed 1997 rows
49
+ 2026-02-01 09:46:15,446 [INFO] [batch_0004_384x384.parquet] Processed 1992 rows
50
+ 2026-02-01 09:47:09,769 [INFO] [batch_0005_384x384.parquet] Processed 1992 rows
51
+ 2026-02-01 09:47:59,773 [INFO] [batch_0006_384x384.parquet] Processed 1992 rows
52
+ 2026-02-01 09:48:51,057 [INFO] [batch_0007_384x384.parquet] Processed 1998 rows
53
+ 2026-02-01 09:49:34,202 [INFO] [batch_0008_384x384.parquet] Processed 1994 rows
54
+ 2026-02-01 09:50:25,944 [INFO] [batch_0009_384x384.parquet] Processed 1993 rows
55
+ 2026-02-01 09:51:09,586 [INFO] [batch_0010_384x384.parquet] Processed 1993 rows
56
+ 2026-02-01 09:51:56,545 [INFO] [batch_0011_384x384.parquet] Processed 1990 rows
57
+ 2026-02-01 09:52:44,526 [INFO] [batch_0012_384x384.parquet] Processed 1993 rows
58
+ 2026-02-01 09:53:32,729 [INFO] [batch_0013_384x384.parquet] Processed 1993 rows
59
+ 2026-02-01 09:54:14,312 [INFO] [batch_0014_384x384.parquet] Processed 1991 rows
60
+ 2026-02-01 09:55:05,975 [INFO] [batch_0015_384x384.parquet] Processed 1993 rows
61
+ 2026-02-01 09:55:57,268 [INFO] [batch_0016_384x384.parquet] Processed 1990 rows
62
+ 2026-02-01 09:57:00,591 [INFO] [batch_0017_384x384.parquet] Processed 1991 rows
63
+ 2026-02-01 09:57:48,464 [INFO] [batch_0018_384x384.parquet] Processed 1991 rows
64
+ 2026-02-01 09:58:52,420 [INFO] [batch_0019_384x384.parquet] Processed 1996 rows
65
+ 2026-02-01 10:00:04,202 [INFO] [batch_0020_384x384.parquet] Processed 1994 rows
66
+ 2026-02-01 10:01:10,309 [INFO] [batch_0021_384x384.parquet] Processed 1996 rows
67
+ 2026-02-01 10:02:15,265 [INFO] [batch_0022_384x384.parquet] Processed 1995 rows
68
+ 2026-02-01 10:03:31,554 [INFO] [batch_0023_384x384.parquet] Processed 1992 rows
69
+ 2026-02-01 10:04:40,240 [INFO] [batch_0024_384x384.parquet] Processed 1989 rows
70
+ 2026-02-01 10:05:55,812 [INFO] [batch_0025_384x384.parquet] Processed 1993 rows
71
+ 2026-02-01 10:07:00,366 [INFO] [batch_0026_384x384.parquet] Processed 1995 rows
72
+ 2026-02-01 10:08:10,532 [INFO] [batch_0027_384x384.parquet] Processed 1997 rows
73
+ 2026-02-01 10:09:11,505 [INFO] [batch_0028_384x384.parquet] Processed 1991 rows
74
+ 2026-02-01 10:10:21,951 [INFO] [batch_0029_384x384.parquet] Processed 1992 rows
75
+ 2026-02-01 10:11:30,988 [INFO] [batch_0030_384x384.parquet] Processed 1993 rows
76
+ 2026-02-01 10:12:26,034 [INFO] [batch_0031_384x384.parquet] Processed 1988 rows
77
+ 2026-02-01 10:13:36,732 [INFO] [batch_0032_384x384.parquet] Processed 1994 rows
78
+ 2026-02-01 10:14:36,787 [INFO] [batch_0033_384x384.parquet] Processed 1992 rows
79
+ 2026-02-01 10:15:36,921 [INFO] [batch_0034_384x384.parquet] Processed 1994 rows
80
+ 2026-02-01 10:16:38,623 [INFO] [batch_0035_384x384.parquet] Processed 1994 rows
81
+ 2026-02-01 10:17:27,583 [INFO] [batch_0036_384x384.parquet] Processed 1996 rows
82
+ 2026-02-01 10:18:29,976 [INFO] [batch_0037_384x384.parquet] Processed 1995 rows
83
+ 2026-02-01 10:19:26,843 [INFO] [batch_0038_384x384.parquet] Processed 1993 rows
84
+ 2026-02-01 10:20:14,532 [INFO] [batch_0039_384x384.parquet] Processed 1989 rows
85
+ 2026-02-01 10:21:13,694 [INFO] [batch_0040_384x384.parquet] Processed 1990 rows
86
+ 2026-02-01 10:22:05,858 [INFO] [batch_0041_384x384.parquet] Processed 1998 rows
87
+ 2026-02-01 10:23:04,226 [INFO] [batch_0042_384x384.parquet] Processed 1997 rows
88
+ 2026-02-01 10:23:56,641 [INFO] [batch_0043_384x384.parquet] Processed 1988 rows
89
+ 2026-02-01 10:24:38,594 [INFO] [batch_0044_384x384.parquet] Processed 1991 rows
90
+ 2026-02-01 10:25:42,517 [INFO] [batch_0045_384x384.parquet] Processed 1993 rows
91
+ 2026-02-01 10:26:23,732 [INFO] [batch_0046_384x384.parquet] Processed 1994 rows
92
+ 2026-02-01 10:27:39,298 [INFO] [batch_0047_384x384.parquet] Processed 1995 rows
93
+ 2026-02-01 10:28:34,546 [INFO] [batch_0048_384x384.parquet] Processed 1992 rows
94
+ 2026-02-01 10:29:35,568 [INFO] [batch_0049_384x384.parquet] Processed 1996 rows
95
+ 2026-02-01 10:30:38,004 [INFO] [batch_0050_384x384.parquet] Processed 1991 rows
96
+ 2026-02-01 10:31:50,544 [INFO] [batch_0051_384x384.parquet] Processed 1997 rows
97
+ 2026-02-01 10:32:38,165 [INFO] [batch_0052_384x384.parquet] Processed 1993 rows
98
+ 2026-02-01 10:33:54,330 [INFO] [batch_0053_384x384.parquet] Processed 1995 rows
99
+ 2026-02-01 10:35:11,070 [INFO] [batch_0054_384x384.parquet] Processed 1997 rows
100
+ 2026-02-01 10:36:06,495 [INFO] [batch_0055_384x384.parquet] Processed 1995 rows
101
+ 2026-02-01 10:37:26,449 [INFO] [batch_0056_384x384.parquet] Processed 1997 rows
102
+ 2026-02-01 10:38:40,433 [INFO] [batch_0057_384x384.parquet] Processed 1991 rows
103
+ 2026-02-01 10:39:36,229 [INFO] [batch_0058_384x384.parquet] Processed 1994 rows
104
+ 2026-02-01 10:40:50,558 [INFO] [batch_0059_384x384.parquet] Processed 1993 rows
105
+ 2026-02-01 10:42:00,100 [INFO] [batch_0060_384x384.parquet] Processed 1995 rows
106
+ 2026-02-01 10:42:53,440 [INFO] [batch_0061_384x384.parquet] Processed 1995 rows
107
+ 2026-02-01 10:44:21,706 [INFO] [batch_0062_384x384.parquet] Processed 1998 rows
108
+ 2026-02-01 10:45:56,656 [INFO] [batch_0063_384x384.parquet] Processed 1997 rows
109
+ 2026-02-01 10:46:53,942 [INFO] [batch_0064_384x384.parquet] Processed 1992 rows
110
+ 2026-02-01 10:47:47,760 [INFO] [batch_0065_384x384.parquet] Processed 1994 rows
111
+ 2026-02-01 10:48:37,571 [INFO] [batch_0066_384x384.parquet] Processed 1992 rows
112
+ 2026-02-01 10:50:00,819 [INFO] [batch_0067_384x384.parquet] Processed 1993 rows
113
+ 2026-02-01 10:51:30,799 [INFO] [batch_0068_384x384.parquet] Processed 1994 rows
114
+ 2026-02-01 10:52:28,413 [INFO] [batch_0069_384x384.parquet] Processed 1992 rows
115
+ 2026-02-01 10:53:50,597 [INFO] [batch_0070_384x384.parquet] Processed 1997 rows
116
+ 2026-02-01 10:55:01,173 [INFO] [batch_0071_384x384.parquet] Processed 1996 rows
117
+ 2026-02-01 10:56:03,395 [INFO] [batch_0072_384x384.parquet] Processed 1992 rows
118
+ 2026-02-01 10:57:10,601 [INFO] [batch_0073_384x384.parquet] Processed 1995 rows
119
+ 2026-02-01 10:58:22,789 [INFO] [batch_0074_384x384.parquet] Processed 1992 rows
120
+ 2026-02-01 10:59:39,697 [INFO] [batch_0075_384x384.parquet] Processed 1991 rows
121
+ 2026-02-01 11:00:48,962 [INFO] [batch_0076_384x384.parquet] Processed 1998 rows
122
+ 2026-02-01 11:01:59,729 [INFO] [batch_0077_384x384.parquet] Processed 1996 rows
123
+ 2026-02-01 11:03:01,575 [INFO] [batch_0078_384x384.parquet] Processed 1992 rows
124
+ 2026-02-01 11:04:15,721 [INFO] [batch_0079_384x384.parquet] Processed 1995 rows
125
+ 2026-02-01 11:05:26,147 [INFO] [batch_0080_384x384.parquet] Processed 1993 rows
126
+ 2026-02-01 11:06:21,742 [INFO] [batch_0081_384x384.parquet] Processed 1995 rows
127
+ 2026-02-01 11:07:34,071 [INFO] [batch_0082_384x384.parquet] Processed 1989 rows
128
+ 2026-02-01 11:08:51,443 [INFO] [batch_0083_384x384.parquet] Processed 1995 rows
129
+ 2026-02-01 11:09:45,289 [INFO] [batch_0084_384x384.parquet] Processed 1996 rows
130
+ 2026-02-01 11:10:59,507 [INFO] [batch_0085_384x384.parquet] Processed 1997 rows
131
+ 2026-02-01 11:12:12,671 [INFO] [batch_0086_384x384.parquet] Processed 1996 rows
132
+ 2026-02-01 11:13:16,945 [INFO] [batch_0087_384x384.parquet] Processed 1994 rows
133
+ 2026-02-01 11:14:26,324 [INFO] [batch_0088_384x384.parquet] Processed 1992 rows
134
+ 2026-02-01 11:15:25,871 [INFO] [batch_0089_384x384.parquet] Processed 1993 rows
135
+ 2026-02-01 11:16:43,653 [INFO] [batch_0090_384x384.parquet] Processed 1993 rows
136
+ 2026-02-01 11:17:52,205 [INFO] [batch_0091_384x384.parquet] Processed 1995 rows
137
+ 2026-02-01 11:19:02,073 [INFO] [batch_0092_384x384.parquet] Processed 1994 rows
138
+ 2026-02-01 11:20:14,843 [INFO] [batch_0093_384x384.parquet] Processed 1998 rows
139
+ 2026-02-01 11:21:09,193 [INFO] [batch_0094_384x384.parquet] Processed 1993 rows
140
+ 2026-02-01 11:22:03,303 [INFO] [batch_0095_384x384.parquet] Processed 1995 rows
141
+ 2026-02-01 11:23:12,708 [INFO] [batch_0096_384x384.parquet] Processed 1997 rows
142
+ 2026-02-01 11:24:18,831 [INFO] [batch_0097_384x384.parquet] Processed 1990 rows
143
+ 2026-02-01 11:25:07,701 [INFO] [batch_0098_384x384.parquet] Processed 1995 rows
144
+ 2026-02-01 11:26:18,306 [INFO] [batch_0099_384x384.parquet] Processed 1992 rows
145
+ 2026-02-01 11:27:02,698 [INFO] [batch_0100_384x384.parquet] Processed 1993 rows
146
+ 2026-02-01 11:28:08,644 [INFO] [batch_0101_384x384.parquet] Processed 1994 rows
147
+ 2026-02-01 11:29:33,678 [INFO] [batch_0102_384x384.parquet] Processed 1991 rows
148
+ 2026-02-01 11:30:25,760 [INFO] [batch_0103_384x384.parquet] Processed 1990 rows
149
+ 2026-02-01 11:31:38,365 [INFO] [batch_0104_384x384.parquet] Processed 1995 rows
150
+ 2026-02-01 11:33:06,206 [INFO] [batch_0105_384x384.parquet] Processed 1993 rows
151
+ 2026-02-01 11:33:59,497 [INFO] [batch_0106_384x384.parquet] Processed 1988 rows
152
+ 2026-02-01 11:35:04,565 [INFO] [batch_0107_384x384.parquet] Processed 1996 rows
153
+ 2026-02-01 11:36:30,898 [INFO] [batch_0108_384x384.parquet] Processed 1992 rows
154
+ 2026-02-01 11:37:34,766 [INFO] [batch_0109_384x384.parquet] Processed 1993 rows
155
+ 2026-02-01 11:38:36,780 [INFO] [batch_0110_384x384.parquet] Processed 1996 rows
156
+ 2026-02-01 11:39:53,826 [INFO] [batch_0111_384x384.parquet] Processed 1991 rows
157
+ 2026-02-01 11:40:48,014 [INFO] [batch_0112_384x384.parquet] Processed 1994 rows
158
+ 2026-02-01 11:41:49,113 [INFO] [batch_0113_384x384.parquet] Processed 1996 rows
159
+ 2026-02-01 11:42:56,188 [INFO] [batch_0114_384x384.parquet] Processed 1997 rows
160
+ 2026-02-01 11:43:43,288 [INFO] [batch_0115_384x384.parquet] Processed 1997 rows
161
+ 2026-02-01 11:44:48,748 [INFO] [batch_0116_384x384.parquet] Processed 1992 rows
162
+ 2026-02-01 11:45:54,394 [INFO] [batch_0117_384x384.parquet] Processed 1992 rows
163
+ 2026-02-01 11:46:53,275 [INFO] [batch_0118_384x384.parquet] Processed 1993 rows
164
+ 2026-02-01 11:48:08,611 [INFO] [batch_0119_384x384.parquet] Processed 1998 rows
165
+ 2026-02-01 11:49:07,195 [INFO] [batch_0120_384x384.parquet] Processed 1995 rows
166
+ 2026-02-01 11:50:22,347 [INFO] [batch_0121_384x384.parquet] Processed 1995 rows
167
+ 2026-02-01 11:51:26,391 [INFO] [batch_0122_384x384.parquet] Processed 1992 rows
168
+ 2026-02-01 11:52:22,734 [INFO] [batch_0123_384x384.parquet] Processed 1994 rows
169
+ 2026-02-01 11:53:34,357 [INFO] [batch_0124_384x384.parquet] Processed 1995 rows
170
+ 2026-02-01 11:54:05,024 [INFO] [batch_0125_384x384.parquet] Processed 1505 rows
171
+ 2026-02-01 11:54:05,024 [INFO] Merging all results...
172
+ 2026-02-01 11:54:05,057 [INFO] Final columns: ['grid_cell', 'grid_row_u', 'grid_col_r', 'product_id', 'timestamp', 'centre_lat', 'centre_lon', 'utm_crs', 'parquet_url', 'parquet_row', 'pixel_bbox', 'embedding']
173
+ 2026-02-01 11:54:05,058 [INFO] ✓ All unwanted columns removed
174
+ 2026-02-01 11:54:05,058 [INFO] ✓ Column 'crs' renamed to 'utm_crs'
175
+ 2026-02-01 11:54:05,058 [INFO] ✓ Column 'pixel_bbox' added
176
+ 2026-02-01 11:54:05,058 [INFO] Saving to /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/satclip/SatCLIP_crop_384x384.parquet...
177
+ 2026-02-01 11:54:06,861 [INFO] ================================================================================
178
+ 2026-02-01 11:54:06,861 [INFO] Processing complete!
179
+ 2026-02-01 11:54:06,861 [INFO] Total rows: 248,719
180
+ 2026-02-01 11:54:06,862 [INFO] Embedding dimension: 256
181
+ 2026-02-01 11:54:06,862 [INFO] Output file: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/satclip/SatCLIP_crop_384x384.parquet
182
+ 2026-02-01 11:54:06,862 [INFO] ================================================================================
logs/compute_embeddings_siglip.log ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-01 09:43:14,001 [INFO] ================================================================================
2
+ 2026-02-01 09:43:14,002 [INFO] Computing SIGLIP embeddings
3
+ 2026-02-01 09:43:14,002 [INFO] Timestamp: 2026-02-01T09:43:14.002069
4
+ 2026-02-01 09:43:14,002 [INFO] Device: cuda:2
5
+ 2026-02-01 09:43:14,002 [INFO] ================================================================================
6
+ 2026-02-01 09:43:14,003 [INFO] Batch size: 64
7
+ 2026-02-01 09:43:14,003 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/siglip/SigLIP_crop_384x384.parquet
8
+ 2026-02-01 09:43:14,004 [INFO] Loading siglip model...
9
+ 2026-02-01 09:43:14,196 [INFO] Parsing model identifier. Schema: None, Identifier: ViT-SO400M-14-SigLIP-384
10
+ 2026-02-01 09:43:14,196 [INFO] Loaded built-in ViT-SO400M-14-SigLIP-384 model config.
11
+ 2026-02-01 09:43:14,197 [INFO] `pretrained` specifies file path: /data1/zyj/checkpoints/ViT-SO400M-14-SigLIP-384/open_clip_pytorch_model.bin
12
+ 2026-02-01 09:43:14,197 [INFO] Instantiating model architecture: CustomTextCLIP
13
+ 2026-02-01 09:43:22,955 [INFO] Loading full pretrained weights from: /data1/zyj/checkpoints/ViT-SO400M-14-SigLIP-384/open_clip_pytorch_model.bin
14
+ 2026-02-01 09:43:24,815 [INFO] Final image preprocessing configuration set: {'size': (384, 384), 'mode': 'RGB', 'mean': (0.48145466, 0.4578275, 0.40821073), 'std': (0.26862954, 0.26130258, 0.27577711), 'interpolation': 'bicubic', 'resize_mode': 'shortest', 'fill_color': 0}
15
+ 2026-02-01 09:43:24,815 [INFO] Model ViT-SO400M-14-SigLIP-384 creation process complete.
16
+ 2026-02-01 09:43:25,908 [INFO] SigLIP model loaded on cuda:2
17
+ 2026-02-01 09:43:25,909 [INFO] Found 125 input files
18
+ 2026-02-01 09:44:47,927 [INFO] [batch_0001_384x384.parquet] Processed 1996 rows
19
+ 2026-02-01 09:46:05,633 [INFO] [batch_0002_384x384.parquet] Processed 1990 rows
20
+ 2026-02-01 09:47:28,903 [INFO] [batch_0003_384x384.parquet] Processed 1997 rows
21
+ 2026-02-01 09:48:39,715 [INFO] [batch_0004_384x384.parquet] Processed 1992 rows
22
+ 2026-02-01 09:49:56,387 [INFO] [batch_0005_384x384.parquet] Processed 1992 rows
23
+ 2026-02-01 09:51:18,436 [INFO] [batch_0006_384x384.parquet] Processed 1992 rows
24
+ 2026-02-01 09:52:45,064 [INFO] [batch_0007_384x384.parquet] Processed 1998 rows
25
+ 2026-02-01 09:54:13,231 [INFO] [batch_0008_384x384.parquet] Processed 1994 rows
26
+ 2026-02-01 09:55:40,342 [INFO] ================================================================================
27
+ 2026-02-01 09:55:40,343 [INFO] Computing SIGLIP embeddings
28
+ 2026-02-01 09:55:40,343 [INFO] Timestamp: 2026-02-01T09:55:40.343045
29
+ 2026-02-01 09:55:40,343 [INFO] Device: cuda:2
30
+ 2026-02-01 09:55:40,343 [INFO] ================================================================================
31
+ 2026-02-01 09:55:40,344 [INFO] Batch size: 256
32
+ 2026-02-01 09:55:40,344 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/siglip/SigLIP_crop_384x384.parquet
33
+ 2026-02-01 09:55:40,344 [INFO] Loading siglip model...
34
+ 2026-02-01 09:55:40,494 [INFO] Parsing model identifier. Schema: None, Identifier: ViT-SO400M-14-SigLIP-384
35
+ 2026-02-01 09:55:40,494 [INFO] Loaded built-in ViT-SO400M-14-SigLIP-384 model config.
36
+ 2026-02-01 09:55:40,494 [INFO] `pretrained` specifies file path: /data1/zyj/checkpoints/ViT-SO400M-14-SigLIP-384/open_clip_pytorch_model.bin
37
+ 2026-02-01 09:55:40,494 [INFO] Instantiating model architecture: CustomTextCLIP
38
+ 2026-02-01 09:55:50,054 [INFO] Loading full pretrained weights from: /data1/zyj/checkpoints/ViT-SO400M-14-SigLIP-384/open_clip_pytorch_model.bin
39
+ 2026-02-01 09:55:52,457 [INFO] Final image preprocessing configuration set: {'size': (384, 384), 'mode': 'RGB', 'mean': (0.48145466, 0.4578275, 0.40821073), 'std': (0.26862954, 0.26130258, 0.27577711), 'interpolation': 'bicubic', 'resize_mode': 'shortest', 'fill_color': 0}
40
+ 2026-02-01 09:55:52,457 [INFO] Model ViT-SO400M-14-SigLIP-384 creation process complete.
41
+ 2026-02-01 09:55:53,533 [INFO] SigLIP model loaded on cuda:2
42
+ 2026-02-01 09:55:53,534 [INFO] Found 125 input files
43
+ 2026-02-01 09:57:15,361 [INFO] [batch_0001_384x384.parquet] Processed 1996 rows
44
+ 2026-02-01 09:58:38,916 [INFO] [batch_0002_384x384.parquet] Processed 1990 rows
45
+ 2026-02-01 10:00:13,289 [INFO] [batch_0003_384x384.parquet] Processed 1997 rows
46
+ 2026-02-01 10:01:38,351 [INFO] [batch_0004_384x384.parquet] Processed 1992 rows
47
+ 2026-02-01 10:03:13,561 [INFO] [batch_0005_384x384.parquet] Processed 1992 rows
48
+ 2026-02-01 10:04:55,295 [INFO] [batch_0006_384x384.parquet] Processed 1992 rows
49
+ 2026-02-01 10:06:42,957 [INFO] [batch_0007_384x384.parquet] Processed 1998 rows
50
+ 2026-02-01 10:08:27,547 [INFO] [batch_0008_384x384.parquet] Processed 1994 rows
51
+ 2026-02-01 10:10:15,515 [INFO] [batch_0009_384x384.parquet] Processed 1993 rows
52
+ 2026-02-01 10:11:54,632 [INFO] [batch_0010_384x384.parquet] Processed 1993 rows
53
+ 2026-02-01 10:13:42,862 [INFO] [batch_0011_384x384.parquet] Processed 1990 rows
54
+ 2026-02-01 10:15:23,412 [INFO] [batch_0012_384x384.parquet] Processed 1993 rows
55
+ 2026-02-01 10:16:55,431 [INFO] [batch_0013_384x384.parquet] Processed 1993 rows
56
+ 2026-02-01 10:18:30,326 [INFO] [batch_0014_384x384.parquet] Processed 1991 rows
57
+ 2026-02-01 10:19:54,738 [INFO] [batch_0015_384x384.parquet] Processed 1993 rows
58
+ 2026-02-01 10:21:25,001 [INFO] [batch_0016_384x384.parquet] Processed 1990 rows
59
+ 2026-02-01 10:23:00,423 [INFO] [batch_0017_384x384.parquet] Processed 1991 rows
60
+ 2026-02-01 10:24:21,837 [INFO] [batch_0018_384x384.parquet] Processed 1991 rows
61
+ 2026-02-01 10:26:00,517 [INFO] [batch_0019_384x384.parquet] Processed 1996 rows
62
+ 2026-02-01 10:27:39,553 [INFO] [batch_0020_384x384.parquet] Processed 1994 rows
63
+ 2026-02-01 10:29:02,772 [INFO] [batch_0021_384x384.parquet] Processed 1996 rows
64
+ 2026-02-01 10:30:43,286 [INFO] [batch_0022_384x384.parquet] Processed 1995 rows
65
+ 2026-02-01 10:32:18,498 [INFO] [batch_0023_384x384.parquet] Processed 1992 rows
66
+ 2026-02-01 10:33:59,552 [INFO] [batch_0024_384x384.parquet] Processed 1989 rows
67
+ 2026-02-01 10:35:36,652 [INFO] [batch_0025_384x384.parquet] Processed 1993 rows
68
+ 2026-02-01 10:37:22,505 [INFO] [batch_0026_384x384.parquet] Processed 1995 rows
69
+ 2026-02-01 10:39:04,911 [INFO] [batch_0027_384x384.parquet] Processed 1997 rows
70
+ 2026-02-01 10:40:47,184 [INFO] [batch_0028_384x384.parquet] Processed 1991 rows
71
+ 2026-02-01 10:42:27,627 [INFO] [batch_0029_384x384.parquet] Processed 1992 rows
72
+ 2026-02-01 10:43:40,600 [INFO] ================================================================================
73
+ 2026-02-01 10:43:40,600 [INFO] Computing SIGLIP embeddings
74
+ 2026-02-01 10:43:40,600 [INFO] Timestamp: 2026-02-01T10:43:40.600706
75
+ 2026-02-01 10:43:40,600 [INFO] Device: cuda:5
76
+ 2026-02-01 10:43:40,600 [INFO] ================================================================================
77
+ 2026-02-01 10:43:40,602 [INFO] Batch size: 64
78
+ 2026-02-01 10:43:40,602 [INFO] Output path: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/siglip/SigLIP_crop_384x384.parquet
79
+ 2026-02-01 10:43:40,602 [INFO] Loading siglip model...
80
+ 2026-02-01 10:43:40,778 [INFO] Parsing model identifier. Schema: None, Identifier: ViT-SO400M-14-SigLIP-384
81
+ 2026-02-01 10:43:40,778 [INFO] Loaded built-in ViT-SO400M-14-SigLIP-384 model config.
82
+ 2026-02-01 10:43:40,778 [INFO] `pretrained` specifies file path: /data1/zyj/checkpoints/ViT-SO400M-14-SigLIP-384/open_clip_pytorch_model.bin
83
+ 2026-02-01 10:43:40,778 [INFO] Instantiating model architecture: CustomTextCLIP
84
+ 2026-02-01 10:43:59,641 [INFO] Loading full pretrained weights from: /data1/zyj/checkpoints/ViT-SO400M-14-SigLIP-384/open_clip_pytorch_model.bin
85
+ 2026-02-01 10:44:04,702 [INFO] Final image preprocessing configuration set: {'size': (384, 384), 'mode': 'RGB', 'mean': (0.48145466, 0.4578275, 0.40821073), 'std': (0.26862954, 0.26130258, 0.27577711), 'interpolation': 'bicubic', 'resize_mode': 'shortest', 'fill_color': 0}
86
+ 2026-02-01 10:44:04,702 [INFO] Model ViT-SO400M-14-SigLIP-384 creation process complete.
87
+ 2026-02-01 10:44:06,271 [INFO] SigLIP model loaded on cuda:5
88
+ 2026-02-01 10:44:06,272 [INFO] Found 125 input files
89
+ 2026-02-01 10:44:20,369 [INFO] [batch_0030_384x384.parquet] Processed 1993 rows
90
+ 2026-02-01 10:45:59,867 [INFO] [batch_0001_384x384.parquet] Processed 1996 rows
91
+ 2026-02-01 10:46:32,133 [INFO] [batch_0031_384x384.parquet] Processed 1988 rows
92
+ 2026-02-01 10:47:08,397 [INFO] [batch_0002_384x384.parquet] Processed 1990 rows
93
+ 2026-02-01 10:48:03,827 [INFO] [batch_0032_384x384.parquet] Processed 1994 rows
94
+ 2026-02-01 10:48:20,770 [INFO] [batch_0003_384x384.parquet] Processed 1997 rows
95
+ 2026-02-01 10:50:02,578 [INFO] [batch_0033_384x384.parquet] Processed 1992 rows
96
+ 2026-02-01 10:50:06,189 [INFO] [batch_0004_384x384.parquet] Processed 1992 rows
97
+ 2026-02-01 10:52:02,296 [INFO] [batch_0034_384x384.parquet] Processed 1994 rows
98
+ 2026-02-01 10:53:52,804 [INFO] [batch_0035_384x384.parquet] Processed 1994 rows
99
+ 2026-02-01 10:55:40,379 [INFO] [batch_0036_384x384.parquet] Processed 1996 rows
100
+ 2026-02-01 10:57:08,912 [INFO] [batch_0037_384x384.parquet] Processed 1995 rows
101
+ 2026-02-01 10:58:42,083 [INFO] [batch_0038_384x384.parquet] Processed 1993 rows
102
+ 2026-02-01 11:00:31,963 [INFO] [batch_0039_384x384.parquet] Processed 1989 rows
103
+ 2026-02-01 11:02:16,803 [INFO] [batch_0040_384x384.parquet] Processed 1990 rows
104
+ 2026-02-01 11:04:12,580 [INFO] [batch_0041_384x384.parquet] Processed 1998 rows
105
+ 2026-02-01 11:05:52,695 [INFO] [batch_0042_384x384.parquet] Processed 1997 rows
106
+ 2026-02-01 11:07:38,215 [INFO] [batch_0043_384x384.parquet] Processed 1988 rows
107
+ 2026-02-01 11:09:18,740 [INFO] [batch_0044_384x384.parquet] Processed 1991 rows
108
+ 2026-02-01 11:10:59,852 [INFO] [batch_0045_384x384.parquet] Processed 1993 rows
109
+ 2026-02-01 11:12:35,695 [INFO] [batch_0046_384x384.parquet] Processed 1994 rows
110
+ 2026-02-01 11:14:12,998 [INFO] [batch_0047_384x384.parquet] Processed 1995 rows
111
+ 2026-02-01 11:15:30,214 [INFO] [batch_0048_384x384.parquet] Processed 1992 rows
112
+ 2026-02-01 11:17:05,225 [INFO] [batch_0049_384x384.parquet] Processed 1996 rows
113
+ 2026-02-01 11:18:50,252 [INFO] [batch_0050_384x384.parquet] Processed 1991 rows
114
+ 2026-02-01 11:20:25,931 [INFO] [batch_0051_384x384.parquet] Processed 1997 rows
115
+ 2026-02-01 11:21:43,527 [INFO] [batch_0052_384x384.parquet] Processed 1993 rows
116
+ 2026-02-01 11:23:12,150 [INFO] [batch_0053_384x384.parquet] Processed 1995 rows
117
+ 2026-02-01 11:24:47,385 [INFO] [batch_0054_384x384.parquet] Processed 1997 rows
118
+ 2026-02-01 11:26:31,520 [INFO] [batch_0055_384x384.parquet] Processed 1995 rows
119
+ 2026-02-01 11:28:03,476 [INFO] [batch_0056_384x384.parquet] Processed 1997 rows
120
+ 2026-02-01 11:29:48,548 [INFO] [batch_0057_384x384.parquet] Processed 1991 rows
121
+ 2026-02-01 11:31:29,605 [INFO] [batch_0058_384x384.parquet] Processed 1994 rows
122
+ 2026-02-01 11:33:17,760 [INFO] [batch_0059_384x384.parquet] Processed 1993 rows
123
+ 2026-02-01 11:34:50,684 [INFO] [batch_0060_384x384.parquet] Processed 1995 rows
124
+ 2026-02-01 11:36:38,080 [INFO] [batch_0061_384x384.parquet] Processed 1995 rows
125
+ 2026-02-01 11:38:19,287 [INFO] [batch_0062_384x384.parquet] Processed 1998 rows
126
+ 2026-02-01 11:40:01,382 [INFO] [batch_0063_384x384.parquet] Processed 1997 rows
127
+ 2026-02-01 11:41:28,396 [INFO] [batch_0064_384x384.parquet] Processed 1992 rows
128
+ 2026-02-01 11:43:07,187 [INFO] [batch_0065_384x384.parquet] Processed 1994 rows
129
+ 2026-02-01 11:44:47,035 [INFO] [batch_0066_384x384.parquet] Processed 1992 rows
130
+ 2026-02-01 11:46:38,657 [INFO] [batch_0067_384x384.parquet] Processed 1993 rows
131
+ 2026-02-01 11:48:25,045 [INFO] [batch_0068_384x384.parquet] Processed 1994 rows
132
+ 2026-02-01 11:50:24,090 [INFO] [batch_0069_384x384.parquet] Processed 1992 rows
133
+ 2026-02-01 11:52:05,360 [INFO] [batch_0070_384x384.parquet] Processed 1997 rows
134
+ 2026-02-01 11:53:51,383 [INFO] [batch_0071_384x384.parquet] Processed 1996 rows
135
+ 2026-02-01 11:55:00,188 [INFO] [batch_0072_384x384.parquet] Processed 1992 rows
136
+ 2026-02-01 11:56:16,122 [INFO] [batch_0073_384x384.parquet] Processed 1995 rows
137
+ 2026-02-01 11:57:30,601 [INFO] [batch_0074_384x384.parquet] Processed 1992 rows
138
+ 2026-02-01 11:58:47,717 [INFO] [batch_0075_384x384.parquet] Processed 1991 rows
139
+ 2026-02-01 12:00:01,207 [INFO] [batch_0076_384x384.parquet] Processed 1998 rows
140
+ 2026-02-01 12:01:14,471 [INFO] [batch_0077_384x384.parquet] Processed 1996 rows
141
+ 2026-02-01 12:02:31,575 [INFO] [batch_0078_384x384.parquet] Processed 1992 rows
142
+ 2026-02-01 12:03:52,303 [INFO] [batch_0079_384x384.parquet] Processed 1995 rows
143
+ 2026-02-01 12:05:06,370 [INFO] [batch_0080_384x384.parquet] Processed 1993 rows
144
+ 2026-02-01 12:06:16,989 [INFO] [batch_0081_384x384.parquet] Processed 1995 rows
145
+ 2026-02-01 12:07:32,029 [INFO] [batch_0082_384x384.parquet] Processed 1989 rows
146
+ 2026-02-01 12:08:47,568 [INFO] [batch_0083_384x384.parquet] Processed 1995 rows
147
+ 2026-02-01 12:10:03,544 [INFO] [batch_0084_384x384.parquet] Processed 1996 rows
148
+ 2026-02-01 12:11:20,376 [INFO] [batch_0085_384x384.parquet] Processed 1997 rows
149
+ 2026-02-01 12:12:38,318 [INFO] [batch_0086_384x384.parquet] Processed 1996 rows
150
+ 2026-02-01 12:13:56,314 [INFO] [batch_0087_384x384.parquet] Processed 1994 rows
151
+ 2026-02-01 12:15:14,513 [INFO] [batch_0088_384x384.parquet] Processed 1992 rows
152
+ 2026-02-01 12:16:32,334 [INFO] [batch_0089_384x384.parquet] Processed 1993 rows
153
+ 2026-02-01 12:17:52,186 [INFO] [batch_0090_384x384.parquet] Processed 1993 rows
154
+ 2026-02-01 12:19:10,443 [INFO] [batch_0091_384x384.parquet] Processed 1995 rows
155
+ 2026-02-01 12:20:24,543 [INFO] [batch_0092_384x384.parquet] Processed 1994 rows
156
+ 2026-02-01 12:21:42,150 [INFO] [batch_0093_384x384.parquet] Processed 1998 rows
157
+ 2026-02-01 12:22:50,203 [INFO] [batch_0094_384x384.parquet] Processed 1993 rows
158
+ 2026-02-01 12:24:08,849 [INFO] [batch_0095_384x384.parquet] Processed 1995 rows
159
+ 2026-02-01 12:25:14,387 [INFO] [batch_0096_384x384.parquet] Processed 1997 rows
160
+ 2026-02-01 12:26:27,496 [INFO] [batch_0097_384x384.parquet] Processed 1990 rows
161
+ 2026-02-01 12:27:38,051 [INFO] [batch_0098_384x384.parquet] Processed 1995 rows
162
+ 2026-02-01 12:28:46,151 [INFO] [batch_0099_384x384.parquet] Processed 1992 rows
163
+ 2026-02-01 12:29:56,731 [INFO] [batch_0100_384x384.parquet] Processed 1993 rows
164
+ 2026-02-01 12:31:13,328 [INFO] [batch_0101_384x384.parquet] Processed 1994 rows
165
+ 2026-02-01 12:32:22,428 [INFO] [batch_0102_384x384.parquet] Processed 1991 rows
166
+ 2026-02-01 12:33:34,185 [INFO] [batch_0103_384x384.parquet] Processed 1990 rows
167
+ 2026-02-01 12:34:42,817 [INFO] [batch_0104_384x384.parquet] Processed 1995 rows
168
+ 2026-02-01 12:35:53,075 [INFO] [batch_0105_384x384.parquet] Processed 1993 rows
169
+ 2026-02-01 12:37:03,504 [INFO] [batch_0106_384x384.parquet] Processed 1988 rows
170
+ 2026-02-01 12:38:12,118 [INFO] [batch_0107_384x384.parquet] Processed 1996 rows
171
+ 2026-02-01 12:39:26,579 [INFO] [batch_0108_384x384.parquet] Processed 1992 rows
172
+ 2026-02-01 12:40:38,968 [INFO] [batch_0109_384x384.parquet] Processed 1993 rows
173
+ 2026-02-01 12:41:50,225 [INFO] [batch_0110_384x384.parquet] Processed 1996 rows
174
+ 2026-02-01 12:42:59,782 [INFO] [batch_0111_384x384.parquet] Processed 1991 rows
175
+ 2026-02-01 12:44:13,255 [INFO] [batch_0112_384x384.parquet] Processed 1994 rows
176
+ 2026-02-01 12:45:25,863 [INFO] [batch_0113_384x384.parquet] Processed 1996 rows
177
+ 2026-02-01 12:46:42,753 [INFO] [batch_0114_384x384.parquet] Processed 1997 rows
178
+ 2026-02-01 12:47:54,480 [INFO] [batch_0115_384x384.parquet] Processed 1997 rows
179
+ 2026-02-01 12:49:00,711 [INFO] [batch_0116_384x384.parquet] Processed 1992 rows
180
+ 2026-02-01 12:50:12,844 [INFO] [batch_0117_384x384.parquet] Processed 1992 rows
181
+ 2026-02-01 12:51:27,205 [INFO] [batch_0118_384x384.parquet] Processed 1993 rows
182
+ 2026-02-01 12:52:36,479 [INFO] [batch_0119_384x384.parquet] Processed 1998 rows
183
+ 2026-02-01 12:53:54,416 [INFO] [batch_0120_384x384.parquet] Processed 1995 rows
184
+ 2026-02-01 12:55:03,501 [INFO] [batch_0121_384x384.parquet] Processed 1995 rows
185
+ 2026-02-01 12:56:14,997 [INFO] [batch_0122_384x384.parquet] Processed 1992 rows
186
+ 2026-02-01 12:57:29,495 [INFO] [batch_0123_384x384.parquet] Processed 1994 rows
187
+ 2026-02-01 12:58:37,341 [INFO] [batch_0124_384x384.parquet] Processed 1995 rows
188
+ 2026-02-01 12:59:35,927 [INFO] [batch_0125_384x384.parquet] Processed 1505 rows
189
+ 2026-02-01 12:59:35,927 [INFO] Merging all results...
190
+ 2026-02-01 12:59:35,965 [INFO] Final columns: ['grid_cell', 'grid_row_u', 'grid_col_r', 'product_id', 'timestamp', 'centre_lat', 'centre_lon', 'utm_crs', 'parquet_url', 'parquet_row', 'pixel_bbox', 'embedding']
191
+ 2026-02-01 12:59:35,965 [INFO] ✓ All unwanted columns removed
192
+ 2026-02-01 12:59:35,965 [INFO] ✓ Column 'crs' renamed to 'utm_crs'
193
+ 2026-02-01 12:59:35,965 [INFO] ✓ Column 'pixel_bbox' added
194
+ 2026-02-01 12:59:35,965 [INFO] Saving to /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/siglip/SigLIP_crop_384x384.parquet...
195
+ 2026-02-01 12:59:44,647 [INFO] ================================================================================
196
+ 2026-02-01 12:59:44,648 [INFO] Processing complete!
197
+ 2026-02-01 12:59:44,648 [INFO] Total rows: 248,719
198
+ 2026-02-01 12:59:44,648 [INFO] Embedding dimension: 1152
199
+ 2026-02-01 12:59:44,648 [INFO] Output file: /data1/zyj/EarthEmbeddings/Core-S2L2A-249k/siglip/SigLIP_crop_384x384.parquet
200
+ 2026-02-01 12:59:44,648 [INFO] ================================================================================
models/FarSLIP/.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **/logs/
2
+ **/wandb/
3
+ models/
4
+ features/
5
+ results/
6
+ src/open_clip_train/config.py
7
+ src/open_clip_train/output_samples/
8
+ **/results_retrieval/
9
+ **/results_classification/
10
+ checkpoints/
11
+
12
+ tests/data/
13
+ *.pt
14
+
15
+ # Byte-compiled / optimized / DLL files
16
+ __pycache__/
17
+ *.py[cod]
18
+ *$py.class
19
+
20
+ # C extensions
21
+ *.so
22
+
23
+ # Distribution / packaging
24
+ .Python
25
+ build/
26
+ develop-eggs/
27
+ dist/
28
+ downloads/
29
+ eggs/
30
+ .eggs/
31
+ lib/
32
+ lib64/
33
+ parts/
34
+ sdist/
35
+ var/
36
+ wheels/
37
+ pip-wheel-metadata/
38
+ share/python-wheels/
39
+ *.egg-info/
40
+ .installed.cfg
41
+ *.egg
42
+ MANIFEST
43
+
44
+ # PyInstaller
45
+ # Usually these files are written by a python script from a template
46
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
47
+ *.manifest
48
+ *.spec
49
+
50
+ # Installer logs
51
+ pip-log.txt
52
+ pip-delete-this-directory.txt
53
+
54
+ # Unit test / coverage reports
55
+ htmlcov/
56
+ .tox/
57
+ .nox/
58
+ .coverage
59
+ .coverage.*
60
+ .cache
61
+ nosetests.xml
62
+ coverage.xml
63
+ *.cover
64
+ *.py,cover
65
+ .hypothesis/
66
+ .pytest_cache/
67
+
68
+ # Translations
69
+ *.mo
70
+ *.pot
71
+
72
+ # Django stuff:
73
+ *.log
74
+ local_settings.py
75
+ db.sqlite3
76
+ db.sqlite3-journal
77
+
78
+ # Flask stuff:
79
+ instance/
80
+ .webassets-cache
81
+
82
+ # Scrapy stuff:
83
+ .scrapy
84
+
85
+ # Sphinx documentation
86
+ docs/_build/
87
+
88
+ # PyBuilder
89
+ target/
90
+
91
+ # Jupyter Notebook
92
+ .ipynb_checkpoints
93
+
94
+ # IPython
95
+ profile_default/
96
+ ipython_config.py
97
+
98
+ # pyenv
99
+ .python-version
100
+
101
+ # pipenv
102
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
104
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
105
+ # install all needed dependencies.
106
+ #Pipfile.lock
107
+
108
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
109
+ __pypackages__/
110
+
111
+ # Celery stuff
112
+ celerybeat-schedule
113
+ celerybeat.pid
114
+
115
+ # SageMath parsed files
116
+ *.sage.py
117
+
118
+ # Environments
119
+ .env
120
+ .venv
121
+ env/
122
+ venv/
123
+ ENV/
124
+ env.bak/
125
+ venv.bak/
126
+
127
+ # Spyder project settings
128
+ .spyderproject
129
+ .spyproject
130
+
131
+ # Rope project settings
132
+ .ropeproject
133
+
134
+ # mkdocs documentation
135
+ /site
136
+
137
+ # mypy
138
+ .mypy_cache/
139
+ .dmypy.json
140
+ dmypy.json
141
+
142
+ # Pyre type checker
143
+ .pyre/
144
+ sync.sh
145
+ gpu1sync.sh
146
+ .idea
147
+ *.pdf
148
+ **/._*
149
+ **/*DS_*
150
+ **.jsonl
151
+ src/sbatch
152
+ src/misc
153
+ .vscode
154
+ src/debug
155
+ core.*
156
+
157
+ *.out
158
+
159
+ # Allow
160
+ !src/evaluation/misc/results_dbs/*
models/FarSLIP/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 LHRS
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
models/FarSLIP/README.md ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <h1 align="center"> FarSLIP: Discovering Effective CLIP Adaptation for Fine-Grained Remote Sensing Understanding </h1>
2
+
3
+ <p align="center">
4
+ <a href="https://huggingface.co/datasets/ZhenShiL/MGRS-200k">
5
+ <img alt="Hugging Face Dataset" src="https://img.shields.io/badge/🤗%20Hugging%20Face-Dataset-blue">
6
+ </a>
7
+ <a href="https://huggingface.co/ZhenShiL/FarSLIP">
8
+ <img alt="Hugging Face Model" src="https://img.shields.io/badge/🤗%20Hugging%20Face-Model-yellow">
9
+ </a>
10
+ <a href="https://arxiv.org/abs/2511.14901">
11
+ <img alt="arXiv" src="https://img.shields.io/badge/arXiv-2511.14901-b31b1b">
12
+ </a>
13
+ </p>
14
+
15
+
16
+ ## Introduction
17
+ We introduce FarSLIP, a vision-language foundation model for remote sensing (RS) that achieves fine-grained vision-language alignment. FarSLIP demonstrates state-of-the-art performance on both fine-grained and image-level tasks, including open-vocabulary semantic segmentation, zero-shot classification, and image-text retrieval.
18
+ We also construct MGRS-200k, the first multi-granularity image-text dataset for RS. Each image is annotated with both short and long global-level captions, along with multiple object-category pairs.
19
+
20
+ <figure>
21
+ <div align="center">
22
+ <img src=assets/model.png width="60%">
23
+ </div>
24
+ </figure>
25
+
26
+
27
+ ## Table of Contents
28
+ - [Introduction](#Introduction)
29
+ - [Preparation](#Preparation)
30
+ - [Installation](#Installation)
31
+ - [Checkpoints](#Checkpoints)
32
+ - [Dataset](#Dataset)
33
+ - [Training](#Training)
34
+ - [Testing](#Testing)
35
+ - [Open-vocabulary semantic segmentation](#open-vocabulary-semantic-segmentation)
36
+ - [Zero-shot scene classification](#zero-shot-scene-classification)
37
+ - [Zero-shot image-text retrieval](#zero-shot-image-text-retrieval)
38
+ - [Acknowledgement](#Acknowledgement)
39
+ - [Citing](#Citing)
40
+
41
+
42
+
43
+
44
+
45
+ ## Preparation
46
+
47
+ ### Installation
48
+
49
+ 1. Clone this repository.
50
+
51
+ ~~~shell
52
+ git clone git@github.com:NJU-LHRS/FarSLIP.git
53
+ cd FarSLIP
54
+ ~~~
55
+
56
+ 2. Create a new virtual environment.
57
+
58
+ ~~~shell
59
+ conda create -n farslip python=3.10
60
+ conda activate farslip
61
+ ~~~
62
+
63
+ 3. Install dependencies.
64
+
65
+ ~~~shell
66
+ pip install -r requirements.txt
67
+ ~~~
68
+
69
+ ### Checkpoints
70
+ You can download all our checkpoints from [Huggingface](https://huggingface.co/ZhenShiL/FarSLIP), or selectively download them through the links below.
71
+
72
+ | Model name | ViT-arch. | Test encoder | OVSS mIoU (%) | ZSC top-1 acc. (%) | Download |
73
+ |-------------|-----------|--------------|----------------|--------------------|----------------|
74
+ | FarSLIP-s1 | ViT-B-32 | Vanilla | 29.87 | 58.64 | [FarSLIP1_ViT-B-32](https://huggingface.co/ZhenShiL/FarSLIP/resolve/main/FarSLIP1_ViT-B-32.pt?download=true) |
75
+ | FarSLIP-s1 | ViT-B-16 | LongCLIP | 35.44 | 61.89 | [FarSLIP1_ViT-B-16](https://huggingface.co/ZhenShiL/FarSLIP/resolve/main/FarSLIP1_ViT-B-16.pt?download=true) |
76
+ | FarSLIP-s2 | ViT-B-32 | Vanilla | 30.49 | 60.12 | [FarSLIP2_ViT-B-32](https://huggingface.co/ZhenShiL/FarSLIP/resolve/main/FarSLIP2_ViT-B-32.pt?download=true) |
77
+ | FarSLIP-s2 | ViT-B-16 | LongCLIP | 35.41 | 62.24 | [FarSLIP2_ViT-B-16](https://huggingface.co/ZhenShiL/FarSLIP/resolve/main/FarSLIP2_ViT-B-16.pt?download=true) |
78
+
79
+
80
+ ### Dataset
81
+ FarSLIP is trained in two stages.
82
+ + In the first stage, we use the [RS5M](https://github.com/om-ai-lab/RS5M) dataset. A quick portal to the RS5M dataset: [link](https://huggingface.co/datasets/omlab/RS5M).
83
+ + In the second stage, we use the proposed MGRS-200k dataset, which is available on [Huggingface](https://huggingface.co/datasets/ZhenShiL/MGRS-200k).
84
+
85
+ [//]: # (<figure>)
86
+
87
+ [//]: # (<div align="center">)
88
+
89
+ [//]: # (<img src=assets/dataset.png width="80%">)
90
+
91
+ [//]: # (</div>)
92
+
93
+ [//]: # (<figcaption align="center"><em>Examples from MGRS-200k</em></figcaption>)
94
+
95
+ [//]: # (</figure>)
96
+
97
+ <p align="center">
98
+ <img src="assets/dataset.png" width="100%">
99
+ <br>
100
+ <em>Examples from MGRS-200k</em>
101
+ </p>
102
+
103
+ ## Training
104
+
105
+ + Validation data preparation
106
+ + Replace --root-val-img-dir and --val-data in [config.py](./open_clip_train/config.py) with the paths to your [SkyScript](https://github.com/wangzhecheng/SkyScript?tab=readme-ov-file#download) validation dataset ('SkyScript_val_5K_filtered_by_CLIP_openai').
107
+ + Stage1
108
+ ~~~shell
109
+ torchrun --nproc_per_node=4 -m open_clip_train.main \
110
+ --train-dataset-name RS5M \
111
+ --train-data '/your/path/to/rs5m/{pub11,rs3}-train-{0000..0031}.tar' \
112
+ --train-dataset-type webdataset \
113
+ --train-num-samples 5070186 \
114
+ --method farslip1 \
115
+ --use-imagecrop-aug \
116
+ --local-method randomcrops \
117
+ --warmup 1000 \
118
+ --batch-size 40 \
119
+ --lr 1e-6 \
120
+ --wd 1.0 \
121
+ --epochs 1 \
122
+ --model ViT-B-16 \
123
+ --loss-type global_itc distill \
124
+ --distill-align roi2pooled
125
+ ~~~
126
+
127
+ + Stage2
128
+ ~~~shell
129
+ torchrun --nproc_per_node=4 -m open_clip_train.main \
130
+ --train-dataset-name MGRS \
131
+ --root-train-img-dir '/your/path/to/mgrs/global_imgs/' \
132
+ --train-data '/your/path/to/mgrs/text_info.json' \
133
+ --train-dataset-type json \
134
+ --method farslip2 \
135
+ --warmup 250 \
136
+ --batch-size 40 \
137
+ --lr 4e-9 \
138
+ --wd 1.0 \
139
+ --epochs 10 \
140
+ --model ViT-B-16 \
141
+ --loss-type global_itc local_itc \
142
+ --local-itc-align cls
143
+ ~~~
144
+
145
+ ## Testing
146
+ ### Open-vocabulary semantic segmentation
147
+ + Please check out [FarSLIP-OVSS](https://github.com/NJU-LHRS/FarSLIP-OVSS) for evaluation of open-vocabulary semantic segmentation in RS images.
148
+
149
+ <p align="center">
150
+ <img src="assets/ovss.png" width="100%">
151
+ <br>
152
+ <em>
153
+ OVSS accuracies across RS benchmarks (mIoU, %). G denotes general-domain models, and RS refers to RS-specific models.
154
+ f. indicates models specifically designed with fine-grained optimization. All models use an input image size of 224, except TIPS (448)
155
+ </em>
156
+ </p>
157
+
158
+
159
+
160
+ ### Zero-shot scene classification
161
+ + Please refer to [SkyScript](https://github.com/wangzhecheng/SkyScript?tab=readme-ov-file#download-benchmark-datasets) for scene classification dataset preparation, including 'SkyScript_cls', 'aid', 'eurosat', 'fmow', 'millionaid', 'patternnet', 'rsicb', 'nwpu'.
162
+ + Replace the BENCHMARK_DATASET_ROOT_DIR in [tests/test_scene_classification.py](./tests/test_scene_classification.py) to your own path.
163
+
164
+ + Run testing:
165
+ + FarSLIP-s1
166
+ ```
167
+ python -m tests.test_scene_classification --model-arch $VIT --model-name FarSLIP1 --force-quick-gelu --pretrained checkpoints/FarSLIP1_$VIT.pt
168
+ ```
169
+ <!-- + FarSLIP-s2 with vanilla CLIP text encoder
170
+ ```
171
+ python -m tests.test_scene_classification --model-arch $VIT --model-name FarSLIP2_VC --force-quick-gelu --pretrained checkpoints/FarSLIP2_VC_$VIT.pt
172
+ ``` -->
173
+ + FarSLIP-s2 with LongCLIP text encoder (supporting long text)
174
+ ```
175
+ python -m tests.test_scene_classification --model-arch $VIT --model-name FarSLIP2 --force-quick-gelu --pretrained checkpoints/FarSLIP2_$VIT.pt --use-long-clip
176
+ ```
177
+ - `$VIT` options: `ViT-B-16`, `ViT-B-32`
178
+
179
+ <figure>
180
+ <div align="center">
181
+ <img src=assets/classification.png width="100%">
182
+ </div>
183
+ <figcaption align="center">
184
+ <em>Comparison of zero-shot classification accuracies (Top-1 acc., %) of different RS-specific CLIP variants across multiple benchmarks.</em>
185
+ </figcaption>
186
+ </figure>
187
+
188
+
189
+ ### Zero-shot image-text retrieval
190
+ + Please refer to [SkyScript](https://github.com/wangzhecheng/SkyScript?tab=readme-ov-file#download-benchmark-datasets) for image-text retrieval dataset preparation, including 'RSICD', 'RSITMD', 'ucmcaptions', and ['SkyScript-retrieval'](https://github.com/wangzhecheng/SkyScript?tab=readme-ov-file#download) ('SkyScript_test_30K_filtered_by_CLIP_openai.csv').
191
+ + Replace the DATA_CSV_PATH_DICT, SKYSCRIPT_IMAGE_DIR, RETRIEVAL_IMAGE_DIR in [tests/test_retrieval.py](./tests/test_retrieval.py) to your own path.
192
+
193
+ + Run testing:
194
+ + FarSLIP-s1
195
+ ```
196
+ python -m tests.test_retrieval --model-arch $VIT --model-name FarSLIP1 --force-quick-gelu --pretrained checkpoints/FarSLIP1_$VIT.pt
197
+ ```
198
+ <!-- + FarSLIP-s2 with vanilla CLIP text encoder
199
+ ```
200
+ python -m tests.test_retrieval --model-arch $VIT --model-name FarSLIP2_VC --force-quick-gelu --pretrained checkpoints/FarSLIP2_VC_$VIT.pt
201
+ ``` -->
202
+ + FarSLIP-s2 with LongCLIP text encoder (supporting long text)
203
+ ```
204
+ python -m tests.test_retrieval --model-arch $VIT --model-name FarSLIP2 --force-quick-gelu --pretrained checkpoints/FarSLIP2_$VIT.pt --use-long-clip
205
+ ```
206
+ - `$VIT` options: `ViT-B-16`, `ViT-B-32`
207
+
208
+
209
+ <div align="center">
210
+ <img src=assets/retrieval.png width="50%">
211
+ </div>
212
+ <figcaption align="center">
213
+ <em>Comparison of cross-modal retrieval accuracies (%) of different RS-specific CLIP variants across multiple benchmarks. *
214
+ indicates models trained with in-hold supervision.</em>
215
+ </figcaption>
216
+ </figure>
217
+
218
+
219
+
220
+
221
+ ## Acknowledgement
222
+
223
+ + We are grateful to the following repositories for their wonderful work: [Open-CLIP](https://github.com/mlfoundations/open_clip), [CLIPSelf](https://github.com/wusize/CLIPSelf), [FineCLIP](https://github.com/Timsty1/FineCLIP), [Long-CLIP](https://github.com/beichenzbc/Long-CLIP), [SkyScript](https://github.com/wangzhecheng/SkyScript), [SegEarth](https://github.com/likyoo/SegEarth-OV).
224
+
225
+
226
+ ## Citing
227
+
228
+ + If you find our work useful, please give us a 🌟 on GitHub and consider citing our paper:
229
+
230
+ ~~~tex
231
+ @article{li2025farslip,
232
+ title={FarSLIP: Discovering Effective CLIP Adaptation for Fine-Grained Remote Sensing Understanding},
233
+ author={Zhenshi Li and Weikang Yu and Dilxat Muhtar and Xueliang Zhang and Pengfeng Xiao and Pedram Ghamisi and Xiao Xiang Zhu},
234
+ journal={arXiv preprint arXiv:2511.14901},
235
+ year={2025}
236
+ }
237
+ ~~~
models/FarSLIP/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .open_clip import *
models/FarSLIP/open_clip/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .version import __version__
2
+
3
+ from .coca_model import CoCa
4
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
5
+ from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
6
+ from .factory import list_models, add_model_config, get_model_config, load_checkpoint
7
+ from .loss import ClipLoss, DistillClipLoss, CoCaLoss
8
+ from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
9
+ convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \
10
+ get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg
11
+ from .openai import load_openai_model, list_openai_models
12
+ from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
13
+ get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
14
+ from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
15
+ from .tokenizer import SimpleTokenizer, tokenize, decode
16
+ from .transform import image_transform, AugmentationCfg
17
+ from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy
18
+ from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES
models/FarSLIP/open_clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
models/FarSLIP/open_clip/coca_model.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Union
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ from dataclasses import dataclass
8
+
9
+ from .transformer import (
10
+ LayerNormFp32,
11
+ LayerNorm,
12
+ QuickGELU,
13
+ MultimodalTransformer,
14
+ )
15
+ from .model import CLIPTextCfg, CLIPVisionCfg, _build_vision_tower, _build_text_tower
16
+
17
+ try:
18
+ from transformers import (
19
+ BeamSearchScorer,
20
+ LogitsProcessorList,
21
+ TopPLogitsWarper,
22
+ TopKLogitsWarper,
23
+ RepetitionPenaltyLogitsProcessor,
24
+ MinLengthLogitsProcessor,
25
+ MaxLengthCriteria,
26
+ StopStringCriteria,
27
+ EosTokenCriteria,
28
+ StoppingCriteriaList
29
+ )
30
+
31
+ GENERATION_TYPES = {
32
+ "top_k": TopKLogitsWarper,
33
+ "top_p": TopPLogitsWarper,
34
+ "beam_search": "beam_search"
35
+ }
36
+ _has_transformers = True
37
+ except ImportError as e:
38
+ GENERATION_TYPES = {
39
+ "top_k": None,
40
+ "top_p": None,
41
+ "beam_search": "beam_search"
42
+ }
43
+ _has_transformers = False
44
+
45
+
46
+ @dataclass
47
+ class MultimodalCfg(CLIPTextCfg):
48
+ mlp_ratio: int = 4
49
+ dim_head: int = 64
50
+ heads: int = 8
51
+ n_queries: int = 256
52
+ attn_pooler_heads: int = 8
53
+
54
+
55
+ def _build_text_decoder_tower(
56
+ embed_dim,
57
+ multimodal_cfg,
58
+ quick_gelu: bool = False,
59
+ cast_dtype: Optional[torch.dtype] = None,
60
+ ):
61
+ multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
62
+ act_layer = QuickGELU if quick_gelu else nn.GELU
63
+ norm_layer = (
64
+ LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
65
+ )
66
+
67
+ decoder = MultimodalTransformer(
68
+ context_length=multimodal_cfg.context_length,
69
+ width=multimodal_cfg.width,
70
+ heads=multimodal_cfg.heads,
71
+ layers=multimodal_cfg.layers,
72
+ ls_init_value=multimodal_cfg.ls_init_value,
73
+ output_dim=embed_dim,
74
+ act_layer=act_layer,
75
+ norm_layer=norm_layer,
76
+ )
77
+
78
+ return decoder
79
+
80
+
81
+ def _token_to_tensor(token_id, device: str = "cpu") -> torch.Tensor:
82
+ if not isinstance(token_id, torch.Tensor):
83
+ if isinstance(token_id, int):
84
+ token_id = [token_id]
85
+ token_id = torch.tensor(token_id, device=device)
86
+ return token_id
87
+
88
+
89
+ class CoCa(nn.Module):
90
+ def __init__(
91
+ self,
92
+ embed_dim,
93
+ multimodal_cfg: MultimodalCfg,
94
+ text_cfg: CLIPTextCfg,
95
+ vision_cfg: CLIPVisionCfg,
96
+ quick_gelu: bool = False,
97
+ init_logit_scale: float = np.log(1 / 0.07),
98
+ init_logit_bias: Optional[float] = None,
99
+ nonscalar_logit_scale: bool = False,
100
+ cast_dtype: Optional[torch.dtype] = None,
101
+ pad_id: int = 0,
102
+ ):
103
+ super().__init__()
104
+ multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
105
+ text_cfg = CLIPTextCfg(**text_cfg) if isinstance(text_cfg, dict) else text_cfg
106
+ vision_cfg = CLIPVisionCfg(**vision_cfg) if isinstance(vision_cfg, dict) else vision_cfg
107
+
108
+ self.text = _build_text_tower(
109
+ embed_dim=embed_dim,
110
+ text_cfg=text_cfg,
111
+ quick_gelu=quick_gelu,
112
+ cast_dtype=cast_dtype,
113
+ )
114
+
115
+ vocab_size = (
116
+ text_cfg.vocab_size # for hf models
117
+ if hasattr(text_cfg, "hf_model_name") and text_cfg.hf_model_name is not None
118
+ else text_cfg.vocab_size
119
+ )
120
+
121
+ self.visual = _build_vision_tower(
122
+ embed_dim=embed_dim,
123
+ vision_cfg=vision_cfg,
124
+ quick_gelu=quick_gelu,
125
+ cast_dtype=cast_dtype,
126
+ )
127
+
128
+ self.text_decoder = _build_text_decoder_tower(
129
+ vocab_size,
130
+ multimodal_cfg=multimodal_cfg,
131
+ quick_gelu=quick_gelu,
132
+ cast_dtype=cast_dtype,
133
+ )
134
+
135
+ lshape = [1] if nonscalar_logit_scale else []
136
+ self.logit_scale = nn.Parameter(torch.ones(lshape) * init_logit_scale)
137
+ if init_logit_bias is not None:
138
+ self.logit_bias = nn.Parameter(torch.ones(lshape) * init_logit_bias)
139
+ else:
140
+ self.logit_bias = None
141
+ self.pad_id = pad_id
142
+
143
+ self.context_length = multimodal_cfg.context_length
144
+
145
+ @torch.jit.ignore
146
+ def set_grad_checkpointing(self, enable: bool = True):
147
+ self.visual.set_grad_checkpointing(enable)
148
+ self.text.set_grad_checkpointing(enable)
149
+ self.text_decoder.set_grad_checkpointing(enable)
150
+
151
+ def _encode_image(self, images, normalize: bool = True):
152
+ image_latent, tokens_embs = self.visual(images)
153
+ image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent
154
+ return image_latent, tokens_embs
155
+
156
+ def _encode_text(self, text, normalize: bool = True):
157
+ text_latent, token_emb = self.text(text)
158
+ text_latent = F.normalize(text_latent, dim=-1) if normalize else text_latent
159
+ return text_latent, token_emb
160
+
161
+ def encode_image(self, images, normalize: bool = True):
162
+ image_latent, _ = self._encode_image(images, normalize=normalize)
163
+ return image_latent
164
+
165
+ def encode_text(self, text, normalize: bool = True):
166
+ text_latent, _ = self._encode_text(text, normalize=normalize)
167
+ return text_latent
168
+
169
+ def forward_intermediates(
170
+ self,
171
+ image: Optional[torch.Tensor] = None,
172
+ text: Optional[torch.Tensor] = None,
173
+ image_indices: Optional[Union[int, List[int]]] = None,
174
+ text_indices: Optional[Union[int, List[int]]] = None,
175
+ stop_early: bool = False,
176
+ normalize: bool = True,
177
+ normalize_intermediates: bool = False,
178
+ intermediates_only: bool = False,
179
+ image_output_fmt: str = 'NCHW',
180
+ image_output_extra_tokens: bool = False,
181
+ text_output_fmt: str = 'NLC',
182
+ text_output_extra_tokens: bool = False,
183
+ output_logits: bool = False,
184
+ output_logit_scale_bias: bool = False,
185
+ ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
186
+ """ Forward features that returns intermediates.
187
+
188
+ Args:
189
+ image: Input image tensor
190
+ text: Input text tensor
191
+ image_indices: For image tower, Take last n blocks if int, all if None, select matching indices if sequence
192
+ text_indices: Take last n blocks if int, all if None, select matching indices if sequence
193
+ stop_early: Stop iterating over blocks when last desired intermediate hit
194
+ normalize: L2 Normalize final image and text features (if present)
195
+ normalize_intermediates: Apply final encoder norm layer to all intermediates (if possible)
196
+ intermediates_only: Only return intermediate features, do not return final features
197
+ image_output_fmt: Shape of intermediate image feature outputs
198
+ image_output_extra_tokens: Return both prefix and spatial intermediate tokens
199
+ text_output_fmt: Shape of intermediate text feature outputs
200
+ text_output_extra_tokens: Return both prefix and spatial intermediate tokens
201
+ output_logits: Include logits in output
202
+ output_logit_scale_bias: Include the logit scale bias in the output
203
+ Returns:
204
+
205
+ """
206
+ output = {}
207
+ if intermediates_only:
208
+ # intermediates only disables final feature normalization, and include logits
209
+ normalize = False
210
+ output_logits = False
211
+ if output_logits:
212
+ assert False, 'FIXME, needs implementing'
213
+
214
+ if image is not None:
215
+ image_output = self.visual.forward_intermediates(
216
+ image,
217
+ indices=image_indices,
218
+ stop_early=stop_early,
219
+ normalize_intermediates=normalize_intermediates,
220
+ intermediates_only=intermediates_only,
221
+ output_fmt=image_output_fmt,
222
+ output_extra_tokens=image_output_extra_tokens,
223
+ )
224
+ if normalize and "image_features" in image_output:
225
+ image_output["image_features"] = F.normalize(image_output["image_features"], dim=-1)
226
+ output.update(image_output)
227
+
228
+ if text is not None:
229
+ text_output = self.text.forward_intermediates(
230
+ text,
231
+ indices=text_indices,
232
+ stop_early=stop_early,
233
+ normalize_intermediates=normalize_intermediates,
234
+ intermediates_only=intermediates_only,
235
+ output_fmt=text_output_fmt,
236
+ output_extra_tokens=text_output_extra_tokens,
237
+ )
238
+ if normalize and "text_features" in text_output:
239
+ text_output["text_features"] = F.normalize(text_output["text_features"], dim=-1)
240
+ output.update(text_output)
241
+
242
+ # FIXME text decoder
243
+ logit_scale_exp = self.logit_scale.exp() if output_logits or output_logit_scale_bias else None
244
+ if output_logit_scale_bias:
245
+ output["logit_scale"] = logit_scale_exp
246
+ if self.logit_bias is not None:
247
+ output['logit_bias'] = self.logit_bias
248
+
249
+ return output
250
+
251
+ def forward(
252
+ self,
253
+ image,
254
+ text: Optional[torch.Tensor] = None,
255
+ image_latent: Optional[torch.Tensor] = None,
256
+ image_embs: Optional[torch.Tensor] = None,
257
+ output_labels: bool = True,
258
+ ):
259
+ if image_latent is None or image_embs is None:
260
+ image_latent, image_embs = self._encode_image(image)
261
+
262
+ if text is None:
263
+ return {"image_features": image_latent, "image_embs": image_embs}
264
+
265
+ text_latent, token_embs = self._encode_text(text)
266
+
267
+ # FIXME this isn't an ideal solution, would like to improve -RW
268
+ labels: Optional[torch.Tensor] = text[:, 1:] if output_labels else None
269
+ if output_labels:
270
+ # align text_embs and thus logits with labels for teacher-forcing caption loss
271
+ token_embs = token_embs[:, :-1]
272
+
273
+ logits = self.text_decoder(image_embs, token_embs)
274
+ out_dict = {
275
+ "image_features": image_latent,
276
+ "text_features": text_latent,
277
+ "logits": logits,
278
+ "logit_scale": self.logit_scale.exp()
279
+ }
280
+ if labels is not None:
281
+ out_dict["labels"] = labels
282
+ if self.logit_bias is not None:
283
+ out_dict["logit_bias"] = self.logit_bias
284
+ return out_dict
285
+
286
    def generate(
        self,
        image,
        text=None,
        seq_len=30,
        max_seq_len=77,
        temperature=1.,
        generation_type="beam_search",
        top_p=0.1,  # keep tokens in the 1 - top_p quantile
        top_k=1,  # keeps the top_k most probable tokens
        pad_token_id=None,
        eos_token_id=None,
        sot_token_id=None,
        num_beams=6,
        num_beam_groups=3,
        min_seq_len=5,
        stopping_criteria=None,
        repetition_penalty=1.0,
        fixed_output_length=False  # if True output.shape == (batch_size, seq_len)
    ):
        """Autoregressively generate caption token ids for `image`.

        generation_type selects 'beam_search' (delegates to _generate_beamsearch)
        or per-step sampling filtered by a 'top_p' / 'top_k' logits warper.
        Returns a tensor of token ids, padded to exactly seq_len per row when
        fixed_output_length is True.
        """
        # taking many ideas and components from HuggingFace GenerationMixin
        # https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
        assert _has_transformers, "Please install transformers for generate functionality. `pip install transformers`."
        assert seq_len > min_seq_len, "seq_len must be larger than min_seq_len"
        device = image.device

        with torch.no_grad():
            # 49406/49407 defaults — presumably the CLIP BPE <start>/<end> ids;
            # TODO confirm they match any custom tokenizer in use.
            sot_token_id = _token_to_tensor(49406 if sot_token_id is None else sot_token_id, device=device)
            eos_token_id = _token_to_tensor(49407 if eos_token_id is None else eos_token_id, device=device)
            pad_token_id = self.pad_id if pad_token_id is None else pad_token_id
            logit_processor = LogitsProcessorList(
                [
                    MinLengthLogitsProcessor(min_seq_len, eos_token_id),
                    RepetitionPenaltyLogitsProcessor(repetition_penalty),
                ]
            )

            if stopping_criteria is None:
                stopping_criteria = [MaxLengthCriteria(max_length=seq_len)]
            stopping_criteria = StoppingCriteriaList(stopping_criteria)

            if generation_type == "beam_search":
                output = self._generate_beamsearch(
                    image_inputs=image,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                    sot_token_id=sot_token_id,
                    num_beams=num_beams,
                    num_beam_groups=num_beam_groups,
                    min_seq_len=min_seq_len,
                    stopping_criteria=stopping_criteria,
                    logit_processor=logit_processor,
                )
                if fixed_output_length and output.shape[1] < seq_len:
                    # Right-pad beam-search output up to the requested length.
                    pad_len = seq_len - output.shape[1]
                    return torch.cat((
                            output,
                            torch.ones(output.shape[0], pad_len, device=device, dtype=output.dtype) * pad_token_id
                        ),
                        dim=1
                    )
                return output

            elif generation_type == "top_p":
                logit_warper = GENERATION_TYPES[generation_type](top_p)
            elif generation_type == "top_k":
                logit_warper = GENERATION_TYPES[generation_type](top_k)
            else:
                raise ValueError(
                    f"generation_type has to be one of "
                    f"{'| ' + ' | '.join(list(GENERATION_TYPES.keys())) + ' |'}."
                )

            # Encode the image once; reused for every decoding step below.
            image_latent, image_embs = self._encode_image(image)

            if text is None:
                # Seed each sequence with the start-of-text token.
                text = torch.ones((image.shape[0], 1), device=device, dtype=torch.long) * sot_token_id

            was_training = self.training
            num_dims = len(text.shape)

            if num_dims == 1:
                text = text[None, :]

            self.eval()
            out = text

            while True:
                # Condition on at most the last max_seq_len tokens.
                x = out[:, -max_seq_len:]
                cur_len = x.shape[1]
                logits = self(
                    image,
                    x,
                    image_latent=image_latent,
                    image_embs=image_embs,
                    output_labels=False,
                )["logits"][:, -1]
                # Rows whose last token is EOS or PAD are finished; they only
                # receive further PAD tokens.
                mask = (out[:, -1] == eos_token_id) | (out[:, -1] == pad_token_id)
                sample = torch.ones((out.shape[0], 1), device=device, dtype=torch.long) * pad_token_id

                if mask.all():
                    if not fixed_output_length:
                        break
                else:
                    logits = logits[~mask, :]
                    filtered_logits = logit_processor(x[~mask, :], logits)
                    filtered_logits = logit_warper(x[~mask, :], filtered_logits)
                    probs = F.softmax(filtered_logits / temperature, dim=-1)

                    if (cur_len + 1 == seq_len):
                        # Force EOS on the final step so sequences terminate.
                        sample[~mask, :] = torch.ones((sum(~mask), 1), device=device, dtype=torch.long) * eos_token_id
                    else:
                        sample[~mask, :] = torch.multinomial(probs, 1)

                out = torch.cat((out, sample), dim=-1)

                cur_len += 1

                if all(stopping_criteria(out, None)):
                    break

            if num_dims == 1:
                out = out.squeeze(0)

            # Restore the caller's train/eval mode.
            self.train(was_training)
            return out
412
+
413
    def _generate_beamsearch(
            self,
            image_inputs,
            pad_token_id=None,
            eos_token_id=None,
            sot_token_id=None,
            num_beams=6,
            num_beam_groups=3,
            min_seq_len=5,
            stopping_criteria=None,
            logit_processor=None,
            logit_warper=None,
    ):
        """Group-wise (diverse) beam search over image-conditioned captions.

        Mirrors HuggingFace's group beam search: the beams are split into
        num_beam_groups groups; each group is advanced in turn, with the logits
        processor seeing the tokens already chosen by earlier groups this step.
        Returns the finalized `sequences` tensor from BeamSearchScorer.
        """
        device = image_inputs.device
        batch_size = image_inputs.shape[0]
        # Replicate each image num_beams times so every beam decodes against it.
        image_inputs = torch.repeat_interleave(image_inputs, num_beams, dim=0)
        image_latent, image_embs = self._encode_image(image_inputs)

        # Every beam starts from a single start-of-text token.
        input_ids = torch.ones((batch_size * num_beams, 1), device=device, dtype=torch.long)
        input_ids = input_ids * sot_token_id
        beam_scorer = BeamSearchScorer(
            batch_size=batch_size,
            num_beams=num_beams,
            device=device,
            num_beam_groups=num_beam_groups,
        )
        # instantiate logits processors
        logits_processor = (
            LogitsProcessorList([MinLengthLogitsProcessor(min_seq_len, eos_token_id=eos_token_id)])
            if logit_processor is None
            else logit_processor
        )

        num_beams = beam_scorer.num_beams
        num_beam_groups = beam_scorer.num_beam_groups
        num_sub_beams = num_beams // num_beam_groups
        batch_size = len(beam_scorer._beam_hyps) // num_beam_groups
        batch_beam_size, cur_len = input_ids.shape
        beam_indices = None

        if num_beams * batch_size != batch_beam_size:
            raise ValueError(
                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
            )

        beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
        # initialise score of first beam of each group with 0 and the rest with -1e9.
        # This ensures that the beams in the same group don't all produce the same
        # tokens every time.
        beam_scores[:, ::num_sub_beams] = 0
        beam_scores = beam_scores.view((batch_size * num_beams,))

        while True:

            # predicted tokens in cur_len step
            current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)

            # indices which will form the beams in the next time step
            reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)

            # do one decoder step on all beams of all sentences in batch
            model_inputs = prepare_inputs_for_generation(input_ids=input_ids, image_inputs=image_inputs)
            outputs = self(
                model_inputs['images'],
                model_inputs['text'],
                image_latent=image_latent,
                image_embs=image_embs,
                output_labels=False,
            )

            for beam_group_idx in range(num_beam_groups):
                group_start_idx = beam_group_idx * num_sub_beams
                group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
                group_size = group_end_idx - group_start_idx

                # indices of beams of current group among all sentences in batch
                batch_group_indices = []

                for batch_idx in range(batch_size):
                    batch_group_indices.extend(
                        [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
                    )
                group_input_ids = input_ids[batch_group_indices]

                # select outputs of beams of current group only
                next_token_logits = outputs['logits'][batch_group_indices, -1, :]
                vocab_size = next_token_logits.shape[-1]

                next_token_scores_processed = logits_processor(
                    group_input_ids, next_token_logits, current_tokens=current_tokens, beam_group_idx=beam_group_idx
                )
                # Add the running beam scores so topk ranks whole hypotheses.
                next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1)
                next_token_scores = next_token_scores.expand_as(next_token_scores_processed)

                # reshape for beam search
                next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)

                # Keep 2*group_size candidates so finished (EOS) hypotheses don't
                # starve the live beams.
                next_token_scores, next_tokens = torch.topk(
                    next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
                )

                # Decompose flat candidate ids into (beam index, token id).
                next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
                next_tokens = next_tokens % vocab_size

                # stateless
                process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
                beam_outputs = beam_scorer.process(
                    group_input_ids,
                    next_token_scores,
                    next_tokens,
                    next_indices,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                    beam_indices=process_beam_indices,
                    group_index=beam_group_idx,
                )
                beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
                beam_next_tokens = beam_outputs["next_beam_tokens"]
                beam_idx = beam_outputs["next_beam_indices"]

                input_ids[batch_group_indices] = group_input_ids[beam_idx]
                group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
                current_tokens[batch_group_indices] = group_input_ids[:, -1]

                # (beam_idx // group_size) -> batch_idx
                # (beam_idx % group_size) -> offset of idx inside the group
                reordering_indices[batch_group_indices] = (
                    num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") + group_start_idx + (beam_idx % group_size)
                )

            input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)

            # increase cur_len
            cur_len = cur_len + 1
            if beam_scorer.is_done or all(stopping_criteria(input_ids, None)):
                break

        # NOTE(review): finalize receives next_tokens/next_indices left over from the
        # last group of the last step — this matches the HF reference implementation.
        final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
        sequence_outputs = beam_scorer.finalize(
            input_ids,
            beam_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            max_length=stopping_criteria.max_length,
            beam_indices=final_beam_indices,
        )
        return sequence_outputs['sequences']
561
+
562
+
563
def prepare_inputs_for_generation(input_ids, image_inputs, past=None, **kwargs):
    """Assemble per-step decoder inputs (HuggingFace GenerationMixin style).

    Returns a dict with keys 'text', 'images', 'past_key_values',
    'position_ids' and 'attention_mask'.
    """
    if past:
        # With cached past key/values only the newest token must be fed.
        input_ids = input_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)

    # NOTE: a caller-supplied position_ids is intentionally discarded; positions
    # are only derived on the fly from the attention mask when one is present.
    position_ids = None
    if attention_mask is not None and kwargs.get("position_ids", None) is None:
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)

    return {
        "text": input_ids,
        "images": image_inputs,
        "past_key_values": past,
        "position_ids": position_ids,
        "attention_mask": attention_mask,
    }
models/FarSLIP/open_clip/constants.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Per-channel RGB mean/std normalization constants for common pretrained image
# towers (presumably applied to [0, 1]-scaled pixels — confirm against transforms).
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
INCEPTION_MEAN = (0.5, 0.5, 0.5)
INCEPTION_STD = (0.5, 0.5, 0.5)

# Default names for weights/config files hosted on the Huggingface Hub.
HF_WEIGHTS_NAME = "open_clip_pytorch_model.bin"  # default pytorch pkl
HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors"  # safetensors version
HF_CONFIG_NAME = 'open_clip_config.json'
models/FarSLIP/open_clip/convert.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Conversion functions for 3rd part state-dicts and non-torch native checkpoint formats.
2
+ """
3
+ from typing import Union
4
+
5
+ import torch
6
+ import numpy as np
7
+
8
+ from .model import CLIP, CustomTextCLIP
9
+ from .transformer import TextTransformer, Transformer
10
+
11
+
12
@torch.no_grad()
def load_big_vision_weights(model: CustomTextCLIP, checkpoint_path: str):
    """ Load weights from .npz checkpoints for official Google big_vision image-text models

    Currently, the SigLIP source models are supported and a CustomTextCLIP destination model
    w/ timm image encoder.
    """
    from timm.layers import resample_patch_embed, resample_abs_pos_embed

    def _n2p(w, t=True, idx=None):
        # numpy -> torch: optionally select one layer (idx) from a stacked
        # per-layer array, squeeze (1,1,1,N) kernels, and transpose from the
        # big_vision (JAX) layout to the torch layout when t=True.
        if idx is not None:
            w = w[idx]
        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
            w = w.flatten()
        if t:
            if w.ndim == 4:
                w = w.transpose([3, 2, 0, 1])
            elif w.ndim == 3:
                w = w.transpose([2, 0, 1])
            elif w.ndim == 2:
                w = w.transpose([1, 0])
        return torch.from_numpy(w)

    # Lazily-read npz archive of all checkpoint arrays, keyed by big_vision path.
    w = np.load(checkpoint_path)
    interpolation = 'bilinear'
    antialias = False

    def _convert_timm_img(module, prefix):
        # Copy the image tower into a timm VisionTransformer `module`.
        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
        if embed_conv_w.shape[-2:] != module.patch_embed.proj.weight.shape[-2:]:
            # Patch size mismatch: resample the patch embedding kernel to fit.
            embed_conv_w = resample_patch_embed(
                embed_conv_w,
                module.patch_embed.proj.weight.shape[-2:],
                interpolation=interpolation,
                antialias=antialias,
                verbose=True,
            )
        module.patch_embed.proj.weight.copy_(embed_conv_w)
        module.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))

        if module.cls_token is not None:
            module.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))

        pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False)
        if pos_embed_w.shape != module.pos_embed.shape:
            # NOTE(review): the assert makes the resample path below unreachable —
            # position embedding size mismatches currently hard-fail.
            assert False, f'{pos_embed_w.shape}, {module.pos_embed.shape}'
            num_prefix_tokens = 0 if getattr(module, 'no_embed_class', False) else getattr(module, 'num_prefix_tokens', 1)
            pos_embed_w = resample_abs_pos_embed(  # resize pos embedding when different size from pretrained weights
                pos_embed_w,
                new_size=module.patch_embed.grid_size,
                num_prefix_tokens=num_prefix_tokens,
                interpolation=interpolation,
                antialias=antialias,
                verbose=True,
            )
        module.pos_embed.copy_(pos_embed_w)

        mha_sub, b_sub, ln1_sub = (0, 0, 1)
        for i, block in enumerate(module.blocks.children()):
            if f'{prefix}Transformer/encoderblock/LayerNorm_0/scale' in w:
                # Newer checkpoints stack all blocks into single arrays; index by layer.
                block_prefix = f'{prefix}Transformer/encoderblock/'
                idx = i
            else:
                block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
                idx = None
            mha_prefix = block_prefix + f'MultiHeadDotProductAttention_{mha_sub}/'
            block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'], idx=idx))
            block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'], idx=idx))
            # Fuse separate q/k/v projections into timm's single qkv projection.
            block.attn.qkv.weight.copy_(torch.cat([
                _n2p(w[f'{mha_prefix}{n}/kernel'], t=False, idx=idx).flatten(1).T for n in ('query', 'key', 'value')]))
            block.attn.qkv.bias.copy_(torch.cat([
                _n2p(w[f'{mha_prefix}{n}/bias'], t=False, idx=idx).reshape(-1) for n in ('query', 'key', 'value')]))
            block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel'], idx=idx).flatten(1))
            block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'], idx=idx))
            block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/scale'], idx=idx))
            block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/bias'], idx=idx))
            for r in range(2):
                getattr(block.mlp, f'fc{r + 1}').weight.copy_(
                    _n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/kernel'], idx=idx))
                getattr(block.mlp, f'fc{r + 1}').bias.copy_(
                    _n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/bias'], idx=idx))

        module.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
        module.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))

        if module.attn_pool is not None:
            # Attention-pooling (MAP) head.
            block_prefix = f'{prefix}MAPHead_0/'
            mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/'
            module.attn_pool.latent.copy_(_n2p(w[f'{block_prefix}probe'], t=False))
            module.attn_pool.q.weight.copy_(_n2p(w[f'{mha_prefix}query/kernel'], t=False).flatten(1).T)
            module.attn_pool.q.bias.copy_(_n2p(w[f'{mha_prefix}query/bias'], t=False).reshape(-1))
            module.attn_pool.kv.weight.copy_(torch.cat([
                _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('key', 'value')]))
            module.attn_pool.kv.bias.copy_(torch.cat([
                _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('key', 'value')]))
            module.attn_pool.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
            module.attn_pool.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
            module.attn_pool.norm.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
            module.attn_pool.norm.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
            for r in range(2):
                getattr(module.attn_pool.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_{r}/kernel']))
                getattr(module.attn_pool.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_{r}/bias']))

    def _convert_openclip_transformer(module: Transformer, prefix):
        # Copy text-tower transformer blocks into an open_clip Transformer.
        for i, block in enumerate(module.resblocks.children()):
            if f'{prefix}encoderblock/LayerNorm_0/scale' in w:
                block_prefix = f'{prefix}encoderblock/'
                idx = i
            else:
                block_prefix = f'{prefix}encoderblock_{i}/'
                idx = None
            mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/'
            block.ln_1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'], idx=idx))
            block.ln_1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'], idx=idx))
            block.attn.in_proj_weight.copy_(torch.cat([
                _n2p(w[f'{mha_prefix}{n}/kernel'], t=False, idx=idx).flatten(1).T for n in ('query', 'key', 'value')]))
            block.attn.in_proj_bias.copy_(torch.cat([
                _n2p(w[f'{mha_prefix}{n}/bias'], t=False, idx=idx).reshape(-1) for n in ('query', 'key', 'value')]))
            block.attn.out_proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel'], idx=idx).flatten(1))
            block.attn.out_proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'], idx=idx))
            block.ln_2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/scale'], idx=idx))
            block.ln_2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/bias'], idx=idx))
            block.mlp.c_fc.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/kernel'], idx=idx))
            block.mlp.c_fc.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/bias'], idx=idx))
            block.mlp.c_proj.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/kernel'], idx=idx))
            block.mlp.c_proj.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/bias'], idx=idx))

    def _convert_openclip_txt(module: TextTransformer, prefix):
        # Copy the full text tower: embeddings, blocks, final norm, projection.
        module.token_embedding.weight.copy_(_n2p(w[f'{prefix}Embed_0/embedding'], t=False))
        pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False).squeeze(0)
        module.positional_embedding.copy_(pos_embed_w)
        _convert_openclip_transformer(module.transformer, prefix=prefix + 'Encoder_0/')
        module.ln_final.weight.copy_(_n2p(w[f'{prefix}Encoder_0/encoder_norm/scale']))
        module.ln_final.bias.copy_(_n2p(w[f'{prefix}Encoder_0/encoder_norm/bias']))
        if module.text_projection is not None:
            module.text_projection.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
            module.text_projection.bias.copy_(_n2p(w[f'{prefix}head/bias']))

    # Some checkpoints nest everything under a 'params/' root.
    root_prefix = 'params/' if 'params/b' in w else ''
    _convert_timm_img(model.visual.trunk, f'{root_prefix}img/')
    _convert_openclip_txt(model.text, f'{root_prefix}txt/')
    model.logit_bias.copy_(_n2p(w[f'{root_prefix}b'])[0])
    model.logit_scale.copy_(_n2p(w[f'{root_prefix}t'])[0])
+
156
+
157
@torch.no_grad()
def convert_mobile_clip_state_dict(model: CustomTextCLIP, state_dict, fastvit = True):
    """Convert an Apple MobileCLIP checkpoint to open_clip naming.

    The image tower is remapped through timm's checkpoint filter (FastViT by
    default, hybrid ViT when fastvit=False); the text tower keys are renamed
    to open_clip's TextTransformer layout. Returns a new state dict.
    """

    def _convert_timm_img(state_dict):
        # Route image-tower keys through the matching timm filter, then prefix
        # them for open_clip's `visual.trunk` wrapper.
        if fastvit:
            from timm.models.fastvit import checkpoint_filter_fn
        else:
            from timm.models.vision_transformer_hybrid import checkpoint_filter_fn
        timm_state_dict = checkpoint_filter_fn(state_dict, model.visual.trunk)
        timm_state_dict = {'visual.trunk.' + k: v for k, v in timm_state_dict.items()}
        return timm_state_dict

    def _convert_openclip_txt(state_dict, prefix='text_encoder.'):
        # Rename MobileCLIP text-encoder keys to open_clip TextTransformer keys.
        text_dict = {}
        for k, v in state_dict.items():
            if not k.startswith(prefix):
                continue
            k = k.replace(prefix, '')
            k = k.replace('projection_layer', 'text_projection')
            k = k.replace('embedding_layer', 'token_embedding')
            if k.startswith('positional_embedding.pos_embed.pos_embed'):
                k = k.replace('positional_embedding.pos_embed.pos_embed', 'positional_embedding')
                # Drop the singleton batch dim stored by the source model.
                v = v.squeeze()
            k = k.replace('final_layer_norm', 'ln_final')
            k = k.replace('pre_norm_mha.0', 'ln_1')
            k = k.replace('pre_norm_mha.1', 'attn')
            k = k.replace('pre_norm_ffn.0', 'ln_2')
            k = k.replace('pre_norm_ffn.1', 'mlp.c_fc')
            k = k.replace('pre_norm_ffn.4', 'mlp.c_proj')
            k = k.replace('qkv_proj.weight', 'in_proj_weight')
            k = k.replace('qkv_proj.bias', 'in_proj_bias')
            k = k.replace('transformer.', 'transformer.resblocks.')
            text_dict['text.' + k] = v
        return text_dict

    image_dict = _convert_timm_img(state_dict)
    text_dict = _convert_openclip_txt(state_dict)
    out_dict = {**image_dict, **text_dict}
    out_dict['logit_scale'] = state_dict['logit_scale']
    return out_dict
+
198
+
199
def convert_state_dict(model: Union[CustomTextCLIP, CLIP], state_dict):
    """Dispatch third-party checkpoint conversion based on sentinel keys.

    State dicts that don't match any known foreign format pass through unchanged.
    """
    # Apple MobileCLIP s1 & s2 state_dicts (s0 and b not currently supported)
    fastvit_key = 'image_encoder.model.patch_embed.0.rbr_conv.0.conv.weight'
    # MobileCLIP 'b' variant (non-FastViT image tower)
    hybrid_key = 'image_encoder.model.patch_emb.0.block.conv.weight'
    if fastvit_key in state_dict:
        state_dict = convert_mobile_clip_state_dict(model, state_dict)
    if hybrid_key in state_dict:
        state_dict = convert_mobile_clip_state_dict(model, state_dict, fastvit=False)
    return state_dict
models/FarSLIP/open_clip/factory.py ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ import warnings
6
+ from copy import deepcopy
7
+ from dataclasses import asdict
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional, Tuple, Union
10
+
11
+ import torch
12
+
13
+ from .convert import convert_state_dict
14
+ from .model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
15
+ resize_pos_embed, get_cast_dtype, resize_text_pos_embed, set_model_preprocess_cfg
16
+ from .coca_model import CoCa
17
+ from .loss import ClipLoss, DistillClipLoss, CoCaLoss, SigLipLoss, MultiPosConLossMM
18
+ from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained,\
19
+ list_pretrained_tags_by_model, download_pretrained_from_hf
20
+ from .transform import image_transform_v2, AugmentationCfg, PreprocessCfg, merge_preprocess_dict, merge_preprocess_kwargs
21
+ from .tokenizer import HFTokenizer, SimpleTokenizer, SigLipTokenizer, DEFAULT_CONTEXT_LENGTH
22
+
23
HF_HUB_PREFIX = 'hf-hub:'  # model_name prefix that selects HuggingFace Hub loading
# Search paths for model architecture config files. Plain string (the original
# used a pointless f-string) and no trailing slash (pathlib drops it anyway).
_MODEL_CONFIG_PATHS = [Path(__file__).parent / "model_configs"]
_MODEL_CONFIGS = {}  # dictionary of (model_name: config) model architecture configs
26
+
27
+
28
+ def _natural_key(string_):
29
+ return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
30
+
31
+
32
def _rescan_model_configs():
    """Rebuild the global model-config registry from _MODEL_CONFIG_PATHS.

    Each registered path may be a single .json file or a directory of them.
    Configs defining the required top-level keys are kept, and the registry is
    re-sorted in natural (human) order by model name.
    """
    global _MODEL_CONFIGS

    config_ext = ('.json',)
    candidates = []
    for path in _MODEL_CONFIG_PATHS:
        if path.is_file() and path.suffix in config_ext:
            candidates.append(path)
        elif path.is_dir():
            for ext in config_ext:
                candidates.extend(path.glob(f'*{ext}'))

    required_keys = ('embed_dim', 'vision_cfg', 'text_cfg')
    for candidate in candidates:
        with open(candidate, 'r') as fh:
            cfg = json.load(fh)
        if all(key in cfg for key in required_keys):
            _MODEL_CONFIGS[candidate.stem] = cfg

    _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda item: _natural_key(item[0])))


_rescan_model_configs()  # initial populate of model config registry
54
+
55
+
56
def list_models():
    """Return the names of all registered model architectures."""
    return list(_MODEL_CONFIGS)
59
+
60
+
61
def add_model_config(path):
    """Register a model config file or directory and rescan the registry."""
    entry = path if isinstance(path, Path) else Path(path)
    _MODEL_CONFIG_PATHS.append(entry)
    _rescan_model_configs()
67
+
68
+
69
def get_model_config(model_name):
    """Return a deep copy of a builtin model config, or None when unknown."""
    if model_name not in _MODEL_CONFIGS:
        return None
    return deepcopy(_MODEL_CONFIGS[model_name])
76
+
77
+
78
def _get_hf_config(
        model_id: str,
        cache_dir: Optional[str] = None,
):
    """Fetch and parse a model's open_clip_config.json from the HuggingFace Hub."""
    config_path = download_pretrained_from_hf(
        model_id,
        filename='open_clip_config.json',
        cache_dir=cache_dir,
    )
    with open(config_path, 'r', encoding='utf-8') as cfg_file:
        return json.load(cfg_file)
92
+
93
+
94
def get_tokenizer(
        model_name: str = '',
        context_length: Optional[int] = None,
        cache_dir: Optional[str] = None,
        **kwargs,
):
    """Build the tokenizer matching a model's config.

    For 'hf-hub:' names the hub config is consulted (falling back to a bare
    HFTokenizer if it can't be fetched); otherwise the builtin config registry
    is used. The resolved text_cfg then selects an HFTokenizer, a
    SigLipTokenizer (models with 'siglip' in the name) or SimpleTokenizer.
    """
    if model_name.startswith(HF_HUB_PREFIX):
        model_name = model_name[len(HF_HUB_PREFIX):]
        try:
            config = _get_hf_config(model_name, cache_dir=cache_dir)['model_cfg']
        except Exception:
            # No usable open_clip config on the hub; assume the repo itself is
            # a HF tokenizer and use library defaults.
            tokenizer = HFTokenizer(
                model_name,
                context_length=context_length or DEFAULT_CONTEXT_LENGTH,
                cache_dir=cache_dir,
                **kwargs,
            )
            return tokenizer
    else:
        config = get_model_config(model_name)
        assert config is not None, f"No valid model config found for {model_name}."

    text_config = config.get('text_cfg', {})
    # Explicit kwargs override tokenizer_kwargs baked into the config.
    if 'tokenizer_kwargs' in text_config:
        tokenizer_kwargs = dict(text_config['tokenizer_kwargs'], **kwargs)
    else:
        tokenizer_kwargs = kwargs

    if context_length is None:
        context_length = text_config.get('context_length', DEFAULT_CONTEXT_LENGTH)

    # Lowercased only for the 'siglip' substring checks below.
    model_name = model_name.lower()
    if text_config.get('hf_tokenizer_name', ''):
        tokenizer = HFTokenizer(
            text_config['hf_tokenizer_name'],
            context_length=context_length,
            cache_dir=cache_dir,
            **tokenizer_kwargs,
        )
    elif 'siglip' in model_name:
        # Pick the sentencepiece vocab variant by model family.
        tn = 'gemma' if 'siglip2' in model_name else 'mc4' if 'i18n' in model_name else 'c4-en'
        tokenizer = SigLipTokenizer(
            tn,
            context_length=context_length,
            # **tokenizer_kwargs,
        )
    else:
        tokenizer = SimpleTokenizer(
            context_length=context_length,
            **tokenizer_kwargs,
        )

    return tokenizer
147
+
148
+
149
def load_state_dict(
    checkpoint_path: str,
    device='cpu',
    weights_only=True,
):
    """Load a checkpoint file and return its (possibly unwrapped) state dict.

    Supports ``.safetensors`` files, regular torch pickles (optionally with a
    nested ``'state_dict'`` entry), and TorchScript archives. A DDP-style
    ``'module.'`` key prefix is stripped when present.

    Args:
        checkpoint_path: Path to the checkpoint file.
        device: Map location passed to the loader (default ``'cpu'``).
        weights_only: Forwarded to ``torch.load`` for safe unpickling; falls
            back to a plain load if the installed torch rejects it.

    Returns:
        A state dict mapping parameter names to tensors.
    """
    # Check if safetensors or not and load weights accordingly
    if str(checkpoint_path).endswith(".safetensors"):
        from safetensors.torch import load_file
        checkpoint = load_file(checkpoint_path, device=device)
    else:
        try:
            checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=weights_only)
        except Exception:
            # older torch without weights_only support, or non-weights pickle content
            checkpoint = torch.load(checkpoint_path, map_location=device)

    if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    elif isinstance(checkpoint, torch.jit.ScriptModule):
        state_dict = checkpoint.state_dict()
        for key in ["input_resolution", "context_length", "vocab_size"]:
            state_dict.pop(key, None)
    else:
        state_dict = checkpoint
    # FIX: guard the empty case (next(iter(...)) raised StopIteration on an empty
    # state dict) and strip only the exact DistributedDataParallel 'module.' prefix
    # ('module' alone mangled unrelated keys such as 'modulex.weight').
    if isinstance(state_dict, dict) and state_dict and next(iter(state_dict)).startswith('module.'):
        state_dict = {k[len('module.'):]: v for k, v in state_dict.items()}
    return state_dict
175
+
176
+
177
def load_checkpoint(
    model: Union[CLIP, CustomTextCLIP],
    checkpoint_path: str,
    strict: bool = True,
    weights_only: bool = True,
    device='cpu',
):
    """Load pretrained weights from ``checkpoint_path`` into ``model``.

    Converts third-party / legacy state dicts to the current layout, fixes up
    ``logit_scale`` / ``logit_bias`` shape mismatches, resizes positional
    embeddings, and finally calls ``model.load_state_dict``.

    Args:
        model: Target model instance the weights are loaded into.
        checkpoint_path: Path to the checkpoint (torch pickle, safetensors,
            or numpy ``.npz``/``.npy`` big_vision weights).
        strict: Forwarded to ``model.load_state_dict``.
        weights_only: Forwarded to ``load_state_dict`` for safe unpickling.
        device: Map location used while reading the checkpoint.

    Returns:
        The incompatible-keys result of ``model.load_state_dict`` (an empty
        dict for the numpy big_vision path, which loads in place).
    """
    if Path(checkpoint_path).suffix in ('.npz', '.npy'):
        # Separate path loading numpy big_vision (SigLIP) weights
        from open_clip.convert import load_big_vision_weights
        load_big_vision_weights(model, checkpoint_path)
        return {}

    state_dict = load_state_dict(checkpoint_path, device=device, weights_only=weights_only)

    # Detect & convert 3rd party state_dicts -> open_clip
    state_dict = convert_state_dict(model, state_dict)

    # Detect old format and make compatible with new format
    if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
        state_dict = convert_to_custom_text_state_dict(state_dict)

    # correct if logit_scale differs in being scaler vs 1d param
    # NOTE(review): assumes model always defines logit_scale / logit_bias attrs —
    # verify for model classes that omit them (would raise AttributeError here).
    if 'logit_scale' in state_dict and model.logit_scale.ndim != state_dict['logit_scale'].ndim:
        state_dict['logit_scale'] = state_dict['logit_scale'].reshape(model.logit_scale.shape)

    # correct if logit_bias differs in being scaler vs 1d param
    if 'logit_bias' in state_dict and model.logit_bias.ndim != state_dict['logit_bias'].ndim:
        state_dict['logit_bias'] = state_dict['logit_bias'].reshape(model.logit_bias.shape)

    # If loading a non-SigLIP model for SigLIP training. See https://github.com/mlfoundations/open_clip/issues/712
    if 'logit_bias' not in state_dict and model.logit_bias is not None:
        state_dict["logit_bias"] = torch.zeros_like(state_dict["logit_scale"])

    # Certain text transformers no longer expect position_ids after transformers==4.31
    position_id_key = 'text.transformer.embeddings.position_ids'
    if position_id_key in state_dict and not hasattr(model, position_id_key):
        del state_dict[position_id_key]

    # Interpolate visual/text positional embeddings to the model's current sizes.
    resize_pos_embed(state_dict, model)
    resize_text_pos_embed(state_dict, model)

    # Finally, load the massaged state_dict into model
    incompatible_keys = model.load_state_dict(state_dict, strict=strict)
    if incompatible_keys.missing_keys:
        print("Missing keys:", incompatible_keys.missing_keys)
    if incompatible_keys.unexpected_keys:
        print("Unexpected keys:", incompatible_keys.unexpected_keys)

    logging.info(f"Missing keys: {incompatible_keys.missing_keys}")
    return incompatible_keys
228
+
229
+
230
def create_model(
    model_name: str,
    pretrained: Optional[str] = None,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_patch_dropout: Optional[float] = None,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    force_preprocess_cfg: Optional[Dict[str, Any]] = None,
    pretrained_image: bool = False,
    pretrained_hf: bool = True,
    cache_dir: Optional[str] = None,
    output_dict: Optional[bool] = None,
    require_pretrained: bool = False,
    load_weights_only: bool = True,
    long_clip: Optional[str] = 'disable',
    **model_kwargs,
):
    """Creates and configures a contrastive vision-language model.

    Args:
        model_name: Name of the model architecture to create. Can be a local model name
            or a Hugging Face model ID prefixed with 'hf-hub:'.
        pretrained: Tag/path for pretrained model weights. Can be:
            - A pretrained tag name (e.g., 'openai')
            - A path to local weights
            - None to initialize with random weights
        precision: Model precision/AMP configuration. Options:
            - 'fp32': 32-bit floating point
            - 'fp16'/'bf16': Mixed precision with FP32 for certain layers
            - 'pure_fp16'/'pure_bf16': Pure 16-bit precision
        device: Device to load the model on ('cpu', 'cuda', or torch.device object)
        jit: If True, JIT compile the model
        force_quick_gelu: Force use of QuickGELU activation
        force_custom_text: Force use of custom text encoder
        force_patch_dropout: Override default patch dropout value
        force_image_size: Override default image size for vision encoder
        force_preprocess_cfg: Override default preprocessing configuration
        pretrained_image: Load pretrained weights for timm vision models
        pretrained_hf: Load pretrained weights for HF text models when not loading CLIP weights
        cache_dir: Override default cache directory for downloaded model files
        output_dict: If True and model supports it, return dictionary of features
        require_pretrained: Raise error if pretrained weights cannot be loaded
        load_weights_only: Only deserialize model weights and unpickling torch checkpoints (for safety)
        long_clip: Forwarded into the model config under the 'long_clip' key
            (default 'disable').
        **model_kwargs: Additional keyword arguments passed to model constructor

    Returns:
        Created and configured model instance

    Raises:
        RuntimeError: If model config is not found or required pretrained weights
        cannot be loaded

    Examples:
        # Create basic CLIP model
        model = create_model('ViT-B/32')

        # Create CLIP model with mixed precision on GPU
        model = create_model('ViT-B/32', precision='fp16', device='cuda')

        # Load pretrained OpenAI weights
        model = create_model('ViT-B/32', pretrained='openai')

        # Load Hugging Face model
        model = create_model('hf-hub:organization/model-name')
    """

    force_preprocess_cfg = force_preprocess_cfg or {}
    preprocess_cfg = asdict(PreprocessCfg())
    has_hf_hub_prefix = model_name.startswith(HF_HUB_PREFIX)
    if has_hf_hub_prefix:
        # Hub path: both the checkpoint and the model/preprocess config come from HF.
        model_id = model_name[len(HF_HUB_PREFIX):]
        checkpoint_path = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
        config = _get_hf_config(model_id, cache_dir=cache_dir)
        preprocess_cfg = merge_preprocess_dict(preprocess_cfg, config['preprocess_cfg'])
        model_cfg = config['model_cfg']
        pretrained_hf = False  # override, no need to load original HF text weights
    else:
        model_name = model_name.replace('/', '-')  # for callers using old naming with / in ViT names
        checkpoint_path = None
        model_cfg = None

    if isinstance(device, str):
        device = torch.device(device)

    model_cfg = model_cfg or get_model_config(model_name)
    if model_cfg is not None:
        logging.info(f'Loaded {model_name} model config.')
    else:
        logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
        raise RuntimeError(f'Model config for {model_name} not found.')

    if force_quick_gelu:
        # override for use of QuickGELU on non-OpenAI transformer models
        model_cfg["quick_gelu"] = True

    if force_patch_dropout is not None:
        # override the default patch dropout value
        model_cfg["vision_cfg"]["patch_dropout"] = force_patch_dropout

    if force_image_size is not None:
        # override model config's image size
        model_cfg["vision_cfg"]["image_size"] = force_image_size

    is_timm_model = 'timm_model_name' in model_cfg.get('vision_cfg', {})
    if pretrained_image:
        if is_timm_model:
            # pretrained weight loading for timm models set via vision_cfg
            model_cfg['vision_cfg']['timm_model_pretrained'] = True
        else:
            assert False, 'pretrained image towers currently only supported for timm models'

    # cast_dtype set for fp16 and bf16 (manual mixed-precision), not set for 'amp' or 'pure' modes
    cast_dtype = get_cast_dtype(precision)
    is_hf_model = 'hf_model_name' in model_cfg.get('text_cfg', {})
    if is_hf_model:
        # load pretrained weights for HF text model IFF no CLIP weights being loaded
        model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf and not pretrained
    custom_text = model_cfg.pop('custom_text', False) or force_custom_text or is_hf_model

    model_cfg.update({"long_clip": long_clip})
    model_cfg = dict(model_cfg, **model_kwargs)  # merge cfg dict w/ kwargs (kwargs overrides cfg)
    if custom_text:
        # CoCa when a multimodal decoder cfg is present, otherwise custom-text CLIP.
        if "multimodal_cfg" in model_cfg:
            model = CoCa(**model_cfg, cast_dtype=cast_dtype)
        else:
            model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
    else:
        model = CLIP(**model_cfg, cast_dtype=cast_dtype)

    if precision in ("fp16", "bf16"):
        dtype = torch.float16 if 'fp16' in precision else torch.bfloat16
        # manual mixed precision that matches original OpenAI behaviour
        if is_timm_model:
            # FIXME this is a bit janky, create timm based model in low-precision and
            # then cast only LayerNormFp32 instances back to float32 so they don't break.
            # Why? The convert_weights_to_lp fn only works with native models.
            model.to(device=device, dtype=dtype)
            from .transformer import LayerNormFp32

            def _convert_ln(m):
                if isinstance(m, LayerNormFp32):
                    m.weight.data = m.weight.data.to(torch.float32)
                    m.bias.data = m.bias.data.to(torch.float32)
            model.apply(_convert_ln)
        else:
            model.to(device=device)
            convert_weights_to_lp(model, dtype=dtype)
    elif precision in ("pure_fp16", "pure_bf16"):
        dtype = torch.float16 if 'fp16' in precision else torch.bfloat16
        model.to(device=device, dtype=dtype)
    else:
        model.to(device=device)

    pretrained_loaded = False
    if pretrained:
        # Resolve 'pretrained' first as a known tag, then as a local file path.
        checkpoint_path = ''
        pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
        if pretrained_cfg:
            checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
            preprocess_cfg = merge_preprocess_dict(preprocess_cfg, pretrained_cfg)
            pretrained_quick_gelu = pretrained_cfg.get('quick_gelu', False)
            model_quick_gelu = model_cfg.get('quick_gelu', False)
            if pretrained_quick_gelu and not model_quick_gelu:
                warnings.warn(
                    f'These pretrained weights were trained with QuickGELU activation but the model config does '
                    f'not have that enabled. Consider using a model config with a "-quickgelu" suffix or enable with a flag.')
            elif not pretrained_quick_gelu and model_quick_gelu:
                warnings.warn(
                    f'The pretrained weights were not trained with QuickGELU but this activation is enabled in the '
                    f'model config, consider using a model config without QuickGELU or disable override flags.')
        elif os.path.exists(pretrained):
            checkpoint_path = pretrained

        if checkpoint_path:
            logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
            load_checkpoint(model, checkpoint_path, weights_only=load_weights_only, strict=False)
        else:
            error_str = (
                f'Pretrained weights ({pretrained}) not found for model {model_name}.'
                f' Available pretrained tags ({list_pretrained_tags_by_model(model_name)}.')
            logging.warning(error_str)
            raise RuntimeError(error_str)
        pretrained_loaded = True
    elif has_hf_hub_prefix:
        logging.info(f'Loading pretrained {model_name} weights ({checkpoint_path}).')
        load_checkpoint(model, checkpoint_path, weights_only=load_weights_only)
        pretrained_loaded = True

    if require_pretrained and not pretrained_loaded:
        # callers of create_model_from_pretrained always expect pretrained weights
        raise RuntimeError(
            f'Pretrained weights were required for (model: {model_name}, pretrained: {pretrained}) but not loaded.')

    if output_dict and hasattr(model, "output_dict"):
        model.output_dict = True

    if jit:
        model = torch.jit.script(model)

    # set image preprocessing configuration in model attributes for convenience
    if getattr(model.visual, 'image_size', None) is not None:
        # use image_size set on model creation (via config or force_image_size arg)
        force_preprocess_cfg['size'] = model.visual.image_size
    set_model_preprocess_cfg(model, merge_preprocess_dict(preprocess_cfg, force_preprocess_cfg))

    return model
439
+
440
+
441
def create_loss(args):
    """Pick the training loss implementation implied by the run arguments.

    Precedence: distillation > CoCa (detected via model name) > SigLIP > CLIP.
    """
    def _contrastive_kwargs():
        # Shared by Clip/DistillClip/CoCa losses; built lazily so each branch
        # only touches the args attributes it actually needs.
        return dict(
            local_loss=args.local_loss,
            gather_with_grad=args.gather_with_grad,
            cache_labels=True,
            rank=args.rank,
            world_size=args.world_size,
            use_horovod=args.horovod,
        )

    if args.distill:
        return DistillClipLoss(**_contrastive_kwargs())

    if "coca" in args.model.lower():
        return CoCaLoss(
            caption_loss_weight=args.coca_caption_loss_weight,
            clip_loss_weight=args.coca_contrastive_loss_weight,
            **_contrastive_kwargs(),
        )

    if args.siglip:
        assert not args.horovod, "Horovod not currently supported for SigLip"
        return SigLipLoss(
            rank=args.rank,
            world_size=args.world_size,
            dist_impl=args.loss_dist_impl,  # siglip has multiple distributed implementations to choose from
        )

    return ClipLoss(**_contrastive_kwargs())
484
+
485
+
486
def create_model_and_transforms(
    model_name: str,
    pretrained: Optional[str] = None,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_patch_dropout: Optional[float] = None,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    image_mean: Optional[Tuple[float, ...]] = None,
    image_std: Optional[Tuple[float, ...]] = None,
    image_interpolation: Optional[str] = None,
    image_resize_mode: Optional[str] = None,  # only effective for inference
    aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
    pretrained_image: bool = False,
    pretrained_hf: bool = True,
    cache_dir: Optional[str] = None,
    output_dict: Optional[bool] = None,
    load_weights_only: bool = True,
    long_clip: Optional[str] = 'disable',

    use_imagecrop_aug: Optional[bool] = False,
    max_boxes: Optional[int] = 10,
    local_method: str = 'grids',
    **model_kwargs,
):
    """Create a model together with its train and val image transforms."""
    # Collect caller-supplied preprocessing overrides (mean/std/interp/resize).
    preprocess_overrides = merge_preprocess_kwargs(
        {},
        mean=image_mean,
        std=image_std,
        interpolation=image_interpolation,
        resize_mode=image_resize_mode,
    )

    model = create_model(
        model_name,
        pretrained,
        precision=precision,
        device=device,
        jit=jit,
        force_quick_gelu=force_quick_gelu,
        force_custom_text=force_custom_text,
        force_patch_dropout=force_patch_dropout,
        force_image_size=force_image_size,
        force_preprocess_cfg=preprocess_overrides,
        pretrained_image=pretrained_image,
        pretrained_hf=pretrained_hf,
        cache_dir=cache_dir,
        output_dict=output_dict,
        load_weights_only=load_weights_only,
        long_clip=long_clip,
        **model_kwargs,
    )

    # The model carries the final merged preprocess config after creation.
    pp_cfg = PreprocessCfg(**model.visual.preprocess_cfg)

    # Train transform may apply box/crop augmentation; val transform is deterministic.
    preprocess_train = image_transform_v2(
        pp_cfg,
        is_train=True,
        use_imagecrop_aug=use_imagecrop_aug,
        max_boxes=max_boxes,
        local_method=local_method,
        aug_cfg=aug_cfg,
    )
    preprocess_val = image_transform_v2(pp_cfg, is_train=False)

    return model, preprocess_train, preprocess_val
558
+
559
+
560
def create_model_from_pretrained(
    model_name: str,
    pretrained: Optional[str] = None,
    precision: str = 'fp32',
    device: Union[str, torch.device] = 'cpu',
    jit: bool = False,
    force_quick_gelu: bool = False,
    force_custom_text: bool = False,
    force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
    image_mean: Optional[Tuple[float, ...]] = None,
    image_std: Optional[Tuple[float, ...]] = None,
    image_interpolation: Optional[str] = None,
    image_resize_mode: Optional[str] = None,  # only effective for inference
    return_transform: bool = True,
    cache_dir: Optional[str] = None,
    load_weights_only: bool = True,
    **model_kwargs,
):
    """Create a model that must load pretrained weights, plus (optionally) its inference transform."""
    preprocess_overrides = merge_preprocess_kwargs(
        {},
        mean=image_mean,
        std=image_std,
        interpolation=image_interpolation,
        resize_mode=image_resize_mode,
    )

    # require_pretrained=True: creation fails unless weights are actually loaded.
    model = create_model(
        model_name,
        pretrained,
        precision=precision,
        device=device,
        jit=jit,
        force_quick_gelu=force_quick_gelu,
        force_custom_text=force_custom_text,
        force_image_size=force_image_size,
        force_preprocess_cfg=preprocess_overrides,
        cache_dir=cache_dir,
        require_pretrained=True,
        load_weights_only=load_weights_only,
        **model_kwargs,
    )

    if not return_transform:
        return model

    inference_transform = image_transform_v2(
        PreprocessCfg(**model.visual.preprocess_cfg),
        is_train=False,
    )

    return model, inference_transform