remisek committed on
Commit
4c0fb51
·
1 Parent(s): f5d4753

Install dependencies

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .venv/
2
+ .idea/
3
+ __pycache__/
4
+ *.pyc
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Python 3.12 slim base image with the uv package manager copied in from
# the official astral-sh distroless image.
FROM python:3.12-slim

COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# FIX: curl is required by the HEALTHCHECK below but is not included in
# -slim images; install it explicitly.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Compile bytecode at install time and copy (rather than hardlink) packages
# out of the uv cache, as recommended for Docker builds.
ENV UV_COMPILE_BYTECODE=1
ENV UV_LINK_MODE=copy

# Install dependencies first so this layer is cached independently of the
# application code.
COPY pyproject.toml uv.lock ./
RUN uv sync --frozen --no-cache --no-install-project

COPY . .

# FIX: `uv sync` installs into /app/.venv, which is not on PATH by default;
# without this the ENTRYPOINT executable would not be found.
ENV PATH="/app/.venv/bin:$PATH"

EXPOSE 8501
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

# FIX: typo "steamlit" -> "streamlit".
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -9,3 +9,6 @@ short_description: Application for Automatic Crop Type Mapping
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
+
13
+ # License
14
+ This project is licensed under the terms of the MIT License. See the LICENSE file for details.
app.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import folium
3
+ from folium import plugins
4
+ from streamlit_folium import st_folium
5
+ import rasterio
6
+ from rasterio.warp import calculate_default_transform, reproject, Resampling
7
+ import joblib
8
+ import numpy as np
9
+ import pandas as pd
10
+ import geopandas as gpd
11
+ from pathlib import Path
12
+ from matplotlib import colors as colors
13
+ import time
14
+ from rasterio.crs import CRS
15
+
16
+ from worldcereal.job import INFERENCE_JOB_OPTIONS, create_embeddings_process_graph
17
+ from openeo_gfmap import TemporalContext, BoundingBoxExtent
18
+ from worldcereal.parameters import EmbeddingsParameters
19
+
20
+
21
# Display colour (hex) for every supported crop class. Keys are Polish crop
# names; "inne" means "other".
# NOTE: insertion order matters — the integer class ids below are derived
# from it, so do not reorder entries.
crop_classes = {
    "tuz": "#32cd32",
    "burak": "#8b008b",
    "jęczmień": "#ffd700",
    "kukurydza": "#ffa500",
    "lucerna": "#9acd32",
    "mieszanka": "#daa520",
    "owies": "#f0e68c",
    "pszenica": "#f5deb3",
    "pszenżyto": "#bdb76b",
    "rzepak": "#ffff00",
    "sad": "#228b22",
    "słonecznik": "#ff4500",
    "ziemniak": "#a0522d",
    "łubin": "#9370db",
    "żyto": "#cd853f",
    "inne": "#808080",
}

# Stable integer id per class name (0..len-1) and the inverse lookup table.
class_to_id = {name: i for i, name in enumerate(crop_classes)}
id_to_class = dict(enumerate(crop_classes))
41
+
42
# Streamlit page setup — must run before any other st.* call in the script.
st.set_page_config(page_title="Crop Map", layout="wide")

# Pre-trained random-forest classifier and the demo embeddings bundled with
# the app; temp_dir receives embeddings generated on demand in tab 2.
model_path = Path("app/crop_map_app/models/random_forest_crop_classifier_06.joblib")
demo_dir = Path("app/crop_map_app/embeddings/demo")
temp_dir = Path("embeddings/temp_analysis")  # for new files
temp_dir.mkdir(parents=True, exist_ok=True)
48
+
49
def get_class_color_rgba(class_name, alpha=180):
    """Return the (R, G, B, A) tuple (0-255 channels) for a crop class.

    Unknown class names fall back to black ("#000000").
    """
    r, g, b = colors.hex2color(crop_classes.get(class_name, "#000000"))
    return (int(r * 255), int(g * 255), int(b * 255), alpha)
53
+
54
+
55
def create_legend_html(stats_legend):
    """Build a compact HTML legend from the per-crop statistics DataFrame.

    Expects columns 'Crop' and 'Percentage'; swatch colours are looked up in
    the module-level crop_classes mapping (black for unknown crops).
    """
    rows = []
    for _, entry in stats_legend.iterrows():
        name = entry['Crop']
        swatch = crop_classes.get(name, "#000000")
        share = entry['Percentage']
        rows.append(
            f"<div style='display: flex; align-items: center; margin-bottom: 4px;'>"
            f"<div style='width: 15px; height: 15px; background-color: {swatch}; margin-right: 10px; border-radius: 3px;'></div>"
            f"<span style='font-size: 14px; flex-grow: 1;'>{name}</span>"
            f"<span style='font-weight: bold; font-size: 14px;'>{share:.1f}%</span>"
            f"</div>"
        )

    opening = "<div style='background-color: rgba(255, 255, 255, 0.1); padding: 10px; border-radius: 5px; font-family: sans-serif;'>"
    return opening + "".join(rows) + "</div>"
76
+
77
@st.cache_resource
def load_model():
    """Load the joblib classifier once per session.

    Returns None when the model file does not exist on disk.
    """
    return joblib.load(model_path) if model_path.exists() else None
81
+
82
@st.cache_data
def run_prediction(tif_path, _model):
    """Classify every pixel of an embedding GeoTIFF and reproject to WGS84.

    Parameters
    ----------
    tif_path :
        Path to a multi-band embedding raster.
    _model :
        Fitted scikit-learn classifier (the leading underscore keeps it out
        of the st.cache_data argument hash).

    Returns
    -------
    tuple
        (class_map, bounds) — a uint8 array of class ids warped to EPSG:4326
        and its bounds as [[south, west], [north, east]] for folium.
    """
    with rasterio.open(tif_path) as src:
        cube = src.read()
        in_transform = src.transform
        in_crs = src.crs
        height, width = src.height, src.width

    # (bands, y, x) -> one row of features per pixel.
    n_bands = cube.shape[0]
    pixels = cube.transpose(1, 2, 0).reshape(-1, n_bands)

    # Predict in fixed-size chunks to bound memory; NaNs are zero-filled.
    chunk = 50000
    predictions = [
        _model.predict(np.nan_to_num(pixels[start:start + chunk]))
        for start in range(0, pixels.shape[0], chunk)
    ]

    label_map = np.concatenate(predictions).reshape(height, width)

    # String class labels -> stable integer ids (uint8 raster).
    id_map = np.zeros((height, width), dtype=np.uint8)
    for name, cid in class_to_id.items():
        id_map[label_map == name] = cid

    in_crs_str = in_crs.to_string()
    wgs84 = CRS.from_string('EPSG:4326')

    left, bottom, right, top = rasterio.transform.array_bounds(height, width, in_transform)
    out_transform, out_width, out_height = calculate_default_transform(
        in_crs_str, wgs84, width, height, left=left, bottom=bottom, right=right, top=top
    )

    # Nearest-neighbour keeps categorical ids intact during the warp.
    warped = np.zeros((out_height, out_width), dtype=np.uint8)
    reproject(
        source=id_map,
        destination=warped,
        src_transform=in_transform,
        src_crs=in_crs_str,
        dst_transform=out_transform,
        dst_crs=wgs84,
        resampling=Resampling.nearest
    )

    # array_bounds gives (left, bottom, right, top); folium wants
    # [[south, west], [north, east]].
    out_bounds = rasterio.transform.array_bounds(out_height, out_width, out_transform)
    folium_bounds = [[out_bounds[1], out_bounds[0]], [out_bounds[3], out_bounds[2]]]

    return warped, folium_bounds
130
+
131
def run_openeo_job(lat, lon, size_km=1.0):
    """
    Runs WorldCereal job for a small box around lat/lon.
    Returns path to downloaded tif or None.
    """
    try:
        # One degree of latitude is roughly 111 km; half-width in degrees.
        half = (size_km / 111) / 2
        west, east = lon - half, lon + half
        south, north = lat - half, lat + half

        bbox = BoundingBoxExtent(
            west=west, south=south, east=east, north=north, epsg=4326
        )

        # changing time range
        period = TemporalContext("2025-01-01", "2025-12-31")

        st.info("Building OpenEO Process Graph...")
        inference_result = create_embeddings_process_graph(
            spatial_extent=bbox,
            temporal_extent=period,
            embeddings_parameters=EmbeddingsParameters(),
            scale_uint16=True
        )

        job_title = f"thesis_demo_{lat}_{lon}"
        st.info(f"Submitting Job: {job_title}...")
        job = inference_result.create_job(
            title=job_title,
            job_options=INFERENCE_JOB_OPTIONS,
        )

        job.start()
        job_id = job.job_id
        st.success(f"Job started. ID: {job_id}")

        # Poll the backend until the job finishes or fails.
        status_box = st.empty()
        while True:
            status = job.describe_job().get("status")
            status_box.markdown(f"**Status:** `{status}` (refreshing every 5s...)")

            if status == "finished":
                break
            if status in ["error", "canceled"]:
                st.error(f"Job failed with status: {status}")
                return None

            time.sleep(5)

        st.info("Downloading results...")
        results = job.get_results()
        output_path = temp_dir / f"embedding_{lat}_{lon}.tif"

        # Download the first TIFF asset from the job results, if any.
        tiff_asset = next(
            (
                asset
                for asset in results.get_assets()
                if asset.metadata.get("type", "").startswith("image/tiff")
            ),
            None,
        )

        if tiff_asset is None:
            st.error("No TIFF found in results.")
            return None

        tiff_asset.download(str(output_path))
        return output_path

    except Exception as e:
        st.error(f"OpenEO Error: {str(e)}")
        return None
202
+
203
+
204
st.title("Crop Map")

# --- Sidebar: pick one of the bundled demo embeddings to analyse. ---
with st.sidebar:
    st.header("Control Panel")
    tif_files = list(demo_dir.glob("*.tif"))
    if not tif_files:
        st.error(f"No .tif files in {demo_dir}")
        st.stop()

    selected_tif = st.selectbox("Select Region", tif_files, format_func=lambda x: x.name)

    # Reference field boundaries are linked by naming convention:
    # "<name>_embedding.tif" -> "<name>.geojson" in the same directory.
    possible_name = selected_tif.stem.replace("_embedding", "") + ".geojson"
    geojson_path = selected_tif.parent / possible_name
    has_geojson = geojson_path.exists()

    if has_geojson:
        st.success(f"Linked: {geojson_path.name}")

    run_btn = st.button("Run Analysis", type="primary")
223
+
224
# --- "Run Analysis" handler: classify the selected demo raster and stash
# everything tab 1 needs for rendering in session state. ---
if run_btn:
    model = load_model()
    if not model:
        st.error("Model not found")
        st.stop()

    with st.spinner("Processing..."):  # type: ignore[arg-type]
        class_map, bounds = run_prediction(selected_tif, model)

        # Colour the class-id raster into an RGBA image for the map overlay.
        h, w = class_map.shape
        rgba_img = np.zeros((h, w, 4), dtype=np.uint8)
        unique_ids = np.unique(class_map)

        for uid in unique_ids:
            if uid not in id_to_class: continue
            crop = id_to_class[uid]
            c = get_class_color_rgba(crop, alpha=255)
            rgba_img[class_map == uid] = c

        # Optional reference parcels, reprojected and simplified so folium
        # renders them quickly.
        gdf = None
        if has_geojson:
            gdf = gpd.read_file(geojson_path)
            if gdf.crs != "EPSG:4326":
                gdf = gdf.to_crs("EPSG:4326")
            gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.0001)

        # Per-class pixel counts and percentage shares, largest share first.
        total = class_map.size
        counts = {id_to_class[uid]: np.sum(class_map == uid) for uid in unique_ids if uid in id_to_class}
        stats_df = pd.DataFrame([
            {"Crop": k, "Pixels": v, "Percentage": v / total * 100} for k, v in counts.items()
        ]).sort_values("Percentage", ascending=False)

        st.session_state['analysis_results'] = {
            "bounds": bounds,
            "rgba_img": rgba_img,
            "gdf": gdf,
            "stats_df": stats_df
        }
262
+
263
tab1, tab2 = st.tabs(["Pre-loaded Regions", "Analyze New Area"])

# --- Tab 1: render the results produced by the sidebar "Run Analysis". ---
with tab1:
    if 'analysis_results' in st.session_state:
        data = st.session_state['analysis_results']
        bounds = data['bounds']
        rgba_img = data['rgba_img']
        gdf = data['gdf']
        stats_df = data['stats_df']

        c1, c2 = st.columns([3, 1])

        with c1:
            # Centre the map on the middle of the prediction bounds
            # ([[south, west], [north, east]]).
            center_lat = (bounds[0][0] + bounds[1][0]) / 2
            center_lon = (bounds[0][1] + bounds[1][1]) / 2

            overlay_opacity = st.slider("Overlay Opacity", 0.0, 1.0, 0.7, 0.1, key="opacity_tab1")

            m = folium.Map(location=[center_lat, center_lon], zoom_start=14, control_scale=True)

            # Two switchable base layers: light map and satellite imagery.
            folium.TileLayer(
                tiles='CartoDB positron',
                name='Light Map',
                overlay=False
            ).add_to(m)

            folium.TileLayer(
                tiles='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}',
                attr='Esri',
                name='Satellite',
                overlay=False
            ).add_to(m)

            # The classified raster as a semi-transparent overlay.
            folium.raster_layers.ImageOverlay(
                image=rgba_img,
                bounds=bounds,
                opacity=overlay_opacity,
                name='Prediction',
                pixelated=True
            ).add_to(m)

            # Reference parcel outlines; 'roslina' is the declared-crop field.
            if gdf is not None:
                folium.GeoJson(
                    gdf,
                    name="Fields",
                    style_function=lambda x: {'color': 'white', 'weight': 1, 'fillOpacity': 0, 'dashArray': '5, 5'},
                    tooltip=folium.GeoJsonTooltip(fields=['roslina'], aliases=['Crop:'])
                ).add_to(m)

            folium.LayerControl().add_to(m)
            plugins.Fullscreen().add_to(m)

            st_folium(m, height=600, use_container_width=True)

        with c2:
            st.subheader("Legend")
            st.markdown(create_legend_html(stats_df), unsafe_allow_html=True)
            st.dataframe(stats_df[["Crop", "Percentage"]], hide_index=True)
321
+
322
# --- Tab 2: generate a fresh embedding for a user-chosen point via OpenEO,
# classify it, and render the result. ---
with tab2:
    c1, c2 = st.columns([1, 2])

    if 'tab2_results' not in st.session_state:
        st.session_state['tab2_results'] = None

    with c1:
        st.markdown("### 1. Select Area")
        lat = st.number_input("Latitude", value=50.93131691432723, format="%.4f")
        lon = st.number_input("Longitude", value=22.781513694631702, format="%.4f")

        if st.button("Generate the embedding and classify"):
            with st.spinner("Talking to Satellites... (This takes ~5 mins)"):  # type: ignore[arg-type]
                tif_path = run_openeo_job(lat, lon)

                if tif_path:
                    st.success("Embedding Generated!")

                    model = load_model()
                    # FIX: load_model() returns None when the model file is
                    # missing; guard here like the sidebar handler does
                    # instead of crashing inside run_prediction().
                    if not model:
                        st.error("Model not found")
                        st.stop()

                    class_map, bounds = run_prediction(tif_path, model)

                    # Colour the class-id raster into an RGBA overlay image.
                    h, w = class_map.shape
                    rgba_img = np.zeros((h, w, 4), dtype=np.uint8)
                    unique_ids = np.unique(class_map)

                    for uid in unique_ids:
                        if uid not in id_to_class: continue
                        crop = id_to_class[uid]
                        c = get_class_color_rgba(crop, alpha=255)
                        rgba_img[class_map == uid] = c

                    # Per-class pixel counts and percentage shares.
                    total = class_map.size
                    counts = {id_to_class[uid]: np.sum(class_map == uid) for uid in unique_ids if uid in id_to_class}
                    stats_df = pd.DataFrame([
                        {"Crop": k, "Pixels": v, "Percentage": v / total * 100} for k, v in counts.items()
                    ]).sort_values("Percentage", ascending=False)

                    st.session_state['tab2_results'] = {
                        "bounds": bounds,
                        "rgba_img": rgba_img,
                        "stats_df": stats_df
                    }

                    st.success("Classification Complete")

    with c2:
        if st.session_state['tab2_results']:
            data = st.session_state['tab2_results']
            bounds = data['bounds']
            rgba_img = data['rgba_img']
            stats_df = data['stats_df']

            st.markdown("### 2. Analysis Results")

            # Centre the map on the middle of the prediction bounds.
            center_lat = (bounds[0][0] + bounds[1][0]) / 2
            center_lon = (bounds[0][1] + bounds[1][1]) / 2

            overlay_opacity = st.slider("Overlay Opacity", 0.0, 1.0, 0.7, 0.1, key="opacity_tab2")

            m = folium.Map(location=[center_lat, center_lon], zoom_start=14, control_scale=True)

            # Two switchable base layers: light map and satellite imagery.
            folium.TileLayer(
                tiles='CartoDB positron',
                name='Light Map',
                overlay=False
            ).add_to(m)

            folium.TileLayer(
                tiles='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}',
                attr='Esri',
                name='Satellite',
                overlay=False
            ).add_to(m)

            # The classified raster as a semi-transparent overlay.
            folium.raster_layers.ImageOverlay(
                image=rgba_img,
                bounds=bounds,
                opacity=overlay_opacity,
                name='Prediction',
                pixelated=True
            ).add_to(m)

            folium.LayerControl().add_to(m)
            plugins.Fullscreen().add_to(m)

            st_folium(m, height=500, use_container_width=True)

            st.divider()
            col_leg, col_df = st.columns(2)
            with col_leg:
                st.subheader("Legend")
                st.markdown(create_legend_html(stats_df), unsafe_allow_html=True)
openeo_gfmap/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenEO General Framework for Mapping.
2
+
3
+ Simplify the development of mapping applications through Remote Sensing data
4
+ by leveraging the power of OpenEO (http://openeo.org/).
5
+
6
+ More information available in the README.md file.
7
+ """
8
+
9
+ from .backend import Backend, BackendContext
10
+ from .fetching import FetchType
11
+ from .metadata import FakeMetadata
12
+ from .spatial import BoundingBoxExtent, SpatialContext
13
+ from .temporal import TemporalContext
14
+
15
+ __all__ = [
16
+ "Backend",
17
+ "BackendContext",
18
+ "SpatialContext",
19
+ "BoundingBoxExtent",
20
+ "TemporalContext",
21
+ "FakeMetadata",
22
+ "FetchType",
23
+ ]
openeo_gfmap/backend.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Backend Context.
2
+
3
+ Defines on which backend the pipeline is being currently used.
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ from dataclasses import dataclass
9
+ from enum import Enum
10
+ from typing import Callable, Dict, Optional
11
+
12
+ import openeo
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
+ class Backend(Enum):
18
+ """Enumerating the backends supported by the Mapping Framework."""
19
+
20
+ TERRASCOPE = "terrascope"
21
+ EODC = "eodc" # Dask implementation. Do not test on this yet.
22
+ CDSE = "cdse" # Terrascope implementation (pyspark) #URL: openeo.dataspace.copernicus.eu (need to register)
23
+ CDSE_STAGING = "cdse-staging"
24
+ LOCAL = "local" # Based on the same components of EODc
25
+ FED = "fed" # Federation backend
26
+
27
+
28
@dataclass
class BackendContext:
    """Backend context and information.

    Containing backend related information useful for the framework to
    adapt the process graph.
    """

    # Which of the supported backends this pipeline currently targets.
    backend: Backend
37
+
38
+
39
def _create_connection(
    url: str, *, env_var_suffix: str, connect_kwargs: Optional[dict] = None
) -> openeo.Connection:
    """
    Generic helper to create an openEO connection
    with support for multiple client credential configurations from environment variables
    """
    connection = openeo.connect(url, **(connect_kwargs or {}))

    # Client-credentials flow is only taken when BOTH the global auth method
    # env var is set AND a backend-specific client id exists.
    if (
        os.environ.get("OPENEO_AUTH_METHOD") == "client_credentials"
        and f"OPENEO_AUTH_CLIENT_ID_{env_var_suffix}" in os.environ
    ):
        # Support for multiple client credentials configs from env vars
        client_id = os.environ[f"OPENEO_AUTH_CLIENT_ID_{env_var_suffix}"]
        client_secret = os.environ[f"OPENEO_AUTH_CLIENT_SECRET_{env_var_suffix}"]
        # provider_id may be absent (None): the client then picks a default.
        provider_id = os.environ.get(f"OPENEO_AUTH_PROVIDER_ID_{env_var_suffix}")
        _log.info(
            f"Doing client credentials from env var with {env_var_suffix=} {provider_id} {client_id=} {len(client_secret)=} "
        )

        connection.authenticate_oidc_client_credentials(
            client_id=client_id, client_secret=client_secret, provider_id=provider_id
        )
    else:
        # Standard authenticate_oidc procedure: refresh token, device code or default env var handling
        # See https://open-eo.github.io/openeo-python-client/auth.html#oidc-authentication-dynamic-method-selection

        # Use a shorter max poll time by default to alleviate the default impression that the test seem to hang
        # because of the OIDC device code poll loop.
        max_poll_time = int(
            os.environ.get("OPENEO_OIDC_DEVICE_CODE_MAX_POLL_TIME") or 30
        )
        connection.authenticate_oidc(max_poll_time=max_poll_time)
    return connection
74
+
75
+
76
def vito_connection() -> openeo.Connection:
    """Performs a connection to the VITO backend using the oidc authentication."""
    return _create_connection(
        url="openeo.vito.be",
        env_var_suffix="VITO",
    )
82
+
83
+
84
def cdse_connection() -> openeo.Connection:
    """Performs a connection to the CDSE backend using oidc authentication."""
    return _create_connection(
        url="openeo.dataspace.copernicus.eu",
        env_var_suffix="CDSE",
    )
90
+
91
+
92
def cdse_staging_connection() -> openeo.Connection:
    """Performs a connection to the CDSE staging backend using oidc authentication."""
    return _create_connection(
        url="openeo-staging.dataspace.copernicus.eu",
        env_var_suffix="CDSE_STAGING",
    )
98
+
99
+
100
def eodc_connection() -> openeo.Connection:
    """Performs a connection to the EODC backend using the oidc authentication."""
    return _create_connection(
        url="https://openeo.eodc.eu/openeo/1.1.0",
        env_var_suffix="EODC",
    )
106
+
107
+
108
def fed_connection() -> openeo.Connection:
    """Performs a connection to the OpenEO federated backend using the oidc
    authentication."""
    return _create_connection(
        url="openeofed.dataspace.copernicus.eu/",
        env_var_suffix="FED",
    )
115
+
116
+
117
# Connection factory per backend. NOTE(review): Backend.EODC and Backend.LOCAL
# have no entry here — presumably EODC is not to be used yet and LOCAL needs
# no connection; confirm before relying on a lookup for those members.
BACKEND_CONNECTIONS: Dict[Backend, Callable] = {
    Backend.TERRASCOPE: vito_connection,
    Backend.CDSE: cdse_connection,
    Backend.CDSE_STAGING: cdse_staging_connection,
    Backend.FED: fed_connection,
}
openeo_gfmap/fetching.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Main file for extractions and pre-processing of data through OpenEO
2
+ """
3
+
4
+ from enum import Enum
5
+ from typing import Callable
6
+
7
+ import openeo
8
+
9
+ from openeo_gfmap import BackendContext
10
+ from openeo_gfmap.spatial import SpatialContext
11
+ from openeo_gfmap.temporal import TemporalContext
12
+
13
+
14
class FetchType(Enum):
    """Enumerates the different types of extraction. There are three types of
    enumerations.

    * TILE: Tile based extractions, getting the data for a dense part. The
    output of such fetching process is a dense DataCube.
    * POINT: Point based extractions. From a datasets of polygons, gets sparse
    extractions and performs spatial aggregation on the selected polygons. The
    output of such fetching process is a VectorCube, that can be used to get
    a pandas.DataFrame
    * POLYGON: Patch based extractions, returning a VectorCube of sparsed
    patches. This can be retrieved as multiple NetCDF files from one job.
    """

    TILE = "tile"
    POINT = "point"
    POLYGON = "polygon"
31
+
32
+
33
class CollectionFetcher:
    """Base class to fetch a particular collection.

    Parameters
    ----------
    backend_context: BackendContext
        Information about the backend in use, useful in certain cases.
    bands: list
        List of band names to load from that collection.
    collection_fetch: Callable
        Function defining how to fetch a collection for a specific backend,
        the function accepts the following parameters: connection,
        spatial extent, temporal extent, bands and additional parameters.
    collection_preprocessing: Callable
        Function defining how to harmonize the data of a collection in a
        backend. For example, this function could rename the bands as they
        can be different for every backend/collection (SENTINEL2_L2A or
        SENTINEL2_L2A_SENTINELHUB). Accepts the following parameters:
        datacube (of pre-fetched collection) and additional parameters.
    collection_params: dict
        Additional parameters encoded within a dictionary that will be
        passed in the fetch and preprocessing function.
    """

    def __init__(
        self,
        backend_context: BackendContext,
        bands: list,
        collection_fetch: Callable,
        collection_preprocessing: Callable,
        **collection_params,
    ):
        # FIX: attribute was misspelled "backend_contect".
        self.backend_context = backend_context
        self.bands = bands
        self.fetcher = collection_fetch
        self.processing = collection_preprocessing
        self.params = collection_params

    def get_cube(
        self,
        connection: openeo.Connection,
        spatial_context: SpatialContext,
        temporal_context: TemporalContext,
    ) -> openeo.DataCube:
        """Retrieve a data cube from the given spatial and temporal context.

        Parameters
        ----------
        connection: openeo.Connection
            A connection to an OpenEO backend. The backend provided must be the
            same as the one this extractor class is configured for.
        spatial_context: SpatialContext
            Either a GeoJSON collection on which spatial filtering will be
            applied or a bounding box with an EPSG code. If a bounding box is
            provided, no filtering is applied and the entirety of the data is
            fetched for that region.
        temporal_context: TemporalContext
            The begin and end date of the extraction.
        """
        # Fetch the raw collection, then harmonize it for this backend; both
        # steps receive the same extra keyword parameters.
        collection_data = self.fetcher(
            connection, spatial_context, temporal_context, self.bands, **self.params
        )

        preprocessed_data = self.processing(collection_data, **self.params)

        return preprocessed_data
openeo_gfmap/metadata.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Metadata utilities related to the usage of a DataCube. Used to interract
2
+ with the OpenEO backends and cover some shortcomings.
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+
7
+
8
@dataclass
class FakeMetadata:
    """Fake metadata object used for datacubes fetched from STAC catalogues.
    This is used as a temporary fix for OpenEO backend shortcomings, but
    will become unused with time.
    """

    # Ordered list of band labels for the band dimension.
    band_names: list

    def rename_labels(self, _, target, source):
        """Rename the labels of the band dimension."""
        # NOTE(review): this maps target -> source, i.e. bands currently
        # carrying a `target` name are rewritten to the paired `source` name.
        replacements = dict(zip(target, source))
        for position, label in enumerate(list(self.band_names)):
            if label in target:
                self.band_names[position] = replacements[label]
        return self
openeo_gfmap/spatial.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Definitions of spatial context, either point-based or spatial"""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Union
5
+
6
+ from geojson import GeoJSON
7
+ from shapely.geometry import Polygon, box
8
+
9
+
10
@dataclass
class BoundingBoxExtent:
    """Definition of a bounding box as accepted by OpenEO

    Contains the minx, miny, maxx, maxy coordinates expressed as west, south
    east, north. The EPSG is also defined.
    """

    west: float
    south: float
    east: float
    north: float
    epsg: int = 4326

    # NOTE(review): defining __dict__ as a method shadows the normal instance
    # __dict__ descriptor (e.g. vars() on an instance no longer returns a
    # mapping). Kept as-is because downstream code may call it explicitly —
    # confirm before changing.
    def __dict__(self):
        return {
            "west": self.west,
            "south": self.south,
            "east": self.east,
            "north": self.north,
            "crs": f"EPSG:{self.epsg}",
            "srs": f"EPSG:{self.epsg}",
        }

    def __iter__(self):
        # Iterating yields (key, value) pairs so dict(instance) works.
        return iter(
            [
                ("west", self.west),
                ("south", self.south),
                ("east", self.east),
                ("north", self.north),
                ("crs", f"EPSG:{self.epsg}"),
                ("srs", f"EPSG:{self.epsg}"),
            ]
        )

    def to_geometry(self) -> Polygon:
        """Return the bounding box as a shapely Polygon."""
        return box(self.west, self.south, self.east, self.north)

    def to_geojson(self) -> GeoJSON:
        """Return the bounding box as a GeoJSON-like mapping."""
        return self.to_geometry().__geo_interface__


# A spatial context is either a GeoJSON object, a bounding box, or a string
# reference (e.g. a URL / path accepted downstream).
SpatialContext = Union[GeoJSON, BoundingBoxExtent, str]
openeo_gfmap/temporal.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Definitions of temporal context"""
2
+
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+
6
+
7
@dataclass
class TemporalContext:
    """Temporal context is defined by a `start_date` and `end_date` values.

    The value must be encoded on a YYYY-mm-dd format, e.g. 2020-01-01
    """

    start_date: str
    end_date: str

    def to_datetime(self):
        """Converts the temporal context to a tuple of datetime objects."""
        date_format = "%Y-%m-%d"
        parse = datetime.strptime
        return (parse(self.start_date, date_format), parse(self.end_date, date_format))
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "crop-map"
3
+ version = "0.1.0"
4
+ description = "Application for Crop Type Mapping"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "folium>=0.20.0",
8
+ "geojson>=3.2.0",
9
+ "geopandas>=1.1.2",
10
+ "joblib>=1.5.3",
11
+ "matplotlib>=3.10.8",
12
+ "openeo>=0.47.0",
13
+ "rasterio>=1.5.0",
14
+ "streamlit>=1.53.1",
15
+ "streamlit-folium>=0.26.1",
16
+ ]
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
worldcereal/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ from ._version import __version__
4
+
5
+ __all__ = ["__version__"]
6
+
7
# Season identifiers accepted across the package; "tc-" prefixed entries are
# the WorldCereal temporal-composite seasons, "custom" is user-defined.
SUPPORTED_SEASONS = [
    "tc-s1",
    "tc-s2",
    "tc-annual",
    "custom",
]

# Short code used for each season identifier.
SEASONAL_MAPPING = {
    "tc-s1": "S1",
    "tc-s2": "S2",
    "tc-annual": "ANNUAL",
    "custom": "custom",
}


# Default buffer (days) prior to
# season start
SEASON_PRIOR_BUFFER = {
    "tc-s1": 0,
    "tc-s2": 0,
    "tc-annual": 0,
    "custom": 0,
}


# Default buffer (days) after
# season end
SEASON_POST_BUFFER = {
    "tc-s1": 0,
    "tc-s2": 0,
    "tc-annual": 0,
    "custom": 0,
}
worldcereal/_version.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ __version__ = "2.4.1"
worldcereal/job.py ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Executing inference jobs on the OpenEO backend.
2
+
3
+ Possible entry points for inference in this module:
4
+ - `generate_map`: This function is used to generate a map for a single patch.
5
+ It creates one OpenEO job and processes the inference for the specified
6
+ spatial and temporal extent.
7
+ - `collect_inputs`: This function is used to collect preprocessed inputs
8
+ without performing inference. It retrieves the required data for further
9
+ processing or analysis.
10
+ - `run_largescale_inference`: This function utilizes a job manager to
11
+ orchestrate and execute multiple inference jobs automatically, enabling
12
+ efficient large-scale processing.
13
+ - `setup_inference_job_manager`: This function prepares the job manager
14
+ and job database for large-scale inference jobs. It sets up the necessary
15
+ infrastructure to manage and track jobs in a notebook environment.
16
+ Used in the WorldCereal demo notebooks.
17
+
18
+ """
19
+
20
import ast
import json
import shutil
from copy import deepcopy
from functools import partial
from pathlib import Path
from typing import Callable, Dict, List, Literal, Optional, Union

import geopandas as gpd
import openeo
import pandas as pd
from loguru import logger
from openeo import BatchJob
from openeo.extra.job_management import CsvJobDatabase, MultiBackendJobManager
from openeo_gfmap import Backend, BackendContext, BoundingBoxExtent, TemporalContext
from openeo_gfmap.backend import BACKEND_CONNECTIONS
from pydantic import BaseModel
from typing_extensions import TypedDict

from worldcereal.openeo.mapping import _cropland_map, _croptype_map, _embeddings_map
from worldcereal.openeo.preprocessing import worldcereal_preprocessed_inputs
from worldcereal.parameters import (
    CropLandParameters,
    CropTypeParameters,
    EmbeddingsParameters,
    WorldCerealProductType,
)
from worldcereal.utils.models import load_model_lut
47
+
48
# Publicly hosted dependency archives unpacked by the OpenEO UDF runtime
# (referenced below via the "archive_url#mount_name" convention).
ONNX_DEPS_URL = "https://s3.waw3-1.cloudferro.com/swift/v1/project_dependencies/onnx_deps_python311.zip"
FEATURE_DEPS_URL = "https://s3.waw3-1.cloudferro.com/swift/v1/project_dependencies/torch_deps_python311.zip"

# Default batch-job options for inference jobs. Callers may override any of
# these via the `job_options` parameters of the functions in this module
# (a deepcopy is taken before merging, so this dict is never mutated).
INFERENCE_JOB_OPTIONS = {
    "driver-memory": "4g",
    "executor-memory": "2g",
    "executor-memoryOverhead": "3g",
    "max-executors": 20,
    "python-memory": "disable",
    "soft-errors": 0.1,  # tolerate up to 10% of failing tasks before aborting
    "image-name": "python311",
    "udf-dependency-archives": [
        f"{ONNX_DEPS_URL}#onnx_deps",
        f"{FEATURE_DEPS_URL}#feature_deps",
    ],
}
63
+
64
+
65
class WorldCerealProduct(TypedDict):
    """Typed dictionary describing a single WorldCereal inference product.

    Instances are built in `generate_map` from the assets of a finished
    OpenEO job.

    Attributes
    ----------
    url: str
        URL to the product asset on the OpenEO backend.
    type: WorldCerealProductType
        Type of the product. Either cropland or croptype.
    temporal_extent: TemporalContext
        Period of time for which the product has been generated.
    path: Optional[Path]
        Path to the downloaded product, or None if it was not downloaded.
    lut: Optional[Dict]
        Look-up table mapping class values to labels for the product.
    """

    url: str
    type: WorldCerealProductType
    temporal_extent: TemporalContext
    path: Optional[Path]
    lut: Optional[Dict]
88
+
89
+
90
class InferenceResults(BaseModel):
    """Pydantic model holding the results of a finished WorldCereal job.

    Returned by `generate_map`.

    Attributes
    ----------
    job_id : str
        Job ID of the finished OpenEO job.
    products: Dict[str, WorldCerealProduct]
        Dictionary with the different products, keyed by asset name.
    metadata: Optional[Path]
        Path to the metadata file, if it was downloaded locally; None otherwise.
    """

    job_id: str
    products: Dict[str, WorldCerealProduct]
    metadata: Optional[Path]
106
+
107
+
108
class InferenceJobManager(MultiBackendJobManager):
    """A job manager for executing large-scale WorldCereal inference jobs on the OpenEO backend.

    Based on the official MultiBackendJobManager, extending how results are
    downloaded and named: each job's assets land in a per-tile subfolder and
    get the tile name appended to their filename.
    """

    @classmethod
    def generate_output_path_inference(
        cls,
        root_folder: Path,
        geometry_index: int,
        row: pd.Series,
        asset_id: Optional[str] = None,
    ) -> Path:
        """Return (and create) the per-tile output directory for a job.

        Parameters
        ----------
        root_folder : Path
            root folder under which a subfolder named after the tile is created
        geometry_index : int
            unused here; kept in the signature for compatibility with the
            job manager's output-path callback contract
        row : pd.Series
            the current job row; only its `tile_name` attribute is used
        asset_id : str, optional
            unused; kept for compatibility with the job manager

        Returns
        -------
        Path
            the created subfolder `root_folder / tile_name`
        """

        tile_name = row.tile_name

        # Create the subfolder to store the output
        subfolder = root_folder / str(tile_name)
        subfolder.mkdir(parents=True, exist_ok=True)

        return subfolder

    def on_job_done(self, job: BatchJob, row):
        # Callback invoked by MultiBackendJobManager once a job finishes:
        # downloads all assets into the tile subfolder and stores job/result
        # metadata as JSON next to them.
        logger.info(f"Job {job.job_id} completed")
        output_dir = self.generate_output_path_inference(self._root_dir, 0, row)

        # Get job results
        job_result = job.get_results()

        # Get the products
        assets = job_result.get_assets()
        for asset in assets:
            asset_name = asset.name.split(".")[0].split("_")[0]
            asset_type = asset_name.split("-")[0]
            # NOTE(review): asset_type is not used after this lookup; the
            # getattr acts as validation only — an unknown product prefix
            # raises AttributeError before the asset is downloaded.
            asset_type = getattr(WorldCerealProductType, asset_type.upper())
            filepath = asset.download(target=output_dir)

            # We want to add the tile name to the filename
            new_filepath = filepath.parent / f"{filepath.stem}_{row.tile_name}.tif"
            shutil.move(filepath, new_filepath)

        job_metadata = job.describe()
        result_metadata = job_result.get_metadata()
        job_metadata_path = output_dir / f"job_{job.job_id}.json"
        result_metadata_path = output_dir / f"result_{job.job_id}.json"

        with job_metadata_path.open("w", encoding="utf-8") as f:
            json.dump(job_metadata, f, ensure_ascii=False)
        with result_metadata_path.open("w", encoding="utf-8") as f:
            json.dump(result_metadata, f, ensure_ascii=False)

        # post_job_action(output_file)
        logger.success("Job completed")
182
+
183
+
184
def create_inference_process_graph(
    spatial_extent: BoundingBoxExtent,
    temporal_extent: TemporalContext,
    product_type: WorldCerealProductType = WorldCerealProductType.CROPLAND,
    cropland_parameters: CropLandParameters = CropLandParameters(),
    croptype_parameters: CropTypeParameters = CropTypeParameters(),
    s1_orbit_state: Optional[Literal["ASCENDING", "DESCENDING"]] = None,
    out_format: str = "GTiff",
    backend_context: BackendContext = BackendContext(Backend.CDSE),
    tile_size: Optional[int] = 128,
    target_epsg: Optional[int] = None,
    connection: Optional[openeo.Connection] = None,
) -> List[openeo.DataCube]:
    """Wrapper function that creates the inference openEO process graph.

    Parameters
    ----------
    spatial_extent : BoundingBoxExtent
        spatial extent of the map
    temporal_extent : TemporalContext
        temporal range to consider
    product_type : WorldCerealProductType, optional
        product describer, by default WorldCerealProductType.CROPLAND
    cropland_parameters: CropLandParameters
        Parameters for the cropland product inference pipeline.
    croptype_parameters: Optional[CropTypeParameters]
        Parameters for the croptype product inference pipeline. Only required
        whenever `product_type` is set to `WorldCerealProductType.CROPTYPE`,
        will be ignored otherwise.
    s1_orbit_state: Optional[Literal["ASCENDING", "DESCENDING"]]
        Sentinel-1 orbit state to use for the inference. If not provided,
        the orbit state will be dynamically determined based on the spatial extent.
    out_format : str, optional
        Output format, by default "GTiff"
    backend_context : BackendContext
        backend to run the job on, by default CDSE.
    tile_size: int, optional
        Tile size to use for the data loading in OpenEO, by default 128.
    target_epsg: Optional[int] = None
        EPSG code to use for the output products. If not provided, the
        default EPSG will be used.
    connection: Optional[openeo.Connection] = None,
        Optional OpenEO connection to use. If not provided, a new connection
        will be created based on the backend_context.

    Returns
    -------
    List[openeo.DataCube]
        A list with one or more result objects or a list of DataCube objects, representing the inference
        process graph. This object can be used to execute the job on the OpenEO backend.
        The result will be a DataCube with the classification results.

    Raises
    ------
    ValueError
        if the product is not supported
    ValueError
        if the out_format is not supported
    """
    if product_type not in WorldCerealProductType:
        raise ValueError(f"Product {product_type.value} not supported.")

    # BUGFIX: previously interpolated the builtin `format` instead of the
    # `out_format` argument, producing a useless error message.
    if out_format not in ["GTiff", "NetCDF"]:
        raise ValueError(f"Format {out_format} not supported.")

    # Make a connection to the OpenEO backend
    if connection is None:
        connection = BACKEND_CONNECTIONS[backend_context.backend]()

    # Preparing the input cube for inference
    inputs = worldcereal_preprocessed_inputs(
        connection=connection,
        backend_context=backend_context,
        spatial_extent=spatial_extent,
        temporal_extent=temporal_extent,
        tile_size=tile_size,
        s1_orbit_state=s1_orbit_state,
        target_epsg=target_epsg,
        # disable_meteo=True,
    )

    # Spatial filtering
    inputs = inputs.filter_bbox(dict(spatial_extent))

    # Construct the feature extraction and model inference pipeline
    if product_type == WorldCerealProductType.CROPLAND:
        results = _cropland_map(
            inputs,
            temporal_extent,
            cropland_parameters=cropland_parameters,
        )

    elif product_type == WorldCerealProductType.CROPTYPE:
        if not isinstance(croptype_parameters, CropTypeParameters):
            raise ValueError(
                f"Please provide a valid `croptype_parameters` parameter."
                f" Received: {croptype_parameters}"
            )

        # Generate crop type map with optional cropland masking
        results = _croptype_map(
            inputs,
            temporal_extent,
            cropland_parameters=cropland_parameters,
            croptype_parameters=croptype_parameters,
        )

    else:
        # BUGFIX: any enum member without a pipeline branch here (e.g. a
        # newly added product type) previously fell through and raised an
        # UnboundLocalError on `results`. Fail explicitly instead.
        raise ValueError(
            f"Product {product_type.value} has no inference pipeline in this function."
        )

    return results
292
+
293
+
294
def create_embeddings_process_graph(
    spatial_extent: BoundingBoxExtent,
    temporal_extent: TemporalContext,
    embeddings_parameters: EmbeddingsParameters = EmbeddingsParameters(),
    s1_orbit_state: Optional[Literal["ASCENDING", "DESCENDING"]] = None,
    out_format: str = "GTiff",
    backend_context: BackendContext = BackendContext(Backend.CDSE),
    tile_size: Optional[int] = 128,
    target_epsg: Optional[int] = None,
    scale_uint16: bool = True,
) -> openeo.DataCube:
    """Create an OpenEO process graph for generating embeddings.

    Parameters
    ----------
    spatial_extent : BoundingBoxExtent
        Spatial extent of the map.
    temporal_extent : TemporalContext
        Temporal range to consider.
    embeddings_parameters : EmbeddingsParameters, optional
        Parameters for the embeddings product inference pipeline, by default EmbeddingsParameters().
    s1_orbit_state : Optional[Literal["ASCENDING", "DESCENDING"]], optional
        Sentinel-1 orbit state to use for the inference. If not provided, the
        orbit state will be dynamically determined based on the spatial extent,
        by default None.
    out_format : str, optional
        Output format, by default "GTiff".
    backend_context : BackendContext, optional
        Backend to run the job on, by default BackendContext(Backend.CDSE).
    tile_size : Optional[int], optional
        Tile size to use for the data loading in OpenEO, by default 128.
    target_epsg : Optional[int], optional
        EPSG code to use for the output products. If not provided, the default
        EPSG will be used.
    scale_uint16 : bool, optional
        Whether to scale the embeddings to uint16 for memory optimization,
        by default True.

    Returns
    -------
    openeo.DataCube
        DataCube object representing the embeddings process graph. This object
        can be used to execute the job on the OpenEO backend. The result will
        be a DataCube with the embeddings.

    Raises
    ------
    ValueError
        If the output format is not supported.
    """

    # BUGFIX: previously interpolated the builtin `format` instead of the
    # `out_format` argument, producing a useless error message.
    if out_format not in ["GTiff", "NetCDF"]:
        raise ValueError(f"Format {out_format} not supported.")

    # Make a connection to the OpenEO backend
    connection = BACKEND_CONNECTIONS[backend_context.backend]()

    # Preparing the input cube for inference
    inputs = worldcereal_preprocessed_inputs(
        connection=connection,
        backend_context=backend_context,
        spatial_extent=spatial_extent,
        temporal_extent=temporal_extent,
        tile_size=tile_size,
        s1_orbit_state=s1_orbit_state,
        target_epsg=target_epsg,
        # disable_meteo=True,
    )

    # Spatial filtering
    inputs = inputs.filter_bbox(dict(spatial_extent))

    embeddings = _embeddings_map(
        inputs,
        temporal_extent,
        embeddings_parameters=embeddings_parameters,
        scale_uint16=scale_uint16,
    )

    # Save the final result
    embeddings = embeddings.save_result(
        format=out_format,
        options=dict(
            filename_prefix=f"WorldCereal_Embeddings_{temporal_extent.start_date}_{temporal_extent.end_date}",
        ),
    )

    return embeddings
376
+
377
+
378
def create_inputs_process_graph(
    spatial_extent: BoundingBoxExtent,
    temporal_extent: TemporalContext,
    s1_orbit_state: Optional[Literal["ASCENDING", "DESCENDING"]] = None,
    out_format: str = "NetCDF",
    backend_context: BackendContext = BackendContext(Backend.CDSE),
    tile_size: Optional[int] = 128,
    target_epsg: Optional[int] = None,
    compositing_window: Literal["month", "dekad"] = "month",
) -> openeo.DataCube:
    """Wrapper function that creates the inputs openEO process graph.

    Parameters
    ----------
    spatial_extent : BoundingBoxExtent
        spatial extent of the map
    temporal_extent : TemporalContext
        temporal range to consider
    s1_orbit_state: Optional[Literal["ASCENDING", "DESCENDING"]]
        Sentinel-1 orbit state to use for the inference. If not provided,
        the orbit state will be dynamically determined based on the spatial extent.
    out_format : str, optional
        Output format, by default "NetCDF"
    backend_context : BackendContext
        backend to run the job on, by default CDSE.
    tile_size: int, optional
        Tile size to use for the data loading in OpenEO, by default 128.
    target_epsg: Optional[int] = None
        EPSG code to use for the output products. If not provided, the
        default EPSG will be used.
    compositing_window: Literal["month", "dekad"]
        Compositing window to use for the data loading in OpenEO, by default
        "month".

    Returns
    -------
    openeo.DataCube
        DataCube object representing the inputs process graph.
        This object can be used to execute the job on the OpenEO backend.
        The result will be a DataCube with the preprocessed inputs.

    Raises
    ------
    ValueError
        if the out_format is not supported
    """

    # BUGFIX: previously interpolated the builtin `format` instead of the
    # `out_format` argument, producing a useless error message.
    if out_format not in ["GTiff", "NetCDF"]:
        raise ValueError(f"Format {out_format} not supported.")

    # Make a connection to the OpenEO backend
    connection = BACKEND_CONNECTIONS[backend_context.backend]()

    # Preparing the input cube for inference
    inputs = worldcereal_preprocessed_inputs(
        connection=connection,
        backend_context=backend_context,
        spatial_extent=spatial_extent,
        temporal_extent=temporal_extent,
        tile_size=tile_size,
        s1_orbit_state=s1_orbit_state,
        target_epsg=target_epsg,
        compositing_window=compositing_window,
        # disable_meteo=True,
    )

    # Spatial filtering
    inputs = inputs.filter_bbox(dict(spatial_extent))

    # Save the final result
    inputs = inputs.save_result(
        format=out_format,
        options=dict(
            filename_prefix=f"preprocessed-inputs_{temporal_extent.start_date}_{temporal_extent.end_date}",
        ),
    )

    return inputs
456
+
457
+
458
def create_inference_job(
    row: pd.Series,
    connection: openeo.Connection,
    provider: str,
    connection_provider: str,
    product_type: WorldCerealProductType = WorldCerealProductType.CROPTYPE,
    cropland_parameters: CropLandParameters = CropLandParameters(),
    croptype_parameters: CropTypeParameters = CropTypeParameters(),
    s1_orbit_state: Optional[Literal["ASCENDING", "DESCENDING"]] = None,
    target_epsg: Optional[int] = None,
    job_options: Optional[dict] = None,
) -> BatchJob:
    """Create an OpenEO batch job for WorldCereal inference.

    Parameters
    ----------
    row : pd.Series
        Job row from the job database. Contains at least the following fields:
        - start_date: str, start date of the temporal extent
        - end_date: str, end date of the temporal extent
        - geometry: shapely.geometry, geometry of the spatial extent
        - tile_name: str, name of the tile
        - epsg: int, EPSG code of the spatial extent
        - bounds_epsg: str representation of tuple,
          bounds of the spatial extent in CRS as
          specified by epsg attribute
    connection : openeo.Connection
        openEO connection to the backend
    provider : str
        unused but required for compatibility with MultiBackendJobManager
    connection_provider : str
        unused but required for compatibility with MultiBackendJobManager
    product_type : WorldCerealProductType, optional
        Type of the WorldCereal product to generate, by default WorldCerealProductType.CROPTYPE
    croptype_parameters : Optional[CropTypeParameters], optional
        Parameters for the croptype product inference pipeline. Only required
        whenever `product_type` is set to `WorldCerealProductType.CROPTYPE`,
        will be ignored otherwise, by default None
    cropland_parameters : Optional[CropLandParameters], optional
        Parameters for the cropland product inference pipeline, by default None
    s1_orbit_state : Optional[Literal["ASCENDING", "DESCENDING"]], optional
        Sentinel-1 orbit state to use for the inference. If not provided, the
        best orbit will be dynamically derived from the catalogue.
    target_epsg : Optional[int], optional
        EPSG code to reproject the data to. If not provided, the data will be
        left in the original epsg as mentioned in the row.
    job_options : Optional[dict], optional
        Additional job options to pass to the OpenEO backend, by default None

    Returns
    -------
    BatchJob
        Batch job created on openEO backend.
    """

    # Get temporal and spatial extents from the row
    temporal_extent = TemporalContext(start_date=row.start_date, end_date=row.end_date)
    epsg = int(row.epsg)
    # SECURITY FIX: `bounds_epsg` comes from an external grid file; use
    # ast.literal_eval instead of eval so only a plain literal tuple/list is
    # accepted and no arbitrary code can be executed.
    bounds = ast.literal_eval(row.bounds_epsg)
    spatial_extent = BoundingBoxExtent(
        west=bounds[0], south=bounds[1], east=bounds[2], north=bounds[3], epsg=epsg
    )

    if target_epsg is None:
        # If no target EPSG is provided, use the EPSG from the row
        target_epsg = epsg

    # Update default job options with the provided ones
    inference_job_options = deepcopy(INFERENCE_JOB_OPTIONS)
    if job_options is not None:
        inference_job_options.update(job_options)

    inference_result = create_inference_process_graph(
        spatial_extent=spatial_extent,
        temporal_extent=temporal_extent,
        product_type=product_type,
        croptype_parameters=croptype_parameters,
        cropland_parameters=cropland_parameters,
        s1_orbit_state=s1_orbit_state,
        target_epsg=target_epsg,
        connection=connection,
    )

    # Submit the job
    return connection.create_job(
        inference_result,
        title=f"WorldCereal [{product_type.value}] job_{row.tile_name}",
        description="Job that performs end-to-end WorldCereal inference",
        additional=inference_job_options,  # TODO: once openeo-python-client supports job_options, use that
    )
549
+
550
+
551
def generate_map(
    spatial_extent: BoundingBoxExtent,
    temporal_extent: TemporalContext,
    output_dir: Optional[Union[Path, str]] = None,
    product_type: WorldCerealProductType = WorldCerealProductType.CROPLAND,
    cropland_parameters: CropLandParameters = CropLandParameters(),
    croptype_parameters: CropTypeParameters = CropTypeParameters(),
    out_format: str = "GTiff",
    backend_context: BackendContext = BackendContext(Backend.CDSE),
    tile_size: Optional[int] = 128,
    job_options: Optional[dict] = None,
    target_epsg: Optional[int] = None,
) -> InferenceResults:
    """Main function to generate a WorldCereal product.

    Creates a single OpenEO batch job for the given spatial/temporal extent,
    blocks until it finishes, and (optionally) downloads the resulting
    products and metadata.

    Parameters
    ----------
    spatial_extent : BoundingBoxExtent
        spatial extent of the map
    temporal_extent : TemporalContext
        temporal range to consider
    output_dir : Optional[Union[Path, str]]
        path to directory where products should be downloaded to; if None,
        nothing is downloaded and only asset URLs are reported
    product_type : WorldCerealProductType, optional
        product describer, by default WorldCerealProductType.CROPLAND
    cropland_parameters: CropLandParameters
        Parameters for the cropland product inference pipeline.
    croptype_parameters: Optional[CropTypeParameters]
        Parameters for the croptype product inference pipeline. Only required
        whenever `product_type` is set to `WorldCerealProductType.CROPTYPE`,
        will be ignored otherwise.
    out_format : str, optional
        Output format, by default "GTiff"
    backend_context : BackendContext
        backend to run the job on, by default CDSE.
    tile_size: int, optional
        Tile size to use for the data loading in OpenEO, by default 128.
    job_options: dict, optional
        Additional job options to pass to the OpenEO backend, by default None
    target_epsg: Optional[int] = None
        EPSG code to use for the output products. If not provided, the
        default EPSG will be used.

    Returns
    -------
    InferenceResults
        Results of the finished WorldCereal job.

    Raises
    ------
    ValueError
        if the product is not supported
    ValueError
        if the out_format is not supported
    """

    # Get a connection to the OpenEO backend
    connection = BACKEND_CONNECTIONS[backend_context.backend]()

    # Create the process graph
    results = create_inference_process_graph(
        spatial_extent=spatial_extent,
        temporal_extent=temporal_extent,
        product_type=product_type,
        cropland_parameters=cropland_parameters,
        croptype_parameters=croptype_parameters,
        out_format=out_format,
        backend_context=backend_context,
        tile_size=tile_size,
        target_epsg=target_epsg,
        connection=connection,
    )

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Merge user-provided job options onto a copy of the defaults
    # (deepcopy so the module-level dict is never mutated)
    inference_job_options = deepcopy(INFERENCE_JOB_OPTIONS)
    if job_options is not None:
        inference_job_options.update(job_options)

    # Execute the job; start_and_wait() blocks until the job finishes
    # and returns the BatchJob itself
    job = connection.create_job(
        results,
        additional=inference_job_options,  # TODO: once openeo-python-client supports job_options, use that
        title=f"WorldCereal [{product_type.value}] job",
        description="Job that performs end-to-end WorldCereal inference",
    ).start_and_wait()

    # Get look-up tables; cropland LUT is always loaded since the croptype
    # product includes an optional cropland mask asset
    luts = {}
    luts[WorldCerealProductType.CROPLAND.value] = load_model_lut(
        cropland_parameters.classifier_parameters.classifier_url
    )
    if product_type == WorldCerealProductType.CROPTYPE:
        luts[WorldCerealProductType.CROPTYPE.value] = load_model_lut(
            croptype_parameters.classifier_parameters.classifier_url
        )

    # Get job results
    job_result = job.get_results()

    # Build one WorldCerealProduct entry per job asset.
    # NOTE(review): the product type is derived from the asset filename
    # prefix (text before the first "-" of the first "_"-separated token);
    # assumes backend asset naming like "<type>-....tif" — verify against
    # the mapping pipeline's filename_prefix conventions.
    assets = job_result.get_assets()
    products = {}
    for asset in assets:
        asset_name = asset.name.split(".")[0].split("_")[0]
        asset_type = asset_name.split("-")[0]
        asset_type = getattr(WorldCerealProductType, asset_type.upper())
        if output_dir is not None:
            filepath = asset.download(target=output_dir)
        else:
            filepath = None
        products[asset_name] = {
            "url": asset.href,
            "type": asset_type,
            "temporal_extent": temporal_extent,
            "path": filepath,
            "lut": luts[asset_type.value],
        }

    # Download job metadata if output path is provided
    if output_dir is not None:
        metadata_file = output_dir / "job-results.json"
        metadata_file.write_text(json.dumps(job_result.get_metadata()))
    else:
        metadata_file = None

    # Compile InferenceResults and return
    return InferenceResults(
        job_id=job.job_id, products=products, metadata=metadata_file
    )
684
+
685
+
686
def collect_inputs(
    spatial_extent: BoundingBoxExtent,
    temporal_extent: TemporalContext,
    output_path: Union[Path, str],
    backend_context: BackendContext = BackendContext(Backend.CDSE),
    tile_size: Optional[int] = 128,
    job_options: Optional[dict] = None,
    compositing_window: Literal["month", "dekad"] = "month",
):
    """Function to retrieve preprocessed inputs that are being
    used in the generation of WorldCereal products.

    Runs a blocking batch job on the OpenEO backend and downloads the
    resulting NetCDF file to `output_path`.

    Parameters
    ----------
    spatial_extent : BoundingBoxExtent
        spatial extent of the map
    temporal_extent : TemporalContext
        temporal range to consider
    output_path : Union[Path, str]
        output path to download the product to
    backend_context : BackendContext
        backend to run the job on, by default CDSE
    tile_size: int, optional
        Tile size to use for the data loading in OpenEO, by default 128
        so it uses the OpenEO default setting.
    job_options: dict, optional
        Additional job options to pass to the OpenEO backend, by default None
    compositing_window: Literal["month", "dekad"]
        Compositing window to use for the data loading in OpenEO, by default
        "month".
    """

    # Make a connection to the OpenEO backend
    connection = BACKEND_CONNECTIONS[backend_context.backend]()

    # Preparing the input cube for the inference
    inputs = worldcereal_preprocessed_inputs(
        connection=connection,
        backend_context=backend_context,
        spatial_extent=spatial_extent,
        temporal_extent=temporal_extent,
        tile_size=tile_size,
        validate_temporal_context=False,
        compositing_window=compositing_window,
    )

    # Spatial filtering
    inputs = inputs.filter_bbox(dict(spatial_extent))

    # IDIOM FIX: this is a mutable per-call local, not a module constant,
    # so it should not carry an UPPER_SNAKE_CASE name.
    batch_job_options = {
        "driver-memory": "4g",
        "executor-memory": "1g",
        "executor-memoryOverhead": "1g",
        "python-memory": "3g",
        "soft-errors": 0.1,
    }
    if job_options is not None:
        batch_job_options.update(job_options)

    # Blocking call: submits the job, waits for it to finish and downloads
    # the result to `output_path`.
    inputs.execute_batch(
        outputfile=output_path,
        out_format="NetCDF",
        title="WorldCereal [collect_inputs] job",
        description="Job that collects inputs for WorldCereal inference",
        job_options=batch_job_options,
    )
752
+
753
+
754
def run_largescale_inference(
    production_grid: Union[Path, gpd.GeoDataFrame],
    output_dir: Union[Path, str],
    product_type: WorldCerealProductType = WorldCerealProductType.CROPLAND,
    cropland_parameters: CropLandParameters = CropLandParameters(),
    croptype_parameters: CropTypeParameters = CropTypeParameters(),
    backend_context: BackendContext = BackendContext(Backend.CDSE),
    target_epsg: Optional[int] = None,
    s1_orbit_state: Optional[Literal["ASCENDING", "DESCENDING"]] = None,
    job_options: Optional[dict] = None,
    parallel_jobs: int = 2,
):
    """Run large-scale WorldCereal inference jobs on the OpenEO backend.

    Thin convenience wrapper: delegates all setup to
    `setup_inference_job_manager` and then lets the resulting job manager
    create, track and execute one batch job per tile in the production grid.

    Parameters
    ----------
    production_grid : Union[Path, gpd.GeoDataFrame]
        Path to the production grid file in Parquet format or a GeoDataFrame.
        The grid must contain the required attributes: 'start_date', 'end_date',
        'geometry', 'tile_name', 'epsg' and 'bounds_epsg'.
    output_dir : Union[Path, str]
        Directory where output files and job tracking information will be stored.
    product_type : WorldCerealProductType
        Type of product to generate. Defaults to WorldCerealProductType.CROPLAND.
    cropland_parameters : CropLandParameters
        Parameters for cropland inference.
    croptype_parameters : CropTypeParameters
        Parameters for crop type inference.
    backend_context : BackendContext
        Context for the backend to use. Defaults to BackendContext(Backend.CDSE).
    target_epsg : Optional[int]
        EPSG code for the target coordinate reference system.
        If None, no reprojection will be performed.
    s1_orbit_state : Optional[Literal["ASCENDING", "DESCENDING"]]
        Sentinel-1 orbit state to use ('ASCENDING' or 'DESCENDING').
        If None, no specific orbit state is enforced.
    job_options : Optional[dict]
        Additional options for configuring the inference jobs. Defaults to None.
    parallel_jobs : int
        Number of parallel jobs to manage on the backend. Defaults to 2. Note
        that load balancing does not guarantee that all jobs will run in
        parallel.

    Returns
    -------
    None
    """

    # Prepare the manager, the CSV-backed job database and the per-row
    # job-creation callable.
    manager, database, job_starter = setup_inference_job_manager(
        production_grid=production_grid,
        output_dir=output_dir,
        product_type=product_type,
        cropland_parameters=cropland_parameters,
        croptype_parameters=croptype_parameters,
        backend_context=backend_context,
        target_epsg=target_epsg,
        s1_orbit_state=s1_orbit_state,
        job_options=job_options,
        parallel_jobs=parallel_jobs,
    )

    # Hand the prepared jobs over to the manager, which launches and tracks
    # them until every job reaches a terminal state.
    manager.run_jobs(
        df=database.df,
        start_job=job_starter,
        job_db=database.path,
    )

    logger.info("Job manager finished.")
829
+
830
+
831
def setup_inference_job_manager(
    production_grid: Union[Path, gpd.GeoDataFrame],
    output_dir: Union[Path, str],
    product_type: WorldCerealProductType = WorldCerealProductType.CROPLAND,
    cropland_parameters: CropLandParameters = CropLandParameters(),
    croptype_parameters: CropTypeParameters = CropTypeParameters(),
    backend_context: BackendContext = BackendContext(Backend.CDSE),
    target_epsg: Optional[int] = None,
    s1_orbit_state: Optional[Literal["ASCENDING", "DESCENDING"]] = None,
    job_options: Optional[dict] = None,
    parallel_jobs: int = 2,
) -> tuple[InferenceJobManager, CsvJobDatabase, Callable]:
    """
    Prepare large-scale inference jobs on the OpenEO backend.

    This function sets up the job manager, creates job tracking information,
    and defines the job creation function for WorldCereal inference jobs.
    Used in the WorldCereal demo notebooks.

    Parameters
    ----------
    production_grid : Union[Path, gpd.GeoDataFrame]
        Path to the production grid file in Parquet format (a `str` path is
        also accepted) or a GeoDataFrame.
        The grid must contain the required attributes: 'start_date', 'end_date',
        'geometry', 'tile_name', 'epsg' and 'bounds_epsg'.
    output_dir : Union[Path, str]
        Directory where output files and job tracking information will be stored.
    product_type : WorldCerealProductType
        Type of product to generate. Defaults to WorldCerealProductType.CROPLAND.
    cropland_parameters : CropLandParameters
        Parameters for cropland inference.
    croptype_parameters : CropTypeParameters
        Parameters for crop type inference.
    backend_context : BackendContext
        Context for the backend to use. Defaults to BackendContext(Backend.CDSE).
    target_epsg : Optional[int]
        EPSG code for the target coordinate reference system.
        If None, no reprojection will be performed.
    s1_orbit_state : Optional[Literal["ASCENDING", "DESCENDING"]]
        Sentinel-1 orbit state to use ('ASCENDING' or 'DESCENDING').
        If None, no specific orbit state is enforced.
    job_options : Optional[dict]
        Additional options for configuring the inference jobs. Defaults to None.
    parallel_jobs : int
        Number of parallel jobs to manage on the backend. Defaults to 2. Note that load
        balancing does not guarantee that all jobs will run in parallel.

    Returns
    -------
    tuple[InferenceJobManager, CsvJobDatabase, Callable]
        A tuple containing:
        - InferenceJobManager: The job manager for handling inference jobs.
        - CsvJobDatabase: The job database for tracking job information.
        - Callable: A function to create individual inference jobs.

    Raises
    ------
    ValueError
        If the production grid is of an unsupported type, misses required
        attributes, or yields no jobs to run.
    """

    # Setup output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Make a connection to the OpenEO backend
    backend = backend_context.backend
    connection = BACKEND_CONNECTIONS[backend]()

    # Setup the job manager
    logger.info("Setting up the job manager.")
    manager = InferenceJobManager(root_dir=output_dir)
    manager.add_backend(
        backend.value, connection=connection, parallel_jobs=parallel_jobs
    )

    # Configure job tracking CSV file
    job_tracking_csv = output_dir / "job_tracking.csv"

    job_db = CsvJobDatabase(path=job_tracking_csv)
    if not job_db.exists():
        logger.info("Job tracking file does not exist, creating new jobs.")

        # Accepting plain string paths is a backward-compatible convenience.
        if isinstance(production_grid, (str, Path)):
            production_gdf = gpd.read_parquet(production_grid)
        elif isinstance(production_grid, gpd.GeoDataFrame):
            production_gdf = production_grid
        else:
            raise ValueError("production_grid must be a Path or a GeoDataFrame.")

        REQUIRED_ATTRIBUTES = [
            "start_date",
            "end_date",
            "geometry",
            "tile_name",
            "epsg",
            "bounds_epsg",
        ]
        # ROBUSTNESS FIX: validate with an explicit exception instead of
        # `assert`, which is silently stripped when Python runs with -O.
        # Report all missing columns at once.
        missing = [
            attr for attr in REQUIRED_ATTRIBUTES if attr not in production_gdf.columns
        ]
        if missing:
            raise ValueError(
                f"The production grid is missing required column(s): {missing}"
            )

        job_df = production_gdf[REQUIRED_ATTRIBUTES].copy()

        df = manager._normalize_df(job_df)
        # Save the job tracking DataFrame to the job database
        job_db.persist(df)

    else:
        logger.info("Job tracking file already exists, skipping job creation.")

    # Define the job creation function; per-row arguments are supplied later
    # by the job manager, the static configuration is bound here.
    start_job = partial(
        create_inference_job,
        product_type=product_type,
        cropland_parameters=cropland_parameters,
        croptype_parameters=croptype_parameters,
        s1_orbit_state=s1_orbit_state,
        job_options=job_options,
        target_epsg=target_epsg,
    )

    # Check if there are jobs to run
    if job_db.df.empty:
        logger.warning("No jobs to run. The job tracking CSV is empty.")
        raise ValueError(
            "No jobs to run. The job tracking CSV is empty. "
            "Please check the production grid and ensure it contains valid data."
        )

    return manager, job_db, start_job
worldcereal/openeo/__init__.py ADDED
File without changes
worldcereal/openeo/feature_extractor.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """openEO UDF to compute Presto/Prometheo features."""
2
+
3
+ import copy
4
+ import functools
5
+ import logging
6
+ import random
7
+ import sys
8
+ import urllib.request
9
+ import zipfile
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ import numpy as np
14
+ import xarray as xr
15
+ from openeo.metadata import CollectionMetadata
16
+ from openeo.udf import XarrayDataCube
17
+ from openeo.udf.udf_data import UdfData
18
+ from pyproj import Transformer
19
+ from pyproj.crs import CRS
20
+ from scipy.ndimage import (
21
+ convolve,
22
+ zoom,
23
+ )
24
+ from shapely.geometry import Point
25
+ from shapely.ops import transform
26
+
27
+ sys.path.append("feature_deps")
28
+
29
+ import torch # noqa: E402
30
+
31
+ PROMETHEO_WHL_URL = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/worldcereal/dependencies/prometheo-0.0.3-py3-none-any.whl"
32
+
33
+ GFMAP_BAND_MAPPING = {
34
+ "S2-L2A-B02": "B2",
35
+ "S2-L2A-B03": "B3",
36
+ "S2-L2A-B04": "B4",
37
+ "S2-L2A-B05": "B5",
38
+ "S2-L2A-B06": "B6",
39
+ "S2-L2A-B07": "B7",
40
+ "S2-L2A-B08": "B8",
41
+ "S2-L2A-B8A": "B8A",
42
+ "S2-L2A-B11": "B11",
43
+ "S2-L2A-B12": "B12",
44
+ "S1-SIGMA0-VH": "VH",
45
+ "S1-SIGMA0-VV": "VV",
46
+ "AGERA5-TMEAN": "temperature_2m",
47
+ "AGERA5-PRECIP": "total_precipitation",
48
+ }
49
+
50
+ LAT_HARMONIZED_NAME = "GEO-LAT"
51
+ LON_HARMONIZED_NAME = "GEO-LON"
52
+ EPSG_HARMONIZED_NAME = "GEO-EPSG"
53
+
54
+
55
+ logger = logging.getLogger(__name__)
56
+
57
+
58
@functools.lru_cache(maxsize=1)
def unpack_prometheo_wheel(wheel_url: str) -> Path:
    """Download the Prometheo wheel and unpack it into a local dependency dir.

    The ``lru_cache`` decorator ensures the download/extract happens at most
    once per worker process for a given URL.

    Parameters
    ----------
    wheel_url : str
        HTTP(S) URL of the Prometheo ``.whl`` file.

    Returns
    -------
    Path
        Directory containing the unpacked wheel contents, suitable for
        appending to ``sys.path``.
    """
    destination_dir = Path.cwd() / "dependencies" / "prometheo"
    destination_dir.mkdir(exist_ok=True, parents=True)

    # Download the wheel next to the working directory, extract it, then
    # remove the archive: only the extracted contents are needed afterwards
    # (the original implementation left the .whl file behind).
    wheel_path = Path.cwd() / Path(wheel_url).name
    urllib.request.urlretrieve(wheel_url, filename=wheel_path)
    try:
        with zipfile.ZipFile(wheel_path, "r") as zip_ref:
            zip_ref.extractall(destination_dir)
    finally:
        wheel_path.unlink(missing_ok=True)
    return destination_dir
70
+
71
+
72
@functools.lru_cache(maxsize=1)
def compile_encoder(presto_encoder):
    """Helper function that compiles the encoder of a Presto model
    and performs a warm-up on dummy data. The lru_cache decorator
    ensures caching on compute nodes to be able to actually benefit
    from the compilation process.

    Parameters
    ----------
    presto_encoder : nn.Module
        Encoder part of Presto model to compile

    Returns
    -------
    The compiled encoder module (same object type as returned by
    ``torch.compile``).
    """

    presto_encoder = torch.compile(presto_encoder)  # type: ignore

    # Warm-up: run three forward passes on dummy inputs so the compiled
    # kernels are generated before real inference.
    # NOTE(review): the dummy shapes assume a (batch=1, timesteps=12,
    # bands=17) input plus (1, 12) dynamic-world labels and (1, 2) latlons —
    # confirm these match the deployed Presto encoder signature.
    for _ in range(3):
        presto_encoder(
            torch.rand((1, 12, 17)),
            torch.ones((1, 12)).long(),
            torch.rand(1, 2),
        )

    return presto_encoder
96
+
97
+
98
def evaluate_resolution(inarr: xr.DataArray, epsg: int) -> int:
    """Helper function to get the resolution in meters for
    the input array.

    Parameters
    ----------
    inarr : xr.DataArray
        input array to determine resolution for.
    epsg : int
        EPSG code of the array's CRS. For lat/lon input (EPSG:4326) the
        first two grid points are reprojected to EPSG:3857 so the spacing
        can be expressed in meters.

    Returns
    -------
    int
        resolution in meters.
    """

    if epsg == 4326:
        logger.info(
            "Converting WGS84 coordinates to EPSG:3857 to determine resolution."
        )

        transformer = Transformer.from_crs(epsg, 3857, always_xy=True)
        # Only the first two grid points are needed to measure the x-spacing;
        # the original implementation built and reprojected a Point for every
        # coordinate pair, which is O(n) wasted work on large tiles.
        points = [
            transform(transformer.transform, Point(x, y))
            for x, y in zip(inarr.x.values[:2], inarr.y.values[:2])
        ]

        resolution = abs(points[1].x - points[0].x)

    else:
        # Projected CRS: the coordinate spacing is already in meters.
        resolution = abs(inarr.x[1].values - inarr.x[0].values)

    logger.info(f"Resolution for computing slope: {resolution}")

    return resolution
130
+
131
+
132
def compute_slope(inarr: xr.DataArray, resolution: int) -> xr.DataArray:
    """Computes the slope using the scipy library. The input array should
    have the following bands: 'elevation' And no time dimension. Returns a
    new DataArray containing the new `slope` band.

    Pipeline: fill nodata (65535) gaps, downsample the DEM to 20 m, compute
    Sobel-style gradients, convert to slope in degrees, then upsample back
    to the original resolution. Invalid pixels are set to 65535 in the
    output and the result is cast to uint16.

    Parameters
    ----------
    inarr : xr.DataArray
        input array containing a band 'elevation'.
    resolution : int
        resolution of the input array in meters.

    Returns
    -------
    xr.DataArray
        output array containing 'slope' band in degrees.

    Raises
    ------
    NotImplementedError
        If the 20m/resolution downsampling factor is < 1 or odd.
        NOTE(review): this also rejects factor == 1 (native 20 m input) —
        confirm that is intended.
    """

    def _rolling_fill(darr, max_iter=2):
        """Helper function that also reflects values inside
        a patch with NaNs."""
        # Recursively fill NaNs by copying values rolled in from the four
        # neighbouring directions (shuffled to avoid directional bias),
        # stopping early when no NaNs remain or after max_iter passes.
        if max_iter == 0:
            return darr
        else:
            max_iter -= 1
            # arr of shape (rows, cols)
            mask = np.isnan(darr)

            if ~np.any(mask):
                return darr

            roll_params = [(0, 1), (0, -1), (1, 0), (-1, 0)]
            random.shuffle(roll_params)

            for roll_param in roll_params:
                rolled = np.roll(darr, roll_param, axis=(0, 1))
                darr[mask] = rolled[mask]

            return _rolling_fill(darr, max_iter=max_iter)

    def _downsample(arr: np.ndarray, factor: int) -> np.ndarray:
        """Downsamples a 2D NumPy array by a given factor with average resampling and reflect padding.

        Parameters
        ----------
        arr : np.ndarray
            The 2D input array.
        factor : int
            The factor by which to downsample. For example, factor=2 downsamples by 2x.

        Returns
        -------
        np.ndarray
            Downsampled array.
        """

        # Get the original shape of the array
        X, Y = arr.shape

        # Calculate how much padding is needed for each dimension
        pad_X = (
            factor - (X % factor)
        ) % factor  # Ensures padding is only applied if needed
        pad_Y = (
            factor - (Y % factor)
        ) % factor  # Ensures padding is only applied if needed

        # Pad the array using 'reflect' mode
        padded = np.pad(arr, ((0, pad_X), (0, pad_Y)), mode="reflect")

        # Reshape the array to form blocks of size 'factor' x 'factor'
        reshaped = padded.reshape(
            (X + pad_X) // factor, factor, (Y + pad_Y) // factor, factor
        )

        # Take the mean over the factor-sized blocks
        downsampled = np.nanmean(reshaped, axis=(1, 3))

        return downsampled

    dem = inarr.sel(bands="elevation").values
    dem_arr = dem.astype(np.float32)

    # Invalid to NaN and keep track of these pixels
    dem_arr[dem_arr == 65535] = np.nan
    idx_invalid = np.isnan(dem_arr)

    # Fill NaNs with rolling fill
    dem_arr = _rolling_fill(dem_arr)

    # We make sure DEM is at 20m for slope computation
    # compatible with global slope collection
    factor = int(20 / resolution)
    if factor < 1 or factor % 2 != 0:
        raise NotImplementedError(
            f"Unsupported resolution for slope computation: {resolution}"
        )
    dem_arr_downsampled = _downsample(dem_arr, factor)
    # Remember odd original dimensions so the upsampled result can be
    # trimmed back to the exact input shape.
    x_odd, y_odd = dem_arr.shape[0] % 2 != 0, dem_arr.shape[1] % 2 != 0

    # Mask NaN values in the DEM data
    dem_masked = np.ma.masked_invalid(dem_arr_downsampled)

    # Define convolution kernels for x and y gradients (simple finite difference approximation)
    kernel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]) / (
        8.0 * 20  # array is now at 20m resolution
    )  # x-derivative kernel

    kernel_y = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]]) / (
        8.0 * 20  # array is now at 20m resolution
    )  # y-derivative kernel

    # Apply convolution to compute gradients
    dx = convolve(dem_masked, kernel_x)  # Gradient in the x-direction
    dy = convolve(dem_masked, kernel_y)  # Gradient in the y-direction

    # Reapply the mask to the gradients
    dx = np.ma.masked_where(dem_masked.mask, dx)
    dy = np.ma.masked_where(dem_masked.mask, dy)

    # Calculate the magnitude of the gradient (rise/run)
    gradient_magnitude = np.ma.sqrt(dx**2 + dy**2)

    # Convert gradient magnitude to slope (in degrees)
    slope = np.ma.arctan(gradient_magnitude) * (180 / np.pi)

    # Upsample to original resolution with bilinear interpolation
    # (nearest-neighbour for the validity mask so it stays boolean).
    mask = slope.mask
    mask = zoom(mask, zoom=factor, order=0)
    slope = zoom(slope, zoom=factor, order=1)
    slope[mask] = 65535

    # Strip one row or column if original array was odd in that dimension
    if x_odd:
        slope = slope[:-1, :]
    if y_odd:
        slope = slope[:, :-1]

    # Fill slope values where the original DEM had NaNs
    slope[idx_invalid] = 65535
    slope[np.isnan(slope)] = 65535
    slope = slope.astype(np.uint16)

    return xr.DataArray(
        slope[None, :, :],
        dims=("bands", "y", "x"),
        coords={
            "bands": ["slope"],
            "y": inarr.y,
            "x": inarr.x,
        },
    )
284
+
285
+
286
def select_timestep_from_temporal_features(
    features: xr.DataArray, target_date: Optional[str] = None
) -> xr.DataArray:
    """Pick one timestep out of a temporal feature cube.

    Parameters
    ----------
    features : xr.DataArray
        Temporal features with time dimension preserved.
    target_date : str, optional
        Target date in ISO format (YYYY-MM-DD). If None, selects middle timestep.

    Returns
    -------
    xr.DataArray
        Features for the selected timestep with time dimension removed.

    Raises
    ------
    ValueError
        If `target_date` falls outside the cube's temporal extent.
    """
    if target_date is None:
        # No date requested: take the central timestep.
        return features.isel(t=len(features.t) // 2)

    requested = np.datetime64(target_date)

    # Reject dates outside the available time range before snapping.
    earliest = features.t.min().values
    latest = features.t.max().values
    if not (earliest <= requested <= latest):
        raise ValueError(
            f"Target date {target_date} is outside the temporal extent of features. "
            f"Available time range: {earliest} to {latest}"
        )

    # Snap to the nearest available timestep.
    return features.sel(t=requested, method="nearest")
325
+
326
+
327
def extract_presto_embeddings(
    inarr: xr.DataArray, parameters: dict, epsg: int
) -> xr.DataArray:
    """Executes the feature extraction process on the input array.

    Parameters
    ----------
    inarr : xr.DataArray
        Input cube with dims (bands, t, y, x); exactly 12 timesteps required.
    parameters : dict
        UDF user context. Recognized keys: ``presto_model_url`` (required),
        ``prometheo_wheel_url``, ``ignore_dependencies``, ``batch_size``,
        ``temporal_prediction``, ``target_date``.
    epsg : int
        EPSG code of the input cube's CRS.

    Returns
    -------
    xr.DataArray
        Presto feature cube with dims (bands, y, x).

    Raises
    ------
    ValueError
        If `epsg` is None, ``presto_model_url`` is missing, or the input
        does not have exactly 12 timesteps.
    """

    if epsg is None:
        raise ValueError(
            "EPSG code is required for Presto feature extraction, but was "
            "not correctly initialized."
        )
    if "presto_model_url" not in parameters:
        raise ValueError('Missing required parameter "presto_model_url"')

    presto_model_url = parameters.get("presto_model_url")
    logger.info(f'Loading Presto model from "{presto_model_url}"')
    prometheo_wheel_url = parameters.get("prometheo_wheel_url", PROMETHEO_WHL_URL)
    logger.info(f'Loading Prometheo wheel from "{prometheo_wheel_url}"')

    ignore_dependencies = parameters.get("ignore_dependencies", False)
    if ignore_dependencies:
        logger.info(
            "`ignore_dependencies` flag is set to True. Make sure that "
            "Presto and its dependencies are available on the runtime "
            "environment"
        )

    # The below is required to avoid flipping of the result
    # when running on OpenEO backend!
    inarr = inarr.transpose(
        "bands", "t", "x", "y"
    )  # Presto/Prometheo expects xy dimension order

    # Change the band names
    new_band_names = [GFMAP_BAND_MAPPING.get(b.item(), b.item()) for b in inarr.bands]
    inarr = inarr.assign_coords(bands=new_band_names)

    # Log pixel statistics
    total_pixels = inarr.size
    num_nan_pixels = np.isnan(inarr.values).sum()
    num_zero_pixels = (inarr.values == 0).sum()
    num_nodatavalue_pixels = (inarr.values == 65535).sum()
    logger.info("Band names: " + ", ".join(inarr.bands.values))
    logger.debug(
        f"Array dtype: {inarr.dtype}, "
        f"Array size: {inarr.shape}, total pixels: {total_pixels}, "
        f"Pixel statistics: NaN pixels = {num_nan_pixels} "
        f"({num_nan_pixels / total_pixels * 100:.2f}%), "
        f"0 pixels = {num_zero_pixels} "
        f"({num_zero_pixels / total_pixels * 100:.2f}%), "
        f"NODATAVALUE pixels = {num_nodatavalue_pixels} "
        f"({num_nodatavalue_pixels / total_pixels * 100:.2f}%)"
    )

    # Log mean value (ignoring NaNs) per band
    for band in inarr.bands.values:
        band_data = inarr.sel(bands=band).values
        mean_value = np.nanmean(band_data)
        logger.debug(f"Band '{band}': Mean value (ignoring NaNs) = {mean_value:.2f}")

    # Handle NaN values in Presto compatible way
    # (Presto treats 65535 as its nodata sentinel)
    inarr = inarr.fillna(65535)

    if not ignore_dependencies:
        # Unzip the Presto dependencies on the backend
        logger.info("Unpacking prometheo wheel")
        deps_dir = unpack_prometheo_wheel(prometheo_wheel_url)

        logger.info("Appending dependencies")
        sys.path.append(str(deps_dir))

    if "slope" not in inarr.bands:
        # If 'slope' is not present we need to compute it here
        logger.warning("`slope` not found in input array. Computing ...")
        resolution = evaluate_resolution(inarr.isel(t=0), epsg)
        slope = compute_slope(inarr.isel(t=0), resolution)
        # Slope is static in time: broadcast it across all timesteps.
        slope = slope.expand_dims({"t": inarr.t}, axis=0).astype("float32")

        inarr = xr.concat([inarr.astype("float32"), slope], dim="bands")

    batch_size = parameters.get("batch_size", 256)
    temporal_prediction = parameters.get("temporal_prediction", False)
    target_date = parameters.get("target_date", None)
    logger.info(
        (
            f"Extracting Presto features with batch size {batch_size}, "
            f"temporal_prediction={temporal_prediction}, "
            f"target_date={target_date}"
        )
    )

    # TODO: compile_presto not used for now?
    # compile_presto = parameters.get("compile_presto", False)
    # self.logger.info(f"Compile presto: {compile_presto}")

    logger.info("Loading Presto model for inference")

    # Imports are deferred: prometheo only becomes importable after the
    # wheel is unpacked and appended to sys.path above.
    # TODO: try to take run_model_inference from worldcereal
    from prometheo.datasets.worldcereal import run_model_inference
    from prometheo.models import Presto
    from prometheo.models.pooling import PoolingMethods
    from prometheo.models.presto.wrapper import load_presto_weights

    presto_model = Presto()
    presto_model = load_presto_weights(presto_model, presto_model_url)

    logger.info("Extracting presto features")
    # Check if we have the expected 12 timesteps
    if len(inarr.t) != 12:
        raise ValueError(f"Can only run Presto on 12 timesteps, got: {len(inarr.t)}")

    # Determine pooling method based on temporal_prediction parameter:
    # TIME keeps per-timestep features, GLOBAL pools over time.
    pooling_method = (
        PoolingMethods.TIME if temporal_prediction else PoolingMethods.GLOBAL
    )
    logger.info(f"Using pooling method: {pooling_method}")

    features = run_model_inference(
        inarr,
        presto_model,
        epsg=epsg,
        batch_size=batch_size,
        pooling_method=pooling_method,
    )

    # If temporal prediction, select specific timestep based on target_date
    if temporal_prediction:
        features = select_timestep_from_temporal_features(features, target_date)

    features = features.transpose(
        "bands", "y", "x"
    )  # openEO expects yx order after the UDF

    return features
460
+
461
+
462
def get_latlons(inarr: xr.DataArray, epsg: int) -> xr.DataArray:
    """Returns the latitude and longitude coordinates of the given array in
    a dataarray. Returns a dataarray with the same width/height of the input
    array, but with two bands, one for latitude and one for longitude. The
    metadata coordinates of the output array are the same as the input
    array, as the array wasn't reprojected but instead new features were
    computed.

    The latitude and longitude band names are standardized to the names
    `LAT_HARMONIZED_NAME` and `LON_HARMONIZED_NAME` respectively.

    Raises
    ------
    ValueError
        If `epsg` is None, as the CRS is then unknown.
    """

    lon = inarr.coords["x"]
    lat = inarr.coords["y"]
    lon, lat = np.meshgrid(lon, lat)

    if epsg is None:
        # Fix: raise ValueError instead of a bare Exception, consistent with
        # the missing-EPSG check in `extract_presto_embeddings`. ValueError
        # is a subclass of Exception, so existing callers remain compatible.
        raise ValueError(
            "EPSG code was not defined, cannot extract lat/lon array "
            "as the CRS is unknown."
        )

    # If the coordinates are not in EPSG:4326, we need to reproject them
    if epsg != 4326:
        # Initializes a pyproj reprojection object
        transformer = Transformer.from_crs(
            crs_from=CRS.from_epsg(epsg),
            crs_to=CRS.from_epsg(4326),
            always_xy=True,
        )
        lon, lat = transformer.transform(xx=lon, yy=lat)

    # Create a two channel numpy array of the lat and lons together by stacking
    latlon = np.stack([lat, lon])

    # Repack in a dataarray
    return xr.DataArray(
        latlon,
        dims=["bands", "y", "x"],
        coords={
            "bands": [LAT_HARMONIZED_NAME, LON_HARMONIZED_NAME],
            "y": inarr.coords["y"],
            "x": inarr.coords["x"],
        },
    )
507
+
508
+
509
def rescale_s1_backscatter(arr: xr.DataArray) -> xr.DataArray:
    """Rescales the input array from uint16 to float32 decibel values.
    The input array should be in uint16 format, as this optimizes memory usage in Open-EO
    processes. This function is called automatically on the bands of the input array, except
    if the parameter `rescale_s1` is set to False.

    Only the S1 sigma0 bands present in `arr` are rescaled; all other bands
    are left untouched. Returns the same array object with the S1 bands
    replaced in place.
    """
    s1_bands = ["S1-SIGMA0-VV", "S1-SIGMA0-VH", "S1-SIGMA0-HV", "S1-SIGMA0-HH"]
    s1_bands_to_select = list(set(arr.bands.values) & set(s1_bands))

    # Nothing to do if the array carries no S1 bands.
    if len(s1_bands_to_select) == 0:
        return arr

    data_to_rescale = arr.sel(bands=s1_bands_to_select).astype(np.float32).data

    # Assert that the values are set between 1 and 65535
    if data_to_rescale.min().item() < 1 or data_to_rescale.max().item() > 65535:
        raise ValueError(
            "The input array should be in uint16 format, with values between 1 and 65535. "
            "This restriction assures that the data was processed according to the S1 fetcher "
            "preprocessor. The user can disable this scaling manually by setting the "
            "`rescale_s1` parameter to False in the feature extractor."
        )

    # Converting back to power values
    # (uint16 encodes dB as 20*log10(x) - 83; going through the power domain
    # lets non-finite results be flagged as NaN before the final dB step)
    data_to_rescale = 20.0 * np.log10(data_to_rescale) - 83.0
    data_to_rescale = np.power(10, data_to_rescale / 10.0)
    data_to_rescale[~np.isfinite(data_to_rescale)] = np.nan

    # Converting power values to decibels
    data_to_rescale = 10.0 * np.log10(data_to_rescale)

    # Change the bands within the array
    arr.loc[dict(bands=s1_bands_to_select)] = data_to_rescale
    return arr
543
+
544
+
545
+ # Below comes the actual UDF part
546
+
547
+
548
+ # Apply the Feature Extraction UDF
549
def apply_udf_data(udf_data: UdfData) -> UdfData:
    """This is the actual openeo UDF that will be executed by the backend.

    Reads the first datacube from `udf_data`, resolves the EPSG code from
    the UDF projection info, optionally rescales S1 backscatter, extracts
    Presto embeddings, and writes the resulting cube back into `udf_data`.
    """

    cube = udf_data.datacube_list[0]
    # Deep copy: the parameters dict is mutated below (pop) and must not
    # alter the backend-provided user context.
    parameters = copy.deepcopy(udf_data.user_context)

    proj = udf_data.proj
    if proj is not None:
        proj = proj["EPSG"]

    parameters[EPSG_HARMONIZED_NAME] = proj

    arr = cube.get_array().transpose("bands", "t", "y", "x")

    epsg = parameters.pop(EPSG_HARMONIZED_NAME)
    logger.info(f"EPSG code determined for feature extraction: {epsg}")

    # S1 rescaling is on by default; users can disable it via `rescale_s1`.
    if parameters.get("rescale_s1", True):
        arr = rescale_s1_backscatter(arr)

    arr = extract_presto_embeddings(inarr=arr, parameters=parameters, epsg=epsg)

    cube = XarrayDataCube(arr)

    udf_data.datacube_list = [cube]

    return udf_data
576
+
577
+
578
+ # Change band names
579
def apply_metadata(metadata: CollectionMetadata, context: dict) -> CollectionMetadata:
    """Rename the output band labels to the 128 Presto feature names."""
    presto_band_names = [f"presto_ft_{band_idx}" for band_idx in range(128)]
    return metadata.rename_labels(dimension="bands", target=presto_band_names)
worldcereal/openeo/inference.py ADDED
@@ -0,0 +1,1191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """openEO UDF to compute Presto/Prometheo features with clean code structure."""
2
+
3
+ import logging
4
+ import os
5
+ import random
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Optional, Tuple
9
+
10
+ import numpy as np
11
+ import requests
12
+ import xarray as xr
13
+ from openeo.udf import XarrayDataCube
14
+ from openeo.udf.udf_data import UdfData
15
+ from pyproj import Transformer
16
+ from scipy.ndimage import convolve, zoom
17
+ from shapely.geometry import Point
18
+ from shapely.ops import transform
19
+
20
# Prefer loguru for logging if available; otherwise fall back to the
# standard library. When loguru is present, the root stdlib logger is
# rerouted into loguru so third-party log records share one sink.
try:
    from loguru import logger

    # Reset loguru's default sink and log INFO+ to stderr.
    logger.remove()
    logger.add(sys.stderr, level="INFO")

    class InterceptHandler(logging.Handler):
        """Bridge stdlib logging records into loguru."""

        def emit(self, record):
            level = record.levelname
            # depth=6 skips the stdlib logging frames so loguru reports the
            # original call site. NOTE(review): the right depth depends on
            # the logging call chain — confirm call sites are attributed
            # correctly in backend logs.
            logger.opt(depth=6).log(level, record.getMessage())

    # Replace existing handlers
    for h in logging.root.handlers[:]:
        logging.root.removeHandler(h)

    logging.root.setLevel(logging.INFO)
    logging.root.addHandler(InterceptHandler())

except ImportError:
    # loguru not available, use standard logging
    logger = logging.getLogger(__name__)
41
+
42
+ _MODULE_CACHE_KEY = f"__model_cache_{__name__}"
43
+
44
+ # Constants
45
+ PROMETHEO_WHL_URL = "https://s3.waw3-1.cloudferro.com/swift/v1/project_dependencies/prometheo-0.0.3-py3-none-any.whl"
46
+
47
+ GFMAP_BAND_MAPPING = {
48
+ "S2-L2A-B02": "B2",
49
+ "S2-L2A-B03": "B3",
50
+ "S2-L2A-B04": "B4",
51
+ "S2-L2A-B05": "B5",
52
+ "S2-L2A-B06": "B6",
53
+ "S2-L2A-B07": "B7",
54
+ "S2-L2A-B08": "B8",
55
+ "S2-L2A-B8A": "B8A",
56
+ "S2-L2A-B11": "B11",
57
+ "S2-L2A-B12": "B12",
58
+ "S1-SIGMA0-VH": "VH",
59
+ "S1-SIGMA0-VV": "VV",
60
+ "AGERA5-TMEAN": "temperature_2m",
61
+ "AGERA5-PRECIP": "total_precipitation",
62
+ }
63
+
64
+ LAT_HARMONIZED_NAME = "GEO-LAT"
65
+ LON_HARMONIZED_NAME = "GEO-LON"
66
+ EPSG_HARMONIZED_NAME = "GEO-EPSG"
67
+
68
+ S1_BANDS = ["S1-SIGMA0-VV", "S1-SIGMA0-VH", "S1-SIGMA0-HV", "S1-SIGMA0-HH"]
69
+ NODATA_VALUE = 65535
70
+
71
+ POSTPROCESSING_EXCLUDED_VALUES = [254, 255, 65535]
72
+ POSTPROCESSING_NODATA = 255
73
+
74
+ NUM_THREADS = 2
75
+
76
+ sys.path.append("feature_deps")
77
+ sys.path.append("onnx_deps")
78
+ import onnxruntime as ort # noqa: E402
79
+
80
+ _PROMETHEO_INSTALLED = False
81
+
82
+ # Global variables for Prometheo imports
83
+ Presto = None
84
+ load_presto_weights = None
85
+ run_model_inference = None
86
+ PoolingMethods = None
87
+
88
+
89
+ # =============================================================================
90
+ # STANDALONE FUNCTIONS (Work in both apply_udf_data and apply_metadata contexts)
91
+ # =============================================================================
92
def get_model_cache():
    """Return the process-wide model cache dict, creating it on first use.

    The cache is stashed as an attribute on the ``sys`` module so it
    survives re-imports of this UDF module within the same worker process.
    """
    cache = getattr(sys, _MODULE_CACHE_KEY, None)
    if cache is None:
        cache = {}
        setattr(sys, _MODULE_CACHE_KEY, cache)
    return cache
97
+
98
+
99
def _ensure_prometheo_dependencies():
    """Non-cached dependency check.

    Makes the prometheo package importable, installing it from the wheel if
    necessary, and rebinds the module-level globals (`Presto`,
    `load_presto_weights`, `run_model_inference`, `PoolingMethods`) so
    other functions in this module can use them. Idempotent via the
    `_PROMETHEO_INSTALLED` flag.
    """
    # The `global` declaration makes the `from ... import` statements below
    # bind the module-level names instead of function locals.
    global _PROMETHEO_INSTALLED, Presto, load_presto_weights, run_model_inference, PoolingMethods

    if _PROMETHEO_INSTALLED:
        return

    try:
        # Try to import first
        from prometheo.datasets.worldcereal import run_model_inference
        from prometheo.models import Presto
        from prometheo.models.pooling import PoolingMethods
        from prometheo.models.presto.wrapper import load_presto_weights

        # They're now available in the global scope
        _PROMETHEO_INSTALLED = True
        return
    except ImportError:
        pass

    # Installation required
    logger.info("Prometheo not available, installing...")
    _install_prometheo()

    # Import immediately after installation - these will be available globally
    from prometheo.datasets.worldcereal import run_model_inference
    from prometheo.models import Presto
    from prometheo.models.pooling import PoolingMethods
    from prometheo.models.presto.wrapper import load_presto_weights

    # Prometheo pulls in torch; configure its CPU threading once here.
    optimize_pytorch_cpu_performance(NUM_THREADS)
    _PROMETHEO_INSTALLED = True
131
+
132
+
133
def _install_prometheo():
    """Non-cached installation function.

    Downloads the prometheo wheel, extracts it into a fresh temp directory
    and appends that directory to ``sys.path``. On failure the temp
    directory is removed and the exception re-raised.
    """
    import shutil
    import tempfile
    import urllib.request
    import zipfile

    temp_dir = Path(tempfile.mkdtemp())
    try:
        # Download wheel
        # NOTE(review): urlretrieve without a filename stores the wheel in a
        # system temp file that is never removed here — confirm cleanup is
        # acceptable to leave to the OS/worker lifecycle.
        wheel_path, _ = urllib.request.urlretrieve(PROMETHEO_WHL_URL)

        # Extract to temp directory
        with zipfile.ZipFile(wheel_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        # Add to Python path
        sys.path.append(str(temp_dir))
        logger.info(f"Prometheo installed to {temp_dir}.")

    except Exception as e:
        if temp_dir.exists():
            shutil.rmtree(temp_dir)
        logger.error(f"Failed to install prometheo: {e}")
        raise
158
+
159
+
160
def load_onnx_model_cached(model_url: str):
    """Download an ONNX classification model and build an inference session.

    Results are cached per ``model_url`` in the module cache, so each
    worker downloads and initializes a given model only once.

    Parameters
    ----------
    model_url : str
        HTTP(S) URL of the ONNX model file.

    Returns
    -------
    tuple
        ``(model, sorted_lut)`` where ``model`` is an
        ``onnxruntime.InferenceSession`` and ``sorted_lut`` maps class
        names to integer labels, sorted by label value.

    Raises
    ------
    requests.HTTPError
        If the model download returns an HTTP error status.
    """

    cache = get_model_cache()
    if model_url in cache:
        logger.debug(f"ONNX model cache hit for {model_url}.")
        return cache[model_url]

    logger.info(f"Loading ONNX model from {model_url}")
    response = requests.get(model_url, timeout=120)
    # Fix: fail fast on HTTP errors. Without this, an error page's bytes
    # would reach InferenceSession and fail with an opaque protobuf error.
    response.raise_for_status()

    session_options, providers = optimize_onnx_cpu_performance(NUM_THREADS)

    model = ort.InferenceSession(response.content, session_options, providers=providers)

    metadata = model.get_modelmeta().custom_metadata_map
    # NOTE(security): eval on downloaded model metadata — only load models
    # from trusted URLs. If `class_params` is guaranteed to be a plain
    # literal, ast.literal_eval would be the safer drop-in replacement.
    class_params = eval(metadata["class_params"], {"__builtins__": None}, {})

    lut = dict(zip(class_params["class_names"], class_params["class_to_label"]))
    sorted_lut = dict(sorted(lut.items(), key=lambda item: item[1]))

    result = (model, sorted_lut)
    cache[model_url] = result
    return result
184
+
185
+
186
def load_presto_weights_cached(presto_model_url: str):
    """Manual caching for Presto weights with dependency check.

    Ensures prometheo is importable (installing it on first use), builds a
    Presto model, loads the weights from `presto_model_url` and caches the
    loaded model per URL in the module cache.
    """
    cache = get_model_cache()
    if presto_model_url in cache:
        logger.debug(f"Presto model cache hit for {presto_model_url}")
        return cache[presto_model_url]

    # Ensure dependencies are available (not cached)
    # This also rebinds the module-level `Presto` / `load_presto_weights`.
    _ensure_prometheo_dependencies()

    logger.info(f"Loading Presto weights from: {presto_model_url}")

    model = Presto()  # type: ignore
    result = load_presto_weights(model, presto_model_url)  # type: ignore

    cache[presto_model_url] = result
    return result
203
+
204
+
205
def get_output_labels(
    lut_sorted: dict, postprocess_parameters: Optional[dict] = None
) -> list:
    """Generate output band names from LUT - works in both contexts.

    Parameters
    ----------
    lut_sorted : dict
        Sorted lookup table mapping class names to labels.
    postprocess_parameters : dict, optional
        Postprocessing parameters to determine whether to keep per-class
        probability bands. If not provided, we assume all probabilities
        are kept.

    Returns
    -------
    list
        Band names: ["classification", "probability"], optionally followed
        by one "probability_<class>" band per class.
    """
    # Fix: default was a mutable `{}` argument, which is shared across
    # calls; use None and create a fresh dict instead.
    if postprocess_parameters is None:
        postprocess_parameters = {}

    # Determine whether to remove per-class probability bands
    # based on postprocessing parameters
    postprocessing_enabled = postprocess_parameters.get("enabled", True)
    keep_class_probs = postprocess_parameters.get("keep_class_probs", True)
    if postprocessing_enabled and (not keep_class_probs):
        # Only classification and overall probability
        return ["classification", "probability"]

    # Include per-class probabilities (dicts preserve the sorted order)
    return ["classification", "probability"] + [
        f"probability_{name}" for name in lut_sorted
    ]
228
+
229
+
230
def optimize_pytorch_cpu_performance(num_threads):
    """CPU-specific optimizations for Prometheo.

    Configures PyTorch intra/inter-op threading, sets BLAS/OMP thread env
    vars, enables mkldnn where available, and disables autograd for
    inference. Returns the thread count that was applied.

    NOTE(review): setting OMP/MKL env vars after torch is already imported
    may have no effect on an already-initialized thread pool, and
    `set_num_interop_threads` raises if parallel work has already run —
    confirm this is called before any torch usage.
    """
    import torch

    # Thread configuration

    torch.set_num_threads(num_threads)
    torch.set_num_interop_threads(
        num_threads
    )  # TODO test setting to 4 due to parallel slope cal ect
    os.environ["OMP_NUM_THREADS"] = str(num_threads)
    os.environ["MKL_NUM_THREADS"] = str(num_threads)
    os.environ["OPENBLAS_NUM_THREADS"] = str(num_threads)

    logger.info(f"PyTorch CPU: using {num_threads} threads")

    # CPU-specific optimizations
    if hasattr(torch.backends, "mkldnn"):
        torch.backends.mkldnn.enabled = True

    torch.set_grad_enabled(False)  # Disable gradients for inference

    return num_threads
253
+
254
+
255
def optimize_onnx_cpu_performance(num_threads):
    """Build ONNX Runtime session options tuned for CPU inference.

    Parameters
    ----------
    num_threads : int
        Thread count applied to both intra-op and inter-op parallelism.

    Returns
    -------
    tuple
        ``(session_options, providers)`` where providers contains only
        the CPU execution provider.
    """
    opts = ort.SessionOptions()

    # Thread configuration
    opts.intra_op_num_threads = num_threads
    # TODO test setting to 1 due to sequential nature
    opts.inter_op_num_threads = num_threads

    # CPU-specific optimizations
    opts.enable_cpu_mem_arena = True
    opts.enable_mem_pattern = True
    opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

    return opts, ["CPUExecutionProvider"]
273
+
274
+
275
+ # =============================================================================
276
+ # POSTPROCESSING FUNCTIONS
277
+ # =============================================================================
278
+
279
+
280
def majority_vote(
    base_labels: xr.DataArray,
    max_probabilities: xr.DataArray,
    kernel_size: int,
) -> xr.DataArray:
    """Majority vote is performed using a sliding local kernel.
    For each pixel, the voting of a final class is done by counting
    neighbours values.
    Pixels that have one of the specified excluded values are
    excluded in the voting process and are unchanged.

    The prediction probabilities are reevaluated by taking, for each pixel,
    the average of probabilities of the neighbors that belong to the winning class.
    (For example, if a pixel was voted to class 2 and there are three
    neighbors of that class, then the new probability is the sum of the
    old probabilities of each pixels divided by 3)

    Parameters
    ----------
    base_labels : xr.DataArray
        The original predicted classification labels.
    max_probabilities : xr.DataArray
        The original probabilities of the winning class (ranging between 0 and 100).
    kernel_size : int
        The size of the kernel used for the neighbour around the pixel.

    Returns
    -------
    xr.DataArray
        The cleaned classification labels and associated probabilities,
        stacked as a 2-band (classification, probability) array.
    """
    from scipy.signal import convolve2d

    prediction = base_labels.values
    probability = max_probabilities.values

    # As the probabilities are in integers between 0 and 100,
    # we use uint16 matrices to store the vote scores
    assert (
        kernel_size <= 25
    ), f"Kernel value cannot be larger than 25 (currently: {kernel_size}) because it might lead to scenarios where the 16-bit count matrix is overflown"

    # Build a class mapping, so classes are converted to indexes and vice-versa
    unique_values = set(np.unique(prediction))
    unique_values = sorted(unique_values - set(POSTPROCESSING_EXCLUDED_VALUES))  # type: ignore
    index_value_lut = [(k, v) for k, v in enumerate(unique_values)]

    # Per-class vote scores (probability-weighted) and averaged probabilities
    counts = np.zeros(shape=(*prediction.shape, len(unique_values)), dtype=np.uint16)
    probabilities = np.zeros(
        shape=(*probability.shape, len(unique_values)), dtype=np.uint16
    )

    # Iterates for each classes
    for cls_idx, cls_value in index_value_lut:
        # Take the binary mask of the interest class, and multiply by the probabilities
        class_mask = ((prediction == cls_value) * probability).astype(np.uint16)

        # Set to 0 the class scores where the label is excluded
        for excluded_value in POSTPROCESSING_EXCLUDED_VALUES:
            class_mask[prediction == excluded_value] = 0

        # Binary class mask, used to count HOW MANY neighbours pixels are used for this class
        binary_class_mask = (class_mask > 0).astype(np.uint16)

        # Creates the kernel
        kernel = np.ones(shape=(kernel_size, kernel_size), dtype=np.uint16)

        # Counts around the window the sum of probabilities for that given class
        counts[:, :, cls_idx] = convolve2d(class_mask, kernel, mode="same")

        # Counts the number of neighbors pixels that voted for that given class
        class_voters = convolve2d(binary_class_mask, kernel, mode="same")
        # Remove the 0 values because might create divide by 0 issues
        class_voters[class_voters == 0] = 1

        probabilities[:, :, cls_idx] = np.divide(counts[:, :, cls_idx], class_voters)

    # Initializes output array
    aggregated_predictions = np.zeros(
        shape=(counts.shape[0], counts.shape[1]), dtype=np.uint16
    )
    # Initializes probabilities output array
    aggregated_probabilities = np.zeros(
        shape=(counts.shape[0], counts.shape[1]), dtype=np.uint16
    )

    if len(unique_values) > 0:
        # Takes the indices that have the biggest scores
        aggregated_predictions_indices = np.argmax(counts, axis=2)

        # Get the new probabilities of the predictions
        aggregated_probabilities = np.take_along_axis(
            probabilities,
            aggregated_predictions_indices.reshape(
                *aggregated_predictions_indices.shape, 1
            ),
            axis=2,
        ).squeeze()

        # Check which pixels have a counts value equal to 0
        no_score_mask = np.sum(counts, axis=2) == 0

        # convert back to values from indices
        for cls_idx, cls_value in index_value_lut:
            aggregated_predictions[aggregated_predictions_indices == cls_idx] = (
                cls_value
            )
        aggregated_predictions = aggregated_predictions.astype(np.uint16)

        # Pixels with no votes at all are set to the nodata value
        aggregated_predictions[no_score_mask] = POSTPROCESSING_NODATA
        aggregated_probabilities[no_score_mask] = POSTPROCESSING_NODATA

    # Setting excluded values back to their original values
    # NOTE(review): the *probability* band is also set to the excluded label
    # value here (not to the original probability) — confirm this is intended.
    for excluded_value in POSTPROCESSING_EXCLUDED_VALUES:
        aggregated_predictions[prediction == excluded_value] = excluded_value
        aggregated_probabilities[prediction == excluded_value] = excluded_value

    return xr.DataArray(
        np.stack((aggregated_predictions, aggregated_probabilities)),
        dims=["bands", "y", "x"],
        coords={
            "bands": ["classification", "probability"],
            "y": base_labels.y,
            "x": base_labels.x,
        },
    )
406
+
407
+
408
def smooth_probabilities(
    base_labels: xr.DataArray, class_probabilities: xr.DataArray
) -> xr.DataArray:
    """Performs gaussian smoothing on the class probabilities. Requires the
    base labels to keep the pixels that are excluded away from smoothing.

    Parameters
    ----------
    base_labels : xr.DataArray
        Predicted classification labels; pixels whose label is in
        ``POSTPROCESSING_EXCLUDED_VALUES`` are zeroed out after smoothing.
    class_probabilities : xr.DataArray
        Per-class probabilities with the class dimension first.

    Returns
    -------
    xr.DataArray
        Smoothed probabilities, renormalized to sum to 100 per pixel and
        cast to uint16, with the input's coords and dims.
    """
    from scipy.signal import convolve2d

    base_labels_vals = base_labels.values
    probabilities_vals = class_probabilities.values

    # np.isin replaces the deprecated np.in1d (deprecated in NumPy 2.0)
    # and handles multi-dimensional inputs directly, so no reshape dance.
    excluded_mask = np.isin(base_labels_vals, POSTPROCESSING_EXCLUDED_VALUES)

    # Pseudo-gaussian 3x3 kernel; normalized by its sum after convolution
    conv_kernel = np.array([[1, 2, 1], [2, 3, 2], [1, 2, 1]], dtype=np.int16)

    for class_idx in range(probabilities_vals.shape[0]):
        probabilities_vals[class_idx] = (
            convolve2d(
                probabilities_vals[class_idx],
                conv_kernel,
                mode="same",
                boundary="symm",
            )
            / conv_kernel.sum()
        )
        # Excluded pixels take no part in the smoothed result
        probabilities_vals[class_idx][excluded_mask] = 0

    # Sum of probabilities should be 1, cast to uint16
    # NOTE(review): pixels where all class probabilities are zero (e.g.
    # fully excluded) divide by zero here — unchanged legacy behavior.
    probabilities_vals = np.round(
        probabilities_vals / probabilities_vals.sum(axis=0) * 100.0
    ).astype("uint16")

    return xr.DataArray(
        probabilities_vals,
        coords=class_probabilities.coords,
        dims=class_probabilities.dims,
    )
448
+
449
+
450
def reclassify(
    base_labels: xr.DataArray,
    base_max_probs: xr.DataArray,
    probabilities: xr.DataArray,
) -> xr.DataArray:
    """Re-derive labels and winning probabilities from class probabilities.

    Parameters
    ----------
    base_labels : xr.DataArray
        Original classification labels; pixels with a label in
        ``POSTPROCESSING_EXCLUDED_VALUES`` are passed through unchanged.
    base_max_probs : xr.DataArray
        Original winning-class probabilities, used for excluded pixels.
    probabilities : xr.DataArray
        (Smoothed) per-class probabilities, class dimension first.

    Returns
    -------
    xr.DataArray
        2-band (classification, probability) array. NOTE: the
        classification band holds class *indices* from argmax; the caller
        (``Postprocessor.apply``) remaps them to actual class labels.
    """
    base_labels_vals = base_labels.values
    base_max_probs_vals = base_max_probs.values

    # np.isin replaces the deprecated np.in1d (deprecated in NumPy 2.0)
    # and handles multi-dimensional inputs directly.
    excluded_mask = np.isin(base_labels_vals, POSTPROCESSING_EXCLUDED_VALUES)

    new_labels_vals = np.argmax(probabilities.values, axis=0)
    new_max_probs_vals = np.max(probabilities.values, axis=0)

    # Excluded pixels keep their original label and probability
    new_labels_vals[excluded_mask] = base_labels_vals[excluded_mask]
    new_max_probs_vals[excluded_mask] = base_max_probs_vals[excluded_mask]

    return xr.DataArray(
        np.stack((new_labels_vals, new_max_probs_vals)),
        dims=["bands", "y", "x"],
        coords={
            "bands": ["classification", "probability"],
            "y": base_labels.y,
            "x": base_labels.x,
        },
    )
478
+
479
+
480
+ # =============================================================================
481
+ # ERROR HANDLING - SIMPLE VERSION
482
+ # =============================================================================
483
+
484
+
485
def create_nan_output_array(
    inarr: xr.DataArray, num_outputs: int, error_info: str = ""
) -> xr.DataArray:
    """Creates a NaN-filled output array with proper dimensions and coordinates.

    Parameters
    ----------
    inarr : xr.DataArray
        Input array to derive dimensions from
    num_outputs : int
        Number of output bands/classes
    error_info : str
        Error information to include in attributes for debugging

    Returns
    -------
    xr.DataArray
        NaN-filled array with proper structure
    """
    logger.error(f"Creating NaN output array due to error: {error_info}")
    logger.error(f"Input array shape: {inarr.shape}, dims: {inarr.dims}")
    try:
        # Diagnostic only: this function runs on error paths where the
        # input may be missing expected coordinates (e.g. no 't' dim), so
        # the logging itself must never raise.
        logger.error(
            f"Input array coords - bands: {inarr.bands.values}, t: {len(inarr.t)}, x: {len(inarr.x)}, y: {len(inarr.y)}"
        )
    except AttributeError:
        logger.error("Input array is missing some of the expected coordinates")

    # Create NaN array with same spatial dimensions
    nan_array = np.full(
        (num_outputs, len(inarr.y), len(inarr.x)), np.nan, dtype=np.float32
    )

    # Create output array with proper coordinates; band names are plain
    # integer placeholders since no LUT is available on the error path
    output_array = xr.DataArray(
        nan_array,
        dims=["bands", "y", "x"],
        coords={
            "bands": list(range(num_outputs)),
            "y": inarr.y,
            "x": inarr.x,
        },
        attrs={"error": error_info},
    )

    return output_array
528
+
529
+
530
+ # =============================================================================
531
+ # CLASSES (Main logic for apply_udf_data)
532
+ # =============================================================================
533
+
534
+
535
class SlopeCalculator:
    """Handles slope computation from elevation data.

    Pipeline: fill DEM NaNs, downsample to 20 m, compute slope with Sobel
    gradients, then upsample back to the input resolution.
    """

    @staticmethod
    def compute(resolution: float, elevation_data: np.ndarray) -> np.ndarray:
        """Compute slope from elevation data.

        Parameters
        ----------
        resolution : float
            Pixel resolution of ``elevation_data`` in meters; must yield an
            even integer factor to 20 m (see ``_downsample_to_20m``).
        elevation_data : np.ndarray
            2D elevation array; ``NODATA_VALUE`` pixels are treated as NaN.

        Returns
        -------
        np.ndarray
            Slope in degrees, cast to uint16, at the original resolution.
        """
        dem_arr = SlopeCalculator._prepare_dem_array(elevation_data)
        dem_downsampled = SlopeCalculator._downsample_to_20m(dem_arr, resolution)
        slope = SlopeCalculator._compute_slope_gradient(dem_downsampled)
        result = SlopeCalculator._upsample_to_original(slope, dem_arr.shape, resolution)
        return result

    @staticmethod
    def _prepare_dem_array(dem: np.ndarray) -> np.ndarray:
        """Prepare DEM array by handling NaNs and invalid values."""
        dem_arr = dem.astype(np.float32)
        # NODATA pixels become NaN so they can be filled from neighbours
        dem_arr[dem_arr == NODATA_VALUE] = np.nan
        return SlopeCalculator._fill_nans(dem_arr)

    @staticmethod
    def _fill_nans(dem_arr: np.ndarray, max_iter: int = 2) -> np.ndarray:
        """Fill NaN values using rolling fill approach.

        Copies values into NaN pixels from the four 1-pixel shifts of the
        array (in random order, to avoid a directional bias), recursing up
        to ``max_iter`` times. NaNs that remain after the last iteration
        are left in place.
        """
        if max_iter == 0 or not np.any(np.isnan(dem_arr)):
            return dem_arr

        mask = np.isnan(dem_arr)
        roll_params = [(0, 1), (0, -1), (1, 0), (-1, 0)]
        random.shuffle(roll_params)

        for roll_param in roll_params:
            rolled = np.roll(dem_arr, roll_param, axis=(0, 1))
            dem_arr[mask] = rolled[mask]

        return SlopeCalculator._fill_nans(dem_arr, max_iter - 1)

    @staticmethod
    def _downsample_to_20m(dem_arr: np.ndarray, resolution: float) -> np.ndarray:
        """Downsample DEM to 20m resolution for slope computation.

        Raises
        ------
        ValueError
            If ``20 / resolution`` is not a positive even integer.
            NOTE(review): this also rejects resolution == 20 itself
            (factor 1) — confirm that is intended.
        """
        factor = int(20 / resolution)
        if factor < 1 or factor % 2 != 0:
            raise ValueError(f"Unsupported resolution for slope: {resolution}")

        X, Y = dem_arr.shape
        # Reflect-pad so both dimensions become divisible by the factor
        pad_X, pad_Y = (
            (factor - (X % factor)) % factor,
            (factor - (Y % factor)) % factor,
        )
        padded = np.pad(dem_arr, ((0, pad_X), (0, pad_Y)), mode="reflect")

        # Block-average (NaN-aware) over factor x factor windows
        reshaped = padded.reshape(
            (X + pad_X) // factor, factor, (Y + pad_Y) // factor, factor
        )
        return np.nanmean(reshaped, axis=(1, 3))

    @staticmethod
    def _compute_slope_gradient(dem: np.ndarray) -> np.ndarray:
        """Compute slope gradient using Sobel operators.

        Kernels are scaled by 8 * cell size (20 m); the result is the
        slope angle in degrees.
        """
        kernel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]) / (8.0 * 20)
        kernel_y = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]]) / (8.0 * 20)

        dx = convolve(dem, kernel_x)
        dy = convolve(dem, kernel_y)
        gradient_magnitude = np.sqrt(dx**2 + dy**2)

        return np.arctan(gradient_magnitude) * (180 / np.pi)

    @staticmethod
    def _upsample_to_original(
        slope: np.ndarray, original_shape: Tuple[int, ...], resolution: float
    ) -> np.ndarray:
        """Upsample slope back to original resolution (bilinear zoom)."""
        factor = int(20 / resolution)
        slope_upsampled = zoom(slope, zoom=factor, order=1)

        # Handle odd dimensions
        # NOTE(review): trimming one row/column assumes exactly one pixel of
        # padding was added during downsampling — confirm for all factors.
        if original_shape[0] % 2 != 0:
            slope_upsampled = slope_upsampled[:-1, :]
        if original_shape[1] % 2 != 0:
            slope_upsampled = slope_upsampled[:, :-1]

        return slope_upsampled.astype(np.uint16)
616
+
617
+
618
class CoordinateTransformer:
    """Handles coordinate transformations and spatial operations."""

    @staticmethod
    def get_resolution(inarr: xr.DataArray, epsg: int) -> float:
        """Calculate resolution in meters.

        For projected CRSs the x-spacing of the first two pixels is used
        directly; WGS84 coordinates are reprojected first.
        """
        if epsg == 4326:
            return CoordinateTransformer._get_wgs84_resolution(inarr)
        return abs(inarr.x[1].values - inarr.x[0].values)

    @staticmethod
    def _get_wgs84_resolution(inarr: xr.DataArray) -> float:
        """Convert WGS84 coordinates to meters for resolution calculation.

        Reprojects to Web Mercator (EPSG:3857) and takes the x distance
        between the first two points.
        NOTE(review): zips inarr.x with inarr.y, pairing coordinates
        positionally — assumes both axes have at least 2 values; confirm.
        """
        transformer = Transformer.from_crs(4326, 3857, always_xy=True)
        points = [Point(x, y) for x, y in zip(inarr.x.values, inarr.y.values)]
        points = [transform(transformer.transform, point) for point in points]
        return abs(points[1].x - points[0].x)

    @staticmethod
    def get_lat_lon_array(inarr: xr.DataArray, epsg: int) -> xr.DataArray:
        """Create latitude/longitude array from coordinates.

        Returns a 2-band (lat, lon) DataArray on the input's y/x grid,
        reprojecting to WGS84 when the input CRS is not already 4326.
        """
        lon, lat = np.meshgrid(inarr.x.values, inarr.y.values)

        if epsg != 4326:
            transformer = Transformer.from_crs(epsg, 4326, always_xy=True)
            lon, lat = transformer.transform(lon, lat)

        latlon = np.stack([lat, lon])
        return xr.DataArray(
            latlon,
            dims=["bands", "y", "x"],
            coords={
                "bands": [LAT_HARMONIZED_NAME, LON_HARMONIZED_NAME],
                "y": inarr.y,
                "x": inarr.x,
            },
        )
655
+
656
+
657
class DataPreprocessor:
    """Handles data preprocessing operations."""

    @staticmethod
    def rescale_s1_backscatter(arr: xr.DataArray) -> xr.DataArray:
        """Rescale Sentinel-1 backscatter from uint16 to dB values.

        Only bands listed in ``S1_BANDS`` are touched; the array is
        returned unchanged when none are present. The conversion chain is
        uint16 -> power (via 20*log10(x) - 83, then 10^(x/10)) -> dB, with
        non-finite intermediates mapped to NaN. Modifies ``arr`` in place
        for the S1 bands and returns it.
        """
        s1_bands_present = [b for b in S1_BANDS if b in arr.bands.values]
        if not s1_bands_present:
            return arr

        s1_data = arr.sel(bands=s1_bands_present).astype(np.float32)
        DataPreprocessor._validate_s1_data(s1_data.values)

        # Convert to power values then to dB
        power_values = 20.0 * np.log10(s1_data.values) - 83.0
        power_values = np.power(10, power_values / 10.0)
        power_values[~np.isfinite(power_values)] = np.nan

        db_values = 10.0 * np.log10(power_values)
        arr.loc[dict(bands=s1_bands_present)] = db_values

        return arr

    @staticmethod
    def _validate_s1_data(data: np.ndarray) -> None:
        """Validate S1 data meets preprocessing requirements.

        Raises
        ------
        ValueError
            If values fall outside the expected uint16 range [1, NODATA_VALUE],
            which would indicate the data was already rescaled.
        """
        if data.min() < 1 or data.max() > NODATA_VALUE:
            raise ValueError(
                "S1 data should be uint16 format with values 1-65535. "
                "Set 'rescale_s1' to False to disable scaling."
            )
688
+
689
+
690
class PrestoFeatureExtractor:
    """Handles Presto feature extraction pipeline."""

    def __init__(self, parameters: Dict[str, Any]):
        # Expected keys: "presto_model_url" (required), "num_outputs",
        # "temporal_prediction", "target_date", "batch_size", "rescale_s1".
        self.parameters = parameters

    def extract(self, inarr: xr.DataArray, epsg: int) -> xr.DataArray:
        """Extract Presto features from input array.

        Parameters
        ----------
        inarr : xr.DataArray
            Input cube; must contain exactly 12 timesteps.
        epsg : int
            EPSG code of the input coordinates (required).

        Returns
        -------
        xr.DataArray
            Feature array with dims (bands, y, x), or a NaN-filled array
            (via ``create_nan_output_array``) when the 12-timestep
            requirement is not met.

        Raises
        ------
        ValueError
            If ``epsg`` is None or "presto_model_url" is missing.
        """
        if epsg is None:
            raise ValueError("EPSG code required for Presto feature extraction")

        # ONLY check top level - no nested lookup
        presto_model_url = self.parameters.get("presto_model_url")
        if not presto_model_url:
            logger.error(
                f"Missing presto_model_url. Available keys: {list(self.parameters.keys())}"
            )
            raise ValueError('Missing required parameter "presto_model_url"')

        if len(inarr.t) != 12:
            error_msg = (
                f"Presto requires exactly 12 timesteps, but got {len(inarr.t)}. "
                f"Available timesteps: {inarr.t.values}. "
                f"Patch coordinates - x: {inarr.x.values.tolist()}, y: {inarr.y.values.tolist()}"
            )
            logger.error(error_msg)

            # Return NaN array instead of crashing
            return create_nan_output_array(
                inarr, self.parameters["num_outputs"], error_msg
            )

        inarr = self._preprocess_input(inarr)

        # Slope is derived from the DEM band when not already provided
        if "slope" not in inarr.bands:
            inarr = self._add_slope_band(inarr, epsg)

        return self._run_presto_inference(inarr, epsg)

    def _preprocess_input(self, inarr: xr.DataArray) -> xr.DataArray:
        """Preprocess input array for Presto.

        Transposes to (bands, t, x, y), harmonizes band names via
        ``GFMAP_BAND_MAPPING`` and fills NaNs with ``NODATA_VALUE``.
        """
        inarr = inarr.transpose("bands", "t", "x", "y")

        # Harmonize band names
        new_bands = [GFMAP_BAND_MAPPING.get(b.item(), b.item()) for b in inarr.bands]
        inarr = inarr.assign_coords(bands=new_bands)

        return inarr.fillna(NODATA_VALUE)

    def _add_slope_band(self, inarr: xr.DataArray, epsg: int) -> xr.DataArray:
        """Compute and add slope band to array.

        Derives slope from the "COP-DEM" band at the first timestep and
        broadcasts it over the time dimension.
        """
        logger.warning("Slope band not found, computing...")
        resolution = CoordinateTransformer.get_resolution(inarr.isel(t=0), epsg)
        elevation_data = inarr.sel(bands="COP-DEM").isel(t=0).values

        slope_array = SlopeCalculator.compute(resolution, elevation_data)
        slope_da = (
            xr.DataArray(
                slope_array[None, :, :],
                dims=("bands", "y", "x"),
                coords={"bands": ["slope"], "y": inarr.y, "x": inarr.x},
            )
            .expand_dims({"t": inarr.t})
            .astype("float32")
        )

        return xr.concat([inarr.astype("float32"), slope_da], dim="bands")

    def _run_presto_inference(self, inarr: xr.DataArray, epsg: int) -> xr.DataArray:
        """Run Presto model inference with safe dependency handling.

        Loads the (cached) model weights, runs inference under
        ``torch.inference_mode`` and returns a (bands, y, x) feature array.
        For temporal predictions a single timestep is selected afterwards.
        """
        # Dependencies are now handled by load_presto_weights_cached
        import gc

        import torch

        _ensure_prometheo_dependencies()

        presto_model_url = self.parameters["presto_model_url"]

        model = load_presto_weights_cached(presto_model_url)

        # Import here to ensure dependencies are available
        pooling_method = (
            PoolingMethods.TIME  # type: ignore
            if self.parameters.get("temporal_prediction")
            else PoolingMethods.GLOBAL  # type: ignore
        )

        logger.info("Running presto inference ...")
        try:
            with torch.inference_mode():
                features = run_model_inference(
                    inarr,
                    model,
                    epsg=epsg,
                    batch_size=self.parameters.get("batch_size", 256),  # TODO optimize?
                    pooling_method=pooling_method,
                )  # type: ignore
            logger.info("Inference completed.")

            if self.parameters.get("temporal_prediction"):
                features = self._select_temporal_features(features)
            return features.transpose("bands", "y", "x")

        finally:
            # Free model/feature intermediates regardless of outcome
            gc.collect()

    def _select_temporal_features(self, features: xr.DataArray) -> xr.DataArray:
        """Select specific timestep from temporal features.

        Without a "target_date" parameter the middle timestep is used;
        otherwise the nearest timestep to the target date is selected.

        Raises
        ------
        ValueError
            If the target date lies outside the feature time range.
        """
        target_date = self.parameters.get("target_date")

        if target_date is None:
            mid_idx = len(features.t) // 2
            return features.isel(t=mid_idx)

        target_dt = np.datetime64(target_date)
        min_time, max_time = features.t.min().values, features.t.max().values

        if target_dt < min_time or target_dt > max_time:
            raise ValueError(
                f"Target date {target_date} outside feature range: {min_time} to {max_time}"
            )

        return features.sel(t=target_dt, method="nearest")
814
+
815
+
816
class ONNXClassifier:
    """Handles ONNX model inference for classification."""

    def __init__(self, parameters: Dict[str, Any]):
        # Expected keys: "classifier_url" (required).
        self.parameters = parameters

    def predict(self, features: xr.DataArray) -> xr.DataArray:
        """Run classification prediction.

        Returns a (bands, y, x) array whose bands are produced by
        ``get_output_labels`` on the model's LUT:
        classification, probability, probability_<class>...

        Raises
        ------
        ValueError
            If "classifier_url" is missing from the parameters.
        """
        classifier_url = self.parameters.get("classifier_url")
        if not classifier_url:
            logger.error(
                f"Missing classifier_url. Available keys: {list(self.parameters.keys())}"
            )
            raise ValueError('Missing required parameter "classifier_url"')

        session, lut = load_onnx_model_cached(classifier_url)
        features_flat = self._prepare_features(features)

        logger.info("Running ONNX model inference ...")
        predictions = self._run_inference(session, lut, features_flat)
        logger.info("ONNX inference completed.")

        return self._reshape_predictions(predictions, features, lut)

    def _prepare_features(self, features: xr.DataArray) -> np.ndarray:
        """Flatten (bands, x, y) features into a (pixel, band) matrix."""
        return (
            features.transpose("bands", "x", "y")
            .stack(xy=["x", "y"])
            .transpose()
            .values
        )

    def _run_inference(
        self, session: Any, lut: Dict, features: np.ndarray
    ) -> np.ndarray:
        """Run ONNX model inference.

        Assumes the session returns [labels, per-class probability maps].
        Raw labels are remapped through the LUT and probabilities scaled
        to integer percent. Returns an array of shape
        (2 + n_classes, n_pixels): label, winning probability, then
        per-class probabilities in LUT key order.
        """
        outputs = session.run(None, {"features": features})

        labels = np.zeros(len(outputs[0]), dtype=np.uint16)
        probabilities = np.zeros(len(outputs[0]), dtype=np.uint8)

        for i, (label, prob) in enumerate(zip(outputs[0], outputs[1])):
            labels[i] = lut[label]
            probabilities[i] = int(round(prob[label] * 100))

        class_probs = np.array(
            [[prob[label] for label in lut.keys()] for prob in outputs[1]]
        )
        class_probs = (class_probs * 100).round().astype(np.uint8)

        # hstack upcasts the uint8 probability columns to the uint16 label dtype
        return np.hstack([labels[:, None], probabilities[:, None], class_probs]).T

    def _reshape_predictions(
        self, predictions: np.ndarray, original_features: xr.DataArray, lut: Dict
    ) -> xr.DataArray:
        """Reshape predictions to match original spatial dimensions.

        The flat (bands, pixel) predictions are unstacked to (bands, x, y)
        — matching the x-major order used in ``_prepare_features`` — and
        transposed to the (bands, y, x) output convention.
        """
        output_labels = get_output_labels(lut)
        x_coords, y_coords = original_features.x.values, original_features.y.values

        reshaped = predictions.reshape(
            (len(output_labels), len(x_coords), len(y_coords))
        )

        return xr.DataArray(
            reshaped,
            dims=["bands", "x", "y"],
            coords={"bands": output_labels, "x": x_coords, "y": y_coords},
        ).transpose("bands", "y", "x")
885
+
886
+
887
class Postprocessor:
    """Handles postprocessing of classification results."""

    def __init__(self, parameters: Dict[str, Any], classifier_url: str):
        # parameters keys: "method" ("smooth_probabilities" or
        # "majority_vote"), "kernel_size", "keep_class_probs".
        self.parameters = parameters
        # The classifier URL is needed to reload the LUT for label remapping
        self.classifier_url = classifier_url

    def apply(self, inarr: xr.DataArray) -> xr.DataArray:
        """Apply the configured postprocessing method to a classification.

        Parameters
        ----------
        inarr : xr.DataArray
            Classifier output with bands [classification, probability,
            probability_<class>...].

        Returns
        -------
        xr.DataArray
            Postprocessed (bands, y, x) array; per-class probabilities
            are appended when "keep_class_probs" is set.

        Raises
        ------
        ValueError
            If "method" is not a supported postprocessing method.
        """
        inarr = inarr.transpose(
            "bands", "y", "x"
        )  # Ensure correct dimension order for openEO backend

        _, lookup_table = load_onnx_model_cached(self.classifier_url)

        if self.parameters.get("method") == "smooth_probabilities":
            # Cast to float for more accurate gaussian smoothing
            class_probabilities = (
                inarr.isel(bands=slice(2, None)).astype("float32") / 100.0
            )

            # Perform probability smoothing
            class_probabilities = smooth_probabilities(
                inarr.sel(bands="classification"), class_probabilities
            )

            # Reclassify (yields class *indices* in the classification band)
            new_labels = reclassify(
                inarr.sel(bands="classification"),
                inarr.sel(bands="probability"),
                class_probabilities,
            )

            # Re-apply labels: map argmax indices back to LUT class labels
            class_labels = list(lookup_table.values())

            # Create a final labels array with same dimensions as new_labels
            # (65535 marks pixels whose index matched no class)
            final_labels = xr.full_like(new_labels, fill_value=65535)
            for idx, label in enumerate(class_labels):
                final_labels.loc[{"bands": "classification"}] = xr.where(
                    new_labels.sel(bands="classification") == idx,
                    label,
                    final_labels.sel(bands="classification"),
                )
            # NOTE(review): assigning to .values of a .sel() result relies on
            # xarray returning a writable view here — confirm write-through.
            new_labels.sel(bands="classification").values = final_labels.sel(
                bands="classification"
            ).values

            # Append the per-class probabilities if required
            if self.parameters.get("keep_class_probs", False):
                new_labels = xr.concat([new_labels, class_probabilities], dim="bands")

        elif self.parameters.get("method") == "majority_vote":
            kernel_size = self.parameters.get("kernel_size", 5)

            new_labels = majority_vote(
                inarr.sel(bands="classification"),
                inarr.sel(bands="probability"),
                kernel_size=kernel_size,
            )

            # Append the per-class probabilities if required
            if self.parameters.get("keep_class_probs", False):
                class_probabilities = inarr.isel(bands=slice(2, None))
                new_labels = xr.concat([new_labels, class_probabilities], dim="bands")

        else:
            raise ValueError(
                f"Unknown post-processing method: {self.parameters.get('method')}"
            )

        new_labels = new_labels.transpose(
            "bands", "y", "x"
        )  # Ensure correct dimension order for openEO backend

        return new_labels
962
+
963
+
964
+ # =============================================================================
965
+ # MAIN UDF FUNCTIONS
966
+ # =============================================================================
967
+
968
+
969
def run_single_workflow(
    input_array: xr.DataArray,
    epsg: int,
    parameters: Dict[str, Any],
    mask: Optional[xr.DataArray] = None,
) -> xr.DataArray:
    """Run a single classification workflow with optional masking.

    Parameters
    ----------
    input_array : xr.DataArray
        Input cube (bands, t, y, x).
    epsg : int
        EPSG code of the input coordinates.
    parameters : Dict[str, Any]
        Must contain "feature_parameters" and "classifier_parameters";
        may contain "postprocess_parameters".
    mask : Optional[xr.DataArray]
        Optional boolean mask; False pixels are set to 254 (non-cropland).

    Returns
    -------
    xr.DataArray
        Classification result (bands, y, x).
    """

    # Preprocess data
    if parameters["feature_parameters"].get("rescale_s1", True):
        logger.info("Rescale s1 ...")
        input_array = DataPreprocessor.rescale_s1_backscatter(input_array)

    # Extract features
    logger.info("Extract Presto embeddings ...")
    feature_extractor = PrestoFeatureExtractor(parameters["feature_parameters"])
    features = feature_extractor.extract(input_array, epsg)
    logger.info("Presto embedding extraction done.")

    # Classify
    logger.info("Onnx classification ...")
    classifier = ONNXClassifier(parameters["classifier_parameters"])
    classes = classifier.predict(features)
    logger.info("Onnx classification done.")

    # Postprocess
    postprocess_parameters: Dict[str, Any] = parameters.get(
        "postprocess_parameters", {}
    )

    # NOTE(review): the key checked here is "enable", but get_output_labels
    # reads "enabled" (default True) — confirm which key is canonical, as a
    # mismatch silently skips postprocessing or mislabels output bands.
    if postprocess_parameters.get("enable"):
        logger.info("Postprocessing classification results ...")
        if postprocess_parameters.get("save_intermediate"):
            # Keep a prefixed copy of the raw classifier output
            classes_raw = classes.assign_coords(
                bands=[f"raw_{b}" for b in list(classes.bands.values)]
            )
        postprocessor = Postprocessor(
            postprocess_parameters,
            classifier_url=parameters.get("classifier_parameters", {}).get(
                "classifier_url"
            ),
        )

        classes = postprocessor.apply(classes)
        if postprocess_parameters.get("save_intermediate"):
            classes = xr.concat([classes, classes_raw], dim="bands")
        logger.info("Postprocessing done.")

    # Set masked areas to specific value
    if mask is not None:
        logger.info("`mask` provided, applying to classification results ...")
        classes = classes.where(mask, 254)  # 254 = non-cropland
    return classes
1023
+
1024
+
1025
def combine_results(
    croptype_result: xr.DataArray, cropland_result: xr.DataArray
) -> xr.DataArray:
    """Combine crop type results with ALL cropland classification bands.

    Each input's band names are prefixed (``croptype_`` / ``cropland_``)
    to avoid clashes; crop-type bands come first in the stacked output.
    """
    # Prefix the band names of both inputs so they stay distinguishable
    cropland_bands = [f"cropland_{b}" for b in cropland_result.bands.values]
    cropland_result = cropland_result.assign_coords(bands=cropland_bands)

    croptype_bands = [f"croptype_{b}" for b in croptype_result.bands.values]
    croptype_result = croptype_result.assign_coords(bands=croptype_bands)

    # Stack along the band axis, crop type first, and rebuild the DataArray
    # on the crop-type result's spatial grid
    stacked = np.concatenate(
        [croptype_result.values, cropland_result.values], axis=0
    )

    return xr.DataArray(
        stacked,
        dims=["bands", "y", "x"],
        coords={
            "bands": list(croptype_bands) + list(cropland_bands),
            "y": croptype_result.y,
            "x": croptype_result.x,
        },
    )
1059
+
1060
+
1061
def apply_udf_data(udf_data: UdfData) -> UdfData:
    """Main UDF entry point - expects cropland_params and croptype_params in context.

    When BOTH parameter sets are present, a combined workflow runs:
    cropland classification first, whose positive pixels mask the crop
    type classification, with all bands of both results concatenated.
    Otherwise the whole user context is passed to a single workflow.

    Raises
    ------
    ValueError
        If no EPSG code can be derived from the projection information.
    """

    input_cube = udf_data.datacube_list[0]
    # Copy so the workflow never mutates the caller's context dict
    parameters = udf_data.user_context.copy()

    epsg = udf_data.proj["EPSG"] if udf_data.proj else None
    if epsg is None:
        raise ValueError("EPSG code not found in projection information")

    # Prepare input array
    input_array = input_cube.get_array().transpose("bands", "t", "y", "x")

    # Extract both parameter sets directly from context
    cropland_params = parameters.get("cropland_params", {})
    croptype_params = parameters.get("croptype_params", {})

    # Check if we have both parameter sets for dual workflow
    if cropland_params and croptype_params:
        logger.info(
            "Running combined workflow: cropland masking + croptype mapping ..."
        )

        # Run cropland classification - pass the FLAT parameters
        logger.info("Running cropland classification ...")
        cropland_result = run_single_workflow(input_array, epsg, cropland_params)
        logger.info("Cropland classification done.")

        # Extract cropland mask for masking the crop type classification
        # (any positive classification value counts as cropland)
        cropland_mask = cropland_result.sel(bands="classification") > 0

        # Run crop type classification with mask
        logger.info("Running crop type classification ...")
        croptype_result = run_single_workflow(
            input_array, epsg, croptype_params, cropland_mask
        )
        logger.info("Croptype classification done.")

        # Combine ALL bands from both results
        result = combine_results(croptype_result, cropland_result)
        result_cube = XarrayDataCube(result)

    else:
        # Single workflow (fallback to original behavior)
        logger.info("Running single workflow ...")
        result = run_single_workflow(input_array, epsg, parameters)
        result_cube = XarrayDataCube(result)

    udf_data.datacube_list = [result_cube]

    return udf_data
1112
+
1113
+
1114
def apply_metadata(metadata, context: Dict) -> Any:
    """Update collection metadata for combined output with ALL bands.

    Band naming logic summary (kept for mapping module resilience):
    - Single workflow (either cropland OR croptype parameters only):
      Base bands: classification, probability, probability_<class>
      If save_intermediate: raw_<band> duplicates are appended.
    - Combined workflow (both croptype_params & cropland_params):
      Prefixed bands: croptype_<band> and cropland_<band>
      If save_intermediate: croptype_raw_<band> and cropland_raw_<band> duplicates appended.

    No renaming occurs here beyond prefixing for the combined workflow; logic in
    mapping.py must therefore accept both prefixed and unprefixed forms.

    Any failure (e.g. model download in metadata context) is logged and the
    metadata is returned unchanged rather than raising.
    """
    try:
        # For dual workflow, combine band names from both models
        if "croptype_params" in context and "cropland_params" in context:
            # Get croptype band names from the croptype model's LUT
            croptype_classifier_url = context["croptype_params"][
                "classifier_parameters"
            ].get("classifier_url")
            if croptype_classifier_url:
                _, croptype_lut = load_onnx_model_cached(croptype_classifier_url)
                postprocess_parameters = context["croptype_params"].get(
                    "postprocess_parameters", {}
                )
                croptype_bands = [
                    f"croptype_{band}"
                    for band in get_output_labels(croptype_lut, postprocess_parameters)
                ]
                if postprocess_parameters.get("save_intermediate", False):
                    # Raw duplicates replace the product prefix with
                    # <product>_raw_ (see module docstring in mapping.py)
                    croptype_bands += [
                        band.replace("croptype_", "croptype_raw_")
                        for band in croptype_bands
                    ]
            else:
                raise ValueError("No croptype LUT found")

            # Get cropland band names from the cropland model's LUT
            cropland_classifier_url = context["cropland_params"][
                "classifier_parameters"
            ].get("classifier_url")
            if cropland_classifier_url:
                _, cropland_lut = load_onnx_model_cached(cropland_classifier_url)
                postprocess_parameters = context["cropland_params"].get(
                    "postprocess_parameters", {}
                )
                cropland_bands = [
                    f"cropland_{band}"
                    for band in get_output_labels(cropland_lut, postprocess_parameters)
                ]
                if postprocess_parameters.get("save_intermediate", False):
                    cropland_bands += [
                        band.replace("cropland_", "cropland_raw_")
                        for band in cropland_bands
                    ]
            else:
                raise ValueError("No cropland LUT found")

            # Crop-type bands first, matching combine_results() output order
            output_labels = croptype_bands + cropland_bands

        else:
            # Single workflow
            classifier_url = context["classifier_parameters"].get("classifier_url")
            if classifier_url:
                _, lut_sorted = load_onnx_model_cached(classifier_url)
                postprocess_parameters = context.get("postprocess_parameters", {})
                output_labels = get_output_labels(lut_sorted, postprocess_parameters)
                if postprocess_parameters.get("save_intermediate", False):
                    output_labels += [f"raw_{band}" for band in output_labels]
            else:
                raise ValueError("No classifier URL found in context")

        return metadata.rename_labels(dimension="bands", target=output_labels)

    except Exception as e:
        logger.warning(f"Could not load model in metadata context: {e}")
        return metadata
worldcereal/openeo/mapping.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Mapping helpers for cropland, croptype and embeddings products.
2
+
3
+ Band naming conventions produced by the UDF (`inference.py`):
4
+
5
+ Single workflow (only cropland OR only croptype parameters passed to UDF):
6
+ classification, probability, probability_<class>
7
+ If save_intermediate: raw_<band> duplicates (e.g. raw_classification)
8
+
9
+ Combined workflow (croptype with cropland masking: both `croptype_params` &
10
+ `cropland_params` passed):
11
+ croptype_<band>, cropland_<band>
12
+ If save_intermediate: croptype_raw_<band>, cropland_raw_<band>
13
+ Example: croptype_classification -> croptype_raw_classification
14
+
15
+ Important: Raw bands in the combined workflow do NOT duplicate the base prefix;
16
+ they simply replace the leading product prefix with <product>_raw_.
17
+
18
+ Simplification: We ignore any *save_intermediate* flags. If raw bands are
19
+ present we save them; the UDF only emits them when intermediate results were
20
+ requested upstream.
21
+ """
22
+
23
+ from pathlib import Path
24
+ from typing import List
25
+
26
+ import openeo
27
+ from openeo import DataCube
28
+ from openeo_gfmap import TemporalContext
29
+ from openeo_gfmap.preprocessing.scaling import compress_uint16
30
+
31
+ from worldcereal.openeo.inference import apply_metadata
32
+ from worldcereal.parameters import (
33
+ CropLandParameters,
34
+ CropTypeParameters,
35
+ EmbeddingsParameters,
36
+ WorldCerealProductType,
37
+ )
38
+
39
+ NEIGHBORHOOD_SPEC = dict(
40
+ size=[
41
+ {"dimension": "x", "unit": "px", "value": 128},
42
+ {"dimension": "y", "unit": "px", "value": 128},
43
+ ],
44
+ overlap=[
45
+ {"dimension": "x", "unit": "px", "value": 0},
46
+ {"dimension": "y", "unit": "px", "value": 0},
47
+ ],
48
+ )
49
+
50
+
51
def _run_udf(inputs: DataCube, udf: openeo.UDF) -> DataCube:
    """Run *udf* over *inputs* using the shared 128x128 px neighborhood spec."""
    result = inputs.apply_neighborhood(process=udf, **NEIGHBORHOOD_SPEC)
    return result
53
+
54
+
55
def _reduce_temporal_mean(cube: DataCube) -> DataCube:
    """Collapse the time dimension of *cube* by averaging over all timesteps."""
    reduced = cube.reduce_dimension(dimension="t", reducer="mean")
    return reduced
57
+
58
+
59
def _filename_prefix(
    product: WorldCerealProductType, temporal: TemporalContext, raw: bool = False
) -> str:
    """Build the output filename prefix, e.g. ``cropland_<start>_<end>``.

    When *raw* is True the product tag gets a ``-raw`` suffix
    (e.g. ``cropland-raw_<start>_<end>``).
    """
    tag = product.value + ("-raw" if raw else "")
    return f"{tag}_{temporal.start_date}_{temporal.end_date}"
64
+
65
+
66
def _save_result(cube: DataCube, prefix: str) -> DataCube:
    """Attach a GTiff save_result node to *cube* using *prefix* for filenames."""
    options = {"filename_prefix": prefix}
    return cube.save_result(format="GTiff", options=options)
68
+
69
+
70
def _cropland_map(
    inputs: DataCube,
    temporal_extent: TemporalContext,
    cropland_parameters: CropLandParameters,
) -> List[DataCube]:
    """Produce cropland product from preprocessed inputs (single workflow).

    Saves final bands and any raw_* bands purely based on presence.

    Parameters
    ----------
    inputs : DataCube
        Preprocessed input cube the inference UDF runs on.
    temporal_extent : TemporalContext
        Temporal range; only used here to build the output filename prefixes.
    cropland_parameters : CropLandParameters
        Parameters forwarded (as a plain dict) to the inference UDF context.

    Returns
    -------
    List[DataCube]
        One save_result cube for the final cropland bands and, when the UDF
        emitted them, a second one for the raw_* intermediate bands.
    """
    # Run the classification UDF over fixed-size neighborhoods; the UDF
    # receives the full parameter dump as its context.
    inference_udf = openeo.UDF.from_file(
        path=Path(__file__).resolve().parent / "inference.py",
        context=cropland_parameters.model_dump(),
    )
    classes = _run_udf(inputs, inference_udf)
    # Client-side metadata fix: rename the band labels to what the UDF
    # actually produces (see apply_metadata in inference.py).
    classes.metadata = apply_metadata(
        classes.metadata, cropland_parameters.model_dump()
    )
    # Collapse time and compress to uint16 for the final product.
    classes = _reduce_temporal_mean(classes)
    classes = compress_uint16(classes)

    bands = classes.metadata.band_names
    result_cubes: List[DataCube] = []

    # Final product: every band without the raw_ prefix (single-workflow
    # naming convention; see module docstring).
    final_bands = [b for b in bands if not b.startswith("raw_")]
    if final_bands:
        final_cube = classes.filter_bands(final_bands)
        result_cubes.append(
            _save_result(
                final_cube,
                _filename_prefix(WorldCerealProductType.CROPLAND, temporal_extent),
            )
        )

    # Intermediate product: raw_ bands are only present when intermediate
    # results were requested upstream, so saving is purely presence-based.
    raw_bands = [b for b in bands if b.startswith("raw_")]
    if raw_bands:
        raw_cube = classes.filter_bands(raw_bands)
        result_cubes.append(
            _save_result(
                raw_cube,
                _filename_prefix(
                    WorldCerealProductType.CROPLAND, temporal_extent, raw=True
                ),
            )
        )

    return result_cubes
116
+
117
+
118
def _croptype_map(
    inputs: DataCube,
    temporal_extent: TemporalContext,
    croptype_parameters: CropTypeParameters,
    cropland_parameters: CropLandParameters,
) -> List[DataCube]:
    """Produce crop type product. Optionally includes cropland masking.
    Cropland mask final bands saved only if `croptype_parameters.save_mask` is True.

    Parameters
    ----------
    inputs : DataCube
        Preprocessed input cube the inference UDF runs on.
    temporal_extent : TemporalContext
        Temporal range; only used here to build the output filename prefixes.
    croptype_parameters : CropTypeParameters
        Crop type parameters; `mask_cropland` switches between the combined
        (prefixed bands) and single (unprefixed bands) workflow.
    cropland_parameters : CropLandParameters
        Cropland parameters, forwarded to the UDF only when masking is on.

    Returns
    -------
    List[DataCube]
        save_result cubes: final croptype, plus (if present) raw croptype,
        plus (if requested) the cropland mask and its raw bands.
    """
    # Combined workflow: the UDF receives both parameter sets and prefixes
    # output bands with croptype_/cropland_ (see module docstring).
    if croptype_parameters.mask_cropland:
        parameters = {
            "cropland_params": cropland_parameters.model_dump(),
            "croptype_params": croptype_parameters.model_dump(),
        }
    else:
        parameters = croptype_parameters.model_dump()

    inference_udf = openeo.UDF.from_file(
        path=Path(__file__).resolve().parent / "inference.py",
        context=parameters,
    )
    classes = _run_udf(inputs, inference_udf)
    # Client-side metadata fix to match the UDF's output band labels.
    classes.metadata = apply_metadata(classes.metadata, parameters)
    classes = _reduce_temporal_mean(classes)
    classes = compress_uint16(classes)

    bands = classes.metadata.band_names
    result_cubes: List[DataCube] = []

    if croptype_parameters.mask_cropland:
        # Prefixed croptype final and raw bands
        croptype_final_bands = [
            b for b in bands if b.startswith("croptype_") and "raw" not in b
        ]
        # Raw croptype bands (presence-based)
        raw_croptype_bands = [b for b in bands if b.startswith("croptype_raw_")]
    else:
        # Single workflow: unprefixed croptype bands
        croptype_final_bands = [b for b in bands if not b.startswith("raw_")]
        raw_croptype_bands = [b for b in bands if b.startswith("raw_")]

    # Final croptype; stripping "croptype_" is a no-op in the single workflow.
    croptype_cube = classes.filter_bands(croptype_final_bands).rename_labels(
        dimension="bands",
        target=[
            b.replace("croptype_", "") for b in croptype_final_bands
        ],  # Remove prefix
    )
    result_cubes.append(
        _save_result(
            croptype_cube,
            _filename_prefix(WorldCerealProductType.CROPTYPE, temporal_extent),
        )
    )

    # Raw croptype if present
    if raw_croptype_bands:
        raw_croptype_cube = classes.filter_bands(raw_croptype_bands).rename_labels(
            dimension="bands",
            target=[
                b.replace("croptype_", "") for b in raw_croptype_bands
            ],  # Remove prefix
        )
        result_cubes.append(
            _save_result(
                raw_croptype_cube,
                _filename_prefix(
                    WorldCerealProductType.CROPTYPE, temporal_extent, raw=True
                ),
            )
        )

    # Optional cropland mask & raw cropland bands.
    # NOTE(review): this assumes save_mask implies mask_cropland — without
    # masking there are no cropland_ bands to filter; confirm upstream
    # parameter validation enforces that combination.
    if croptype_parameters.save_mask:
        cropland_final_bands = [
            b
            for b in bands
            if b.startswith("cropland_") and not b.startswith("cropland_raw_")
        ]
        cropland_cube = classes.filter_bands(cropland_final_bands).rename_labels(
            dimension="bands",
            target=[
                b.replace("cropland_", "") for b in cropland_final_bands
            ],  # Remove prefix
        )
        result_cubes.append(
            _save_result(
                cropland_cube,
                _filename_prefix(WorldCerealProductType.CROPLAND, temporal_extent),
            )
        )
        raw_cropland_bands = [b for b in bands if b.startswith("cropland_raw_")]
        if raw_cropland_bands:
            raw_cropland_cube = classes.filter_bands(raw_cropland_bands).rename_labels(
                dimension="bands",
                target=[
                    b.replace("cropland_", "") for b in raw_cropland_bands
                ],  # Remove prefix
            )
            result_cubes.append(
                _save_result(
                    raw_cropland_cube,
                    _filename_prefix(
                        WorldCerealProductType.CROPLAND, temporal_extent, raw=True
                    ),
                )
            )

    return result_cubes
227
+
228
+
229
def _embeddings_map(
    inputs: DataCube,
    temporal_extent: TemporalContext,  # unused; kept for signature consistency
    embeddings_parameters: EmbeddingsParameters,
    scale_uint16: bool = True,
) -> DataCube:
    """Produce embeddings map using Prometheo feature extractor.

    Features are extracted per neighborhood, averaged over time and — unless
    ``scale_uint16`` is disabled — shifted/scaled into the [0, 65534] range.
    """
    feature_udf = openeo.UDF.from_file(
        path=Path(__file__).resolve().parent / "feature_extractor.py",
        context=embeddings_parameters.feature_parameters.model_dump(),
    )
    cube = _reduce_temporal_mean(_run_udf(inputs, feature_udf))

    if scale_uint16:
        # Fixed affine rescaling so embeddings fit a uint16 range, then clamp.
        offset = -6
        scale = 0.0002
        cube = (cube - offset) / scale
        cube = cube.linear_scale_range(0, 65534, 0, 65534)

    return cube
worldcereal/openeo/preprocessing.py ADDED
@@ -0,0 +1,599 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any, Dict, List, Literal, Optional, Union
3
+
4
+ import pandas as pd
5
+ from geojson import GeoJSON
6
+ from openeo import UDF, Connection, DataCube
7
+ from openeo_gfmap import (
8
+ Backend,
9
+ BackendContext,
10
+ BoundingBoxExtent,
11
+ FetchType,
12
+ SpatialContext,
13
+ TemporalContext,
14
+ )
15
+ from openeo_gfmap.fetching.generic import build_generic_extractor
16
+ from openeo_gfmap.fetching.s1 import build_sentinel1_grd_extractor
17
+ from openeo_gfmap.fetching.s2 import build_sentinel2_l2a_extractor
18
+ from openeo_gfmap.preprocessing.compositing import mean_compositing, median_compositing
19
+ from openeo_gfmap.preprocessing.sar import compress_backscatter_uint16
20
+ from openeo_gfmap.utils.catalogue import UncoveredS1Exception, select_s1_orbitstate_vvvh
21
+
22
+ WORLDCEREAL_S2_BANDS = [
23
+ "S2-L2A-B02",
24
+ "S2-L2A-B03",
25
+ "S2-L2A-B04",
26
+ "S2-L2A-B05",
27
+ "S2-L2A-B06",
28
+ "S2-L2A-B07",
29
+ "S2-L2A-B08",
30
+ "S2-L2A-B8A",
31
+ "S2-L2A-B11",
32
+ "S2-L2A-B12",
33
+ ]
34
+
35
+ WORLDCEREAL_S1_BANDS = [
36
+ "S1-SIGMA0-VH",
37
+ "S1-SIGMA0-VV",
38
+ ]
39
+
40
+ WORLDCEREAL_DEM_BANDS = ["elevation", "slope"]
41
+
42
+ WORLDCEREAL_METEO_BANDS = ["AGERA5-PRECIP", "AGERA5-TMEAN"]
43
+
44
+ WORLDCEREAL_BANDS = {
45
+ "SENTINEL2": WORLDCEREAL_S2_BANDS,
46
+ "SENTINEL1": WORLDCEREAL_S1_BANDS,
47
+ "DEM": WORLDCEREAL_DEM_BANDS,
48
+ "METEO": WORLDCEREAL_METEO_BANDS,
49
+ }
50
+
51
+
52
class InvalidTemporalContextError(Exception):
    """Raised when a temporal context violates the WorldCereal compositing
    requirements (see `_validate_temporal_context`)."""

    pass
54
+
55
+
56
def spatially_filter_cube(
    connection: Connection, cube: DataCube, spatial_extent: Optional[SpatialContext]
) -> DataCube:
    """
    Apply spatial filtering to a data cube based on the given spatial extent.


    Parameters
    ----------
    connection : Connection
        The connection object used to interact with the openEO backend.
    cube : DataCube
        The input data cube to be spatially filtered.
    spatial_extent : Optional[SpatialContext]
        The spatial extent used for filtering the data cube. It can be a BoundingBoxExtent,
        a GeoJSON object, or a URL to a GeoJSON or Parquet file. If set to `None`,
        no spatial filtering will be applied.

    Returns
    -------
    DataCube
        The spatially filtered data cube.

    Raises
    ------
    ValueError
        If the spatial_extent parameter is not of type BoundingBoxExtent, GeoJSON,
        str or None.

    """
    # None explicitly means "no spatial filtering".
    if spatial_extent is None:
        return cube

    if isinstance(spatial_extent, BoundingBoxExtent):
        cube = cube.filter_bbox(dict(spatial_extent))
    elif isinstance(spatial_extent, GeoJSON):
        cube = cube.filter_spatial(spatial_extent)
    elif isinstance(spatial_extent, str):
        # URL to a vector file; pick the load format from the extension.
        geometry = connection.load_url(
            spatial_extent,
            format=(
                "Parquet"
                if ".parquet" in spatial_extent or ".geoparquet" in spatial_extent
                else "GeoJSON"
            ),
        )
        cube = cube.filter_spatial(geometry)
    else:
        # Previously unsupported extent types were silently ignored, leaving
        # the cube unfiltered; raise as the docstring promises so a
        # misconfigured extent surfaces immediately.
        raise ValueError(
            "spatial_extent must be a BoundingBoxExtent, GeoJSON, str or None; "
            f"got {type(spatial_extent)!r}"
        )

    return cube
101
+
102
+
103
def select_best_s1_orbit_direction(
    backend_context: BackendContext,
    spatial_extent: SpatialContext,
    temporal_extent: TemporalContext,
) -> str:
    """Pick the Sentinel-1 orbit direction best covering the given context.

    Queries the catalogue for the orbit state with VV+VH coverage; when no
    Sentinel-1 data can be found at all, falls back to "ASCENDING".

    Parameters
    ----------
    backend_context : BackendContext
        The backend context for accessing the data.
    spatial_extent : SpatialContext
        The spatial extent of the data.
    temporal_extent : TemporalContext
        The temporal extent of the data.

    Returns
    -------
    str
        The selected orbit direction (either "ASCENDING" or "DESCENDING").
    """
    try:
        return select_s1_orbitstate_vvvh(
            backend_context, spatial_extent, temporal_extent
        )
    except UncoveredS1Exception as exc:
        print(
            f"Could not find any Sentinel-1 data for the given spatio-temporal context. "
            f"Using ASCENDING orbit direction as a last resort. Error: {exc}"
        )
        return "ASCENDING"
136
+
137
+
138
def raw_datacube_S2(
    connection: Connection,
    backend_context: BackendContext,
    temporal_extent: TemporalContext,
    bands: List[str],
    fetch_type: FetchType,
    spatial_extent: Optional[SpatialContext] = None,
    filter_tile: Optional[str] = None,
    distance_to_cloud_flag: Optional[bool] = True,
    additional_masks_flag: Optional[bool] = True,
    apply_mask_flag: Optional[bool] = False,
    tile_size: Optional[int] = None,
    target_epsg: Optional[int] = None,
) -> DataCube:
    """Extract Sentinel-2 datacube from OpenEO using GFMAP routines.
    Raw data is extracted with no cloud masking applied by default (can be
    enabled by setting `apply_mask_flag=True`). In addition to the raw band
    values a cloud-mask computed from the dilation of the SCL layer, as well
    as a distance-to-cloud score, can be attached.

    Parameters
    ----------
    connection : Connection
        OpenEO connection instance.
    backend_context : BackendContext
        GFMAP Backend context to use for extraction.
    temporal_extent : TemporalContext
        Temporal context to extract data from.
    bands : List[str]
        List of Sentinel-2 bands to extract.
    fetch_type : FetchType
        GFMAP Fetch type to use for extraction.
    spatial_extent : Optional[SpatialContext], optional
        Spatial context to extract data from, can be a GFMAP BoundingBoxExtent,
        a GeoJSON dict or an URL to a publicly accessible GeoParquet file.
        NOTE(review): currently not referenced in this function body — spatial
        filtering appears to happen elsewhere; confirm before relying on it.
    filter_tile : Optional[str], optional
        Filter by tile ID, by default disabled. This forces the process to only
        one tile ID from the Sentinel-2 collection.
    distance_to_cloud_flag : Optional[bool], optional
        Compute a distance-to-cloud band from the SCL layer, by default True.
    additional_masks_flag : Optional[bool], optional
        Pre-merge the SCL dilated mask (and distance-to-cloud, if computed)
        into the extracted cube, by default True.
    apply_mask_flag : Optional[bool], optional
        Apply cloud masking, by default False. Can be enabled for high
        optimization of memory usage.
    tile_size : Optional[int], optional
        Processing tile size feature flag forwarded to the backend.
    target_epsg : Optional[int], optional
        Target EPSG to resample the data, by default None.
    """
    # Extract the SCL collection only; limit to scenes with <= 95% cloud cover.
    scl_cube_properties = {"eo:cloud_cover": lambda val: val <= 95.0}
    if filter_tile:
        scl_cube_properties["tileId"] = lambda val: val == filter_tile

    # Create the job to extract S2 (same cloud-cover property as the SCL cube).
    extraction_parameters: dict[str, Any] = {
        "target_resolution": 10,
        "target_crs": target_epsg,
        "load_collection": {
            "eo:cloud_cover": lambda val: val <= 95.0,
        },
    }

    scl_cube = connection.load_collection(
        collection_id="SENTINEL2_L2A",
        bands=["SCL"],
        temporal_extent=[temporal_extent.start_date, temporal_extent.end_date],
        properties=scl_cube_properties,
    )

    # Resample to 10m resolution for the SCL layer, using optional target_epsg
    scl_cube = scl_cube.resample_spatial(projection=target_epsg, resolution=10)

    # Compute the SCL dilation mask (backend-side process)
    scl_dilated_mask = scl_cube.process(
        "to_scl_dilation_mask",
        data=scl_cube,
        scl_band_name="SCL",
        kernel1_size=17,  # 17px dilation on a 10m layer
        kernel2_size=77,  # 77px dilation on a 10m layer
        mask1_values=[2, 4, 5, 6, 7],
        mask2_values=[3, 8, 9, 10, 11],
        erosion_kernel_size=3,
    ).rename_labels("bands", ["S2-L2A-SCL_DILATED_MASK"])

    additional_masks = scl_dilated_mask

    if distance_to_cloud_flag:
        # Compute the distance to cloud and add it to the cube
        distance_to_cloud = scl_cube.apply_neighborhood(
            process=UDF.from_file(Path(__file__).parent / "udf_distance_to_cloud.py"),
            size=[
                {"dimension": "x", "unit": "px", "value": 256},
                {"dimension": "y", "unit": "px", "value": 256},
                {"dimension": "t", "unit": "null", "value": "P1D"},
            ],
            overlap=[
                {"dimension": "x", "unit": "px", "value": 16},
                {"dimension": "y", "unit": "px", "value": 16},
            ],
        ).rename_labels("bands", ["S2-L2A-DISTANCE-TO-CLOUD"])

        additional_masks = scl_dilated_mask.merge_cubes(distance_to_cloud)

    if additional_masks_flag:
        extraction_parameters["pre_merge"] = additional_masks

    if filter_tile:
        extraction_parameters["load_collection"]["tileId"] = (
            lambda val: val == filter_tile
        )

    if tile_size is not None:
        extraction_parameters["update_arguments"] = {
            "featureflags": {"tilesize": tile_size}
        }

    s2_cube = build_sentinel2_l2a_extractor(
        backend_context,
        bands=bands,
        fetch_type=fetch_type,
        **extraction_parameters,
    ).get_cube(connection, None, temporal_extent)

    # Optional cloud masking with the dilated SCL mask.
    if apply_mask_flag:
        s2_cube = s2_cube.mask(scl_dilated_mask)

    return s2_cube
261
+
262
+
263
def raw_datacube_S1(
    connection: Connection,
    backend_context: BackendContext,
    temporal_extent: TemporalContext,
    bands: List[str],
    fetch_type: FetchType,
    spatial_extent: Optional[SpatialContext] = None,
    target_resolution: float = 20.0,
    orbit_direction: Optional[str] = None,
    tile_size: Optional[int] = None,
    target_epsg: Optional[int] = None,
) -> DataCube:
    """Extract a Sentinel-1 GRD datacube from OpenEO via the GFMAP extractor.

    Only dual-polarisation (VV&VH) products are loaded; optionally restricted
    to a single orbit direction.

    Parameters
    ----------
    connection : Connection
        OpenEO connection instance.
    backend_context : BackendContext
        GFMAP Backend context to use for extraction.
    temporal_extent : TemporalContext
        Temporal context to extract data from.
    bands : List[str]
        List of Sentinel-1 bands to extract.
    fetch_type : FetchType
        GFMAP Fetch type to use for extraction.
    spatial_extent : Optional[SpatialContext], optional
        Spatial context to extract data from, can be a GFMAP BoundingBoxExtent,
        a GeoJSON dict or an URL to a publicly accessible GeoParquet file.
    target_resolution : float, optional
        Target resolution to resample the data to, by default 20.0.
    orbit_direction : Optional[str], optional
        Orbit direction to filter the data, by default None (no filter).
    tile_size : Optional[int], optional
        Processing tile size feature flag forwarded to the backend.
    target_epsg : Optional[int], optional
        Target EPSG to resample the data to, by default None.
    """
    params: Dict[str, Any] = {
        "target_resolution": target_resolution,
        "target_crs": target_epsg,
    }

    # Restrict the collection to VV&VH products; pin the orbit state only
    # when one was explicitly requested.
    if orbit_direction is not None:
        params["load_collection"] = {
            "sat:orbit_state": lambda orbit: orbit == orbit_direction,
            "polarisation": lambda pol: pol == "VV&VH",
        }
    else:
        params["load_collection"] = {
            "polarisation": lambda pol: pol == "VV&VH",
        }

    if tile_size is not None:
        params["update_arguments"] = {"featureflags": {"tilesize": tile_size}}

    extractor = build_sentinel1_grd_extractor(
        backend_context, bands=bands, fetch_type=fetch_type, **params
    )
    return extractor.get_cube(connection, None, temporal_extent)
324
+
325
+
326
def raw_datacube_DEM(
    connection: Connection,
    backend_context: BackendContext,
    fetch_type: FetchType,
    spatial_extent: Optional[SpatialContext] = None,
) -> DataCube:
    """Method to get the DEM datacube from the backend.
    If running on CDSE backend, the slope is also loaded from the global
    slope collection and merged with the DEM cube.

    Parameters
    ----------
    connection : Connection
        OpenEO connection instance.
    backend_context : BackendContext
        GFMAP Backend context; decides whether slope can be loaded (CDSE only).
    fetch_type : FetchType
        GFMAP Fetch type to use for extraction.
    spatial_extent : Optional[SpatialContext], optional
        NOTE(review): accepted for API symmetry but not referenced in this
        function body; confirm spatial filtering happens downstream.

    Returns
    -------
    DataCube
        openEO datacube with the DEM data (and slope if available).
    """

    # DEM is a static collection: no temporal context passed to get_cube.
    extractor = build_generic_extractor(
        backend_context=backend_context,
        bands=["COP-DEM"],
        fetch_type=fetch_type,
        collection_name="COPERNICUS_30",
    )

    cube = extractor.get_cube(connection, None, None)
    cube = cube.rename_labels(dimension="bands", target=["elevation"])

    if backend_context.backend in [Backend.CDSE, Backend.CDSE_STAGING]:
        # On CDSE we can load the slope from a global slope collection
        slope = connection.load_stac(
            "https://stac.openeo.vito.be/collections/COPERNICUS30_DEM_SLOPE",
            bands=["Slope"],
        ).rename_labels(dimension="bands", target=["slope"])
        # Client fix for CDSE, the openeo client might be unsynchronized with
        # the backend.
        if "t" not in slope.metadata.dimension_names():
            slope.metadata = slope.metadata.add_dimension("t", "2020-01-01", "temporal")
        # Collapse the (artificial) time dimension of the static slope layer.
        slope = slope.min_time()

        # Note that when slope is available we use it as the base cube
        # to merge DEM with, as it comes at 20m resolution.
        cube = slope.merge_cubes(cube)

    return cube
369
+
370
+
371
def raw_datacube_METEO(
    connection: Connection,
    backend_context: BackendContext,
    temporal_extent: TemporalContext,
    fetch_type: FetchType,
    spatial_extent: Optional[SpatialContext] = None,
) -> DataCube:
    """Fetch the raw AGERA5 meteo datacube (mean temperature and precipitation)
    through the generic GFMAP extractor."""
    meteo_extractor = build_generic_extractor(
        backend_context=backend_context,
        bands=["AGERA5-TMEAN", "AGERA5-PRECIP"],
        fetch_type=fetch_type,
        collection_name="AGERA5",
    )
    return meteo_extractor.get_cube(connection, None, temporal_extent)
388
+
389
+
390
def precomposited_datacube_METEO(
    connection: Connection,
    temporal_extent: TemporalContext,
    compositing_window: Literal["month", "dekad"] = "month",
) -> DataCube:
    """Extract the precipitation and temperature AGERA5 data from a
    pre-composited and pre-processed collection. The data is stored in the
    CloudFerro S3 storage, allowing faster access and processing from the CDSE
    backend.

    Limitations:
        - Only monthly and dekadal composited data is available.
        - Only two bands are available: precipitation-flux and temperature-mean.

    Parameters
    ----------
    connection : Connection
        OpenEO connection instance.
    temporal_extent : TemporalContext
        Temporal context to extract data from.
    compositing_window : Literal["month", "dekad"], optional
        Compositing period of the precomposited collection, by default "month".

    Returns
    -------
    DataCube
        Datacube with bands renamed to AGERA5-PRECIP and AGERA5-TMEAN.

    Raises
    ------
    ValueError
        If an unsupported compositing window is requested.
    """
    temporal_extent = [temporal_extent.start_date, temporal_extent.end_date]

    # Map the compositing window to the matching precomposited STAC collection.
    if compositing_window == "month":
        collection_url = "https://stac.openeo.vito.be/collections/agera5_monthly"
    elif compositing_window == "dekad":
        collection_url = "https://stac.openeo.vito.be/collections/agera5_dekad"
    else:
        # Previously an unsupported window fell through and caused an
        # UnboundLocalError on `cube` below; fail explicitly instead.
        raise ValueError(
            f'Unsupported compositing window "{compositing_window}"; '
            'must be "month" or "dekad".'
        )

    cube = connection.load_stac(
        url=collection_url,
        temporal_extent=temporal_extent,
        bands=["precipitation-flux", "temperature-mean"],
    )

    # cube.result_node().update_arguments(featureflags={"tilesize": 1})
    # Rename to the band names expected by the WorldCereal pipeline.
    cube = cube.rename_labels(
        dimension="bands", target=["AGERA5-PRECIP", "AGERA5-TMEAN"]
    )

    return cube
427
+
428
+
429
def worldcereal_preprocessed_inputs(
    connection: Connection,
    backend_context: BackendContext,
    spatial_extent: Union[GeoJSON, BoundingBoxExtent, str],
    temporal_extent: TemporalContext,
    fetch_type: Optional[FetchType] = FetchType.TILE,
    disable_meteo: bool = False,
    validate_temporal_context: bool = True,
    s1_orbit_state: Optional[str] = None,
    tile_size: Optional[int] = None,
    s2_tile: Optional[str] = None,
    compositing_window: Literal["month", "dekad"] = "month",
    target_epsg: Optional[int] = None,
) -> DataCube:
    """Assemble the full preprocessed WorldCereal input datacube.

    Extracts Sentinel-2 (cloud-masked, median-composited), Sentinel-1
    (mean-composited, uint16-compressed backscatter), DEM (elevation/slope
    resampled onto the S2 grid) and — unless ``disable_meteo`` — precomposited
    AGERA5 meteo data, and merges everything into one cube.

    Most parameters are forwarded to the individual extractors; see
    `raw_datacube_S2`, `raw_datacube_S1`, `raw_datacube_DEM` and
    `precomposited_datacube_METEO` for their meaning.

    NOTE(review): `spatial_extent` is not applied in this function body;
    it is only used to pick the best S1 orbit direction. Confirm spatial
    filtering is applied by the caller (e.g. via `spatially_filter_cube`).
    """
    # First validate the temporal context
    if validate_temporal_context:
        _validate_temporal_context(temporal_extent)

    # See if requested compositing method is supported
    assert compositing_window in [
        "month",
        "dekad",
    ], 'Compositing window must be either "month" or "dekad"'

    # Extraction of S2 from GFMAP; distance-to-cloud is skipped for point
    # extractions.
    s2_data = raw_datacube_S2(
        connection=connection,
        backend_context=backend_context,
        temporal_extent=temporal_extent,
        bands=WORLDCEREAL_S2_BANDS,
        fetch_type=fetch_type,
        filter_tile=s2_tile,
        distance_to_cloud_flag=False if fetch_type == FetchType.POINT else True,
        additional_masks_flag=False,
        apply_mask_flag=True,
        tile_size=tile_size,
        target_epsg=target_epsg,
    )

    s2_data = median_compositing(s2_data, period=compositing_window)

    # Cast to uint16
    s2_data = s2_data.linear_scale_range(0, 65534, 0, 65534)

    # Extraction of the S1 data
    # Decides on the orbit direction from the maximum overlapping area of
    # available products.
    if s1_orbit_state is None and backend_context.backend in [
        Backend.CDSE,
        Backend.CDSE_STAGING,
        Backend.FED,
    ]:
        s1_orbit_state = select_best_s1_orbit_direction(
            backend_context, spatial_extent, temporal_extent
        )
    s1_data = raw_datacube_S1(
        connection=connection,
        backend_context=backend_context,
        temporal_extent=temporal_extent,
        bands=WORLDCEREAL_S1_BANDS,
        fetch_type=fetch_type,
        target_resolution=20.0,  # Compute the backscatter at 20m resolution, then upsample nearest neighbor when merging cubes
        orbit_direction=s1_orbit_state,  # If None, make the query on the catalogue for the best orbit
        tile_size=tile_size,
        target_epsg=target_epsg,
    )

    s1_data = mean_compositing(s1_data, period=compositing_window)
    s1_data = compress_backscatter_uint16(backend_context, s1_data)

    dem_data = raw_datacube_DEM(
        connection=connection,
        backend_context=backend_context,
        fetch_type=fetch_type,
    )

    # Explicitly resample DEM with bilinear interpolation and based on S2 grid
    # note: we use s2_data here as base to avoid issues at the edges because source
    # data is not in UTM projection.
    dem_data = dem_data.resample_cube_spatial(s2_data, method="bilinear")

    # Cast DEM to UINT16
    dem_data = dem_data.linear_scale_range(0, 65534, 0, 65534)

    data = s2_data.merge_cubes(s1_data)
    data = data.merge_cubes(dem_data)

    if not disable_meteo:
        meteo_data = precomposited_datacube_METEO(
            connection=connection,
            temporal_extent=temporal_extent,
            compositing_window=compositing_window,
        )

        # Explicitly resample meteo with bilinear interpolation and based on S2 grid
        # note: we use s2_data here as base to avoid issues at the edges because source
        # data is not in UTM projection.
        meteo_data = meteo_data.resample_cube_spatial(s2_data, method="bilinear")

        data = data.merge_cubes(meteo_data)

    return data
531
+
532
+
533
def _validate_temporal_context(temporal_context: TemporalContext) -> None:
    """Check that a temporal context is usable for WorldCereal processing.

    The requested range must start on the first day of a month, end on the
    last day of a month, and span at most one year.

    Parameters
    ----------
    temporal_context : TemporalContext
        temporal context to validate

    Raises
    ------
    InvalidTemporalContextError
        if start_date is not on the first day of a month or end_date
        is not on the last day of a month or the span is more than
        one year.
    """
    start_date, end_date = temporal_context.to_datetime()

    starts_on_first_day = start_date == start_date.replace(day=1)
    ends_on_last_day = end_date == end_date + pd.offsets.MonthEnd(0)

    if not (starts_on_first_day and ends_on_last_day):
        raise InvalidTemporalContextError(
            "WorldCereal uses monthly compositing. For this to work properly, "
            "requested temporal range should start and end at the first and last "
            "day of a month. Instead, got: "
            f"{temporal_context.start_date} - {temporal_context.end_date}. "
            "You may use `worldcereal.preprocessing.correct_temporal_context()` "
            "to correct the temporal context."
        )

    if pd.Timedelta(end_date - start_date).days > 365:
        raise InvalidTemporalContextError(
            "WorldCereal currently does not support temporal ranges spanning "
            "more than a year. Got: "
            f"{temporal_context.start_date} - {temporal_context.end_date}."
        )
574
+
575
+
576
def correct_temporal_context(temporal_context: TemporalContext) -> TemporalContext:
    """Snap a temporal context to full-month boundaries.

    The start date is moved back to the first day of its month and the end
    date forward to the last day of its month, as required by the WorldCereal
    processing.

    Parameters
    ----------
    temporal_context : TemporalContext
        temporal context to correct

    Returns
    -------
    TemporalContext
        corrected temporal context
    """
    start_date, end_date = temporal_context.to_datetime()

    snapped_start = start_date.replace(day=1)
    snapped_end = end_date + pd.offsets.MonthEnd(0)

    return TemporalContext(
        start_date=snapped_start.strftime("%Y-%m-%d"),
        end_date=snapped_end.strftime("%Y-%m-%d"),
    )
worldcereal/openeo/udf_distance_to_cloud.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # dependencies = [
3
+ # "scikit-image",
4
+ # ]
5
+ # ///
6
+
7
+ import numpy as np
8
+ import xarray as xr
9
+ from openeo.udf import XarrayDataCube
10
+ from scipy.ndimage import distance_transform_cdt
11
+ from skimage.morphology import binary_erosion, footprints
12
+
13
+
14
def apply_datacube(cube: XarrayDataCube, context: dict) -> XarrayDataCube:
    """Compute a per-pixel distance-to-cloud score from an SCL datacube.

    Expects a single-band cube holding Sentinel-2 SCL values; returns a cube
    of the same (bands, y, x) layout holding the Manhattan (chessboard-free,
    city-block) distance to the nearest (eroded) cloudy pixel.
    """
    cube_array: xr.DataArray = cube.get_array()
    cube_array = cube_array.transpose("bands", "y", "x")

    # Cloud mask: SCL values 8, 9, 10 (8 <= val < 11) or 3.
    # NOTE(review): per the Sentinel-2 SCL convention these correspond to
    # cloud probability / cirrus classes and cloud shadow — confirm.
    clouds: xr.DataArray = np.logical_or(
        np.logical_and(cube_array < 11, cube_array >= 8), cube_array == 3
    ).isel(
        bands=0
    )  # type: ignore

    # Calculate the Distance To Cloud score
    # Erode
    er = footprints.disk(3)

    # Define a function to apply binary erosion
    # (returns the INVERTED eroded mask: non-cloud pixels become True, which
    # is the "background" the distance transform measures distance from)
    def erode(image, selem):
        return ~binary_erosion(image, selem)

    # Use apply_ufunc to apply the erosion operation
    eroded = xr.apply_ufunc(
        erode,  # function to apply
        clouds,  # input DataArray
        input_core_dims=[["y", "x"]],  # dimensions over which to apply function
        output_core_dims=[["y", "x"]],  # dimensions of the output
        vectorize=True,  # vectorize the function over non-core dimensions
        dask="parallelized",  # enable dask parallelization
        output_dtypes=[np.int32],  # data type of the output
        kwargs={"selem": er},  # additional keyword arguments to pass to erode
    )

    # Distance to cloud in manhattan distance measure
    distance = xr.apply_ufunc(
        distance_transform_cdt,
        eroded,
        input_core_dims=[["y", "x"]],
        output_core_dims=[["y", "x"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[np.int32],
    )

    # Re-wrap the raw array with the original spatial coordinates.
    distance_da = xr.DataArray(
        distance,
        coords={
            "y": cube_array.coords["y"],
            "x": cube_array.coords["x"],
        },
        dims=["y", "x"],
    )

    # Restore the bands dimension so the output shape matches the input cube.
    distance_da = distance_da.expand_dims(
        dim={
            "bands": cube_array.coords["bands"],
        },
    )

    distance_da = distance_da.transpose("bands", "y", "x")

    return XarrayDataCube(distance_da)
worldcereal/parameters.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from enum import Enum
3
+ from typing import Optional
4
+
5
+ from pydantic import BaseModel, Field, ValidationError, model_validator
6
+
7
+
8
class WorldCerealProductType(Enum):
    """Identifiers for the WorldCereal products that can be generated."""

    CROPLAND = "cropland"
    CROPTYPE = "croptype"
    EMBEDDINGS = "embeddings"
14
+
15
+
16
class FeaturesParameters(BaseModel):
    """Parameters for the feature extraction UDFs. Types are enforced by
    Pydantic.

    Attributes
    ----------
    rescale_s1 : bool (default=False)
        Whether to rescale Sentinel-1 bands before feature extraction. Should be
        left to False, as this is done in the Presto UDF itself.
    presto_model_url : str
        Public URL to the Presto model used for feature extraction. The file
        should be a PyTorch serialized model.
    compile_presto : bool (default=False)
        Whether to compile the Presto encoder for speeding up large-scale inference.
    temporal_prediction : bool (default=False)
        Whether to use temporal-explicit predictions. If True, the time dimension
        is preserved in Presto features and a specific timestep is selected later.
        If False, features are pooled across time (non-temporal prediction).
    target_date : str (default=None)
        Target date for temporal-explicit predictions in ISO format (YYYY-MM-DD).
        Only used when temporal_prediction=True. If None, the middle timestep is used.
    """

    rescale_s1: bool
    presto_model_url: str
    compile_presto: bool
    temporal_prediction: bool = Field(default=False)
    target_date: Optional[str] = Field(default=None)

    @model_validator(mode="after")
    def check_temporal_parameters(self):
        """Validates temporal prediction parameters.

        Raises
        ------
        ValueError
            If target_date is set without temporal_prediction, or is not a
            valid ISO date. Pydantic wraps this into a ValidationError for
            callers.
        """
        # Raise ValueError, not ValidationError: pydantic v2 validators must
        # raise ValueError/AssertionError — ValidationError cannot be
        # instantiated from a plain message, so the original `raise
        # ValidationError(...)` would itself fail with a TypeError.
        if self.target_date is not None and not self.temporal_prediction:
            raise ValueError(
                "target_date can only be specified when temporal_prediction=True"
            )

        if self.target_date is not None:
            try:
                datetime.fromisoformat(self.target_date)
            except ValueError as exc:
                raise ValueError(
                    "target_date must be in ISO format (YYYY-MM-DD)"
                ) from exc

        return self
60
+
61
+
62
class ClassifierParameters(BaseModel):
    """Parameters for the classifier. Types are enforced by Pydantic.

    Attributes
    ----------
    classifier_url : str
        Public URL to the classifier model. The file should be an ONNX model
        accepting a `features` field for input data and returning either two
        output probability arrays `true` and `false` in case of cropland
        mapping, or a probability array per-class in case of croptype mapping.
    """

    classifier_url: str
75
+
76
+
77
class PostprocessParameters(BaseModel):
    """Parameters controlling the postprocessing step. Types are enforced
    by Pydantic.

    Attributes
    ----------
    enable: bool (default=True)
        Whether to enable postprocessing.
    method: str (default="smooth_probabilities")
        Postprocessing method; one of ["smooth_probabilities", "majority_vote"].
    kernel_size: int (default=5)
        Kernel size for majority-vote postprocessing. Must be odd and in the
        range accepted by the validator (3 up to 25).
    save_intermediate: bool (default=False)
        Whether to save intermediate results (before applying the postprocessing)
        as GeoTiff.
    keep_class_probs: bool (default=True)
        Whether per-class probabilities are included in the final product.
    """

    enable: bool = Field(default=True)
    method: str = Field(default="smooth_probabilities")
    kernel_size: int = Field(default=5)
    save_intermediate: bool = Field(default=False)
    keep_class_probs: bool = Field(default=True)

    @model_validator(mode="after")
    def check_parameters(self):
        """Validate cross-field consistency of the postprocessing settings."""
        # Intermediate results only exist when postprocessing actually runs.
        if self.save_intermediate and not self.enable:
            raise ValueError(
                "Cannot save intermediate results if postprocessing is disabled."
            )

        if self.method not in ("smooth_probabilities", "majority_vote"):
            raise ValueError(
                f"Method must be one of ['smooth_probabilities', 'majority_vote'], got {self.method}"
            )

        # kernel_size is only meaningful for majority voting.
        if self.method == "majority_vote":
            if self.kernel_size % 2 == 0:
                raise ValueError(
                    f"Kernel size for majority filtering should be an odd number, got {self.kernel_size}"
                )
            if self.kernel_size > 25:
                raise ValueError(
                    f"Kernel size for majority filtering should be an odd number smaller than 25, got {self.kernel_size}"
                )
            if self.kernel_size < 3:
                raise ValueError(
                    f"Kernel size for majority filtering should be an odd number larger than 1, got {self.kernel_size}"
                )

        return self
129
+
130
+
131
class BaseParameters(BaseModel):
    """Shared parameter logic for the WorldCereal parameter models."""

    # Each product pipeline carries its own postprocessing configuration;
    # a factory avoids sharing one default instance across models.
    postprocess_parameters: PostprocessParameters = Field(
        default_factory=PostprocessParameters
    )

    @staticmethod
    def create_feature_parameters(**kwargs):
        """Build a FeaturesParameters, filling unspecified fields with defaults."""
        params = {
            "rescale_s1": False,
            "presto_model_url": "",
            "compile_presto": False,
            "temporal_prediction": False,
            "target_date": None,
            **kwargs,
        }
        return FeaturesParameters(**params)

    @staticmethod
    def create_classifier_parameters(classifier_url: str):
        """Wrap a classifier URL into a ClassifierParameters instance."""
        return ClassifierParameters(classifier_url=classifier_url)
153
+
154
+
155
class CropLandParameters(BaseParameters):
    """Parameters for the cropland product inference pipeline. Types are
    enforced by Pydantic.

    Attributes
    ----------
    feature_parameters : FeaturesParameters
        Parameters for the feature extraction UDF. Will be serialized into a
        dictionary and passed in the process graph.
    classifier_parameters : ClassifierParameters
        Parameters for the classifier UDF. Will be serialized into a dictionary
        and passed in the process graph.
    """

    @staticmethod
    def _default_feature_parameters() -> FeaturesParameters:
        """Single source of truth for default cropland feature parameters."""
        return BaseParameters.create_feature_parameters(
            rescale_s1=False,
            presto_model_url="https://s3.waw3-1.cloudferro.com/swift/v1/openeo-ml-models-prod/worldcereal/presto-prometheo-landcover-MulticlassWithCroplandAuxBCELoss-labelsmoothing=0.05-month-LANDCOVER10-augment=True-balance=True-timeexplicit=False-masking=enabled-run=202510301004_encoder.pt",  # NOQA
            compile_presto=False,
            temporal_prediction=False,
            target_date=None,
        )

    @staticmethod
    def _default_classifier_parameters() -> ClassifierParameters:
        return BaseParameters.create_classifier_parameters(
            classifier_url="https://s3.waw3-1.cloudferro.com/swift/v1/openeo-ml-models-prod/worldcereal/PrestoDownstreamCatBoost_temporary-crops_v201-prestorun=202510301004.onnx"  # NOQA
        )

    # Use default_factory (as CropTypeParameters and EmbeddingsParameters do)
    # instead of a single class-level FeaturesParameters instance, so every
    # model gets a fresh default and the three parameter classes stay
    # consistent.
    feature_parameters: FeaturesParameters = Field(
        default_factory=lambda: CropLandParameters._default_feature_parameters()
    )

    classifier_parameters: ClassifierParameters = Field(
        default_factory=lambda: CropLandParameters._default_classifier_parameters()
    )

    def __init__(self, classifier_url: Optional[str] = None, **kwargs):
        """Allow overriding only the classifier URL.

        An explicit `classifier_parameters` in kwargs takes precedence; in
        that case `classifier_url` is ignored.
        """
        if "classifier_parameters" not in kwargs and classifier_url is not None:
            kwargs["classifier_parameters"] = (
                BaseParameters.create_classifier_parameters(
                    classifier_url=classifier_url
                )
            )
        super().__init__(**kwargs)
196
+
197
+
198
class CropTypeParameters(BaseParameters):
    """Parameters for the croptype product inference pipeline. Types are
    enforced by Pydantic.

    Attributes
    ----------
    feature_parameters : FeaturesParameters
        Parameters for the feature extraction UDF. Will be serialized into a
        dictionary and passed in the process graph.
    classifier_parameters : ClassifierParameters
        Parameters for the classifier UDF. Will be serialized into a dictionary
        and passed in the process graph.
    mask_cropland : bool (default=True)
        Whether or not to mask the cropland pixels before running crop type inference.
    save_mask : bool (default=False)
        Whether or not to save the cropland mask as an intermediate result.
    """

    @staticmethod
    def _default_feature_parameters() -> FeaturesParameters:
        """Single source of truth for default croptype feature parameters."""
        return BaseParameters.create_feature_parameters(
            rescale_s1=False,
            presto_model_url="https://s3.waw3-1.cloudferro.com/swift/v1/openeo-ml-models-prod/worldcereal/presto-prometheo-croptype-with-nocrop-FocalLoss-labelsmoothing%3D0.05-month-CROPTYPE27-augment%3DTrue-balance%3DTrue-timeexplicit%3DFalse-masking%3Denabled-run%3D202510301004_encoder.pt",  # NOQA
            compile_presto=False,
            temporal_prediction=False,
            target_date=None,  # By default take the middle date
        )

    @staticmethod
    def _default_classifier_parameters() -> ClassifierParameters:
        return BaseParameters.create_classifier_parameters(
            classifier_url="https://s3.waw3-1.cloudferro.com/swift/v1/openeo-ml-models-prod/worldcereal/PrestoDownstreamCatBoost_croptype_v201-prestorun%3D202510301004.onnx"
        )

    feature_parameters: FeaturesParameters = Field(
        default_factory=lambda: CropTypeParameters._default_feature_parameters()
    )
    classifier_parameters: ClassifierParameters = Field(
        default_factory=lambda: CropTypeParameters._default_classifier_parameters()
    )
    mask_cropland: bool = Field(default=True)
    save_mask: bool = Field(default=False)

    def __init__(
        self,
        target_date: Optional[str] = None,
        classifier_url: Optional[str] = None,
        **kwargs,
    ):
        """Allow overriding the target date and/or classifier URL without
        rebuilding the full parameter objects.

        Explicit `feature_parameters` / `classifier_parameters` in kwargs
        take precedence over the shortcut arguments.
        """
        # Override feature target_date if feature_parameters not supplied.
        # NOTE(review): assigning target_date on a model_copy bypasses the
        # FeaturesParameters validator — confirm whether target_date without
        # temporal_prediction should be rejected here as well.
        if "feature_parameters" not in kwargs:
            fp = self._default_feature_parameters().model_copy()
            fp.target_date = target_date  # type: ignore[attr-defined]
            kwargs["feature_parameters"] = fp
        # Override classifier URL if classifier_parameters not supplied
        if "classifier_parameters" not in kwargs and classifier_url is not None:
            kwargs["classifier_parameters"] = (
                BaseParameters.create_classifier_parameters(
                    classifier_url=classifier_url
                )
            )
        super().__init__(**kwargs)

    @model_validator(mode="after")
    def check_mask_parameters(self):
        """Validates the mask-related parameters.

        Raises
        ------
        ValueError
            If save_mask is requested while cropland masking is disabled.
            Pydantic wraps this into a ValidationError for callers.
        """
        # Raise ValueError, not ValidationError: pydantic v2 validators must
        # raise ValueError/AssertionError — ValidationError cannot be built
        # from a plain message, so the original `raise ValidationError(...)`
        # would itself fail with a TypeError. This also matches
        # PostprocessParameters.check_parameters.
        if not self.mask_cropland and self.save_mask:
            raise ValueError("Cannot save mask if mask_cropland is disabled.")
        return self
268
+
269
+
270
class EmbeddingsParameters(BaseParameters):
    """Parameters for the embeddings product inference pipeline. Types are
    enforced by Pydantic.

    Attributes
    ----------
    feature_parameters : FeaturesParameters
        Parameters for the feature extraction UDF. Will be serialized into a
        dictionary and passed in the process graph.
    """

    @staticmethod
    def _default_feature_parameters() -> FeaturesParameters:
        """Default feature-extraction parameters, declared in one place."""
        return BaseParameters.create_feature_parameters(
            rescale_s1=False,
            presto_model_url="https://s3.waw3-1.cloudferro.com/swift/v1/openeo-ml-models-prod/worldcereal/presto-prometheo-landcover-month-LANDCOVER10-augment%3DTrue-balance%3DTrue-timeexplicit%3DFalse-run%3D202507170930_encoder.pt",  # NOQA
            compile_presto=False,
            temporal_prediction=False,
            target_date=None,
        )

    feature_parameters: FeaturesParameters = Field(
        # Lambda wrapper so pydantic gets a genuine zero-argument callable.
        default_factory=lambda: EmbeddingsParameters._default_feature_parameters()
    )

    def __init__(self, presto_model_url: Optional[str] = None, **kwargs):
        """Optionally swap in a custom Presto model URL.

        An explicit `feature_parameters` in kwargs overrides everything,
        in which case `presto_model_url` is ignored.
        """
        if presto_model_url is not None and "feature_parameters" not in kwargs:
            custom_fp = self._default_feature_parameters().model_copy()
            custom_fp.presto_model_url = presto_model_url  # type: ignore[attr-defined]
            kwargs["feature_parameters"] = custom_fp
        super().__init__(**kwargs)
worldcereal/utils/models.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utilities around models for the WorldCereal package."""
2
+
3
+ import json
4
+ from functools import lru_cache
5
+
6
+ import onnxruntime as ort
7
+ import requests
8
+
9
+
10
@lru_cache(maxsize=2)
def load_model_onnx(model_url) -> ort.InferenceSession:
    """Load an ONNX model from a URL.

    Results are cached (two most recent URLs) so repeated lookups do not
    re-download the model.

    Parameters
    ----------
    model_url: str
        URL to the ONNX model.

    Returns
    -------
    ort.InferenceSession
        ONNX model loaded with ONNX runtime.

    Raises
    ------
    requests.HTTPError
        If the model could not be downloaded.
    """
    # Two minutes timeout to download the model
    response = requests.get(model_url, timeout=120)
    # Fail fast on HTTP errors: without this check, an error page or partial
    # payload would be handed to ONNX runtime, producing a confusing parse
    # error (and the failure would be cached by lru_cache).
    response.raise_for_status()

    return ort.InferenceSession(response.content)
29
+
30
+
31
def validate_cb_model(model_url: str) -> ort.InferenceSession:
    """Validate a catboost model by loading it and checking if the required
    metadata is present. Checks that the `class_names` and `class_to_label`
    fields are present in the `class_params` field of the custom metadata of
    the model. By default, the CatBoost module should include those fields
    when exporting a model to ONNX.

    Raises an exception if the model is not valid.

    Parameters
    ----------
    model_url : str
        URL to the ONNX model.

    Returns
    -------
    ort.InferenceSession
        ONNX model loaded with ONNX runtime.

    Raises
    ------
    ValueError
        If the required metadata fields are missing from the model.
    """
    model = load_model_onnx(model_url=model_url)

    metadata = model.get_modelmeta().custom_metadata_map

    # Fixed error message: this branch is about the whole `class_params`
    # blob being absent, not specifically about class names.
    if "class_params" not in metadata:
        raise ValueError("Could not find class parameters in the model metadata.")

    class_params = json.loads(metadata["class_params"])

    if "class_names" not in class_params:
        raise ValueError("Could not find class names in the model metadata.")

    if "class_to_label" not in class_params:
        raise ValueError("Could not find class to labels in the model metadata.")

    return model
66
+
67
+
68
def load_model_lut(model_url: str) -> dict:
    """Load the class names to labels mapping from a CatBoost model.

    Parameters
    ----------
    model_url : str
        URL to the ONNX model.

    Returns
    -------
    dict
        Look-up table mapping class names to labels, ordered by label value.
    """
    # Validation guarantees the metadata fields used below are present.
    session = validate_cb_model(model_url=model_url)
    class_params = json.loads(
        session.get_modelmeta().custom_metadata_map["class_params"]
    )

    pairs = zip(class_params["class_names"], class_params["class_to_label"])
    # Sort by label value so downstream consumers see a deterministic order.
    return dict(sorted(pairs, key=lambda pair: pair[1]))