"""GFS → ERA5 variable name, unit, and pressure-level conversion tables.

These are pure data. Tests exercise them directly. The fetcher in gfs.py
consumes them.

ERA5 (via ARCO Zarr, what GraphCast + GenCast are trained on) uses long
human-readable names: ``2m_temperature``, ``10m_u_component_of_wind``, etc.
GFS GRIB2, as materialised by cfgrib, uses short codes: ``t2m``, ``u10``, etc.
(the underlying ecCodes shortNames are ``2t``, ``10u``, ...). Both encode the
same physical quantities but with different conventions.

Everything here is single-source-of-truth: if GFS introduces a new short-code
convention, it only gets added here and the fetcher picks it up automatically.
"""

from __future__ import annotations

from typing import Dict, Tuple


# ---------------------------------------------------------------------------
# Surface variables (single-level)
# ---------------------------------------------------------------------------

# Each entry: ERA5 canonical name → (GFS ecCodes shortName, unit conversion)
# Unit conversion is a (scale, offset) tuple applied as gfs_value * scale + offset.
# When no conversion is needed (same units), use (1.0, 0.0).
#
# GFS units from cfgrib:
#   t2m:  K      (matches ERA5)
#   u10:  m/s    (matches ERA5)
#   v10:  m/s    (matches ERA5)
#   prmsl:Pa     (matches ERA5)
#   tp:   kg/m^2 (≡ mm of liquid water, ERA5 has m — divide by 1000)
#
# GFS GRIB2 has total_precipitation as an accumulated quantity over the forecast
# step — cfgrib exposes ``tp`` in kg/m² which is numerically the same as mm of
# liquid water. ERA5 reports precipitation in m. We divide by 1000 to match.

#
# ``ecCodes`` shortName differs from cfgrib's output variable name for a
# handful of surface fields. The GRIB *message* is indexed by the ecCodes
# shortName (``2t``, ``10u``, ``10v``, ``prmsl``, ``tp``) which is what
# ``filter_by_keys`` takes. When cfgrib materialises the filtered message as
# an xarray variable, it uses its own naming (``t2m``, ``u10``, ``v10``,
# ``prmsl``, ``tp``). We carry the ecCodes shortName here because the filter
# is load-bearing — get this wrong and filter_by_keys returns an empty
# dataset, which was the Phase-2-take-1 failure mode.

# (scale, offset) pairs used by the surface table, applied as
# gfs_value * scale + offset.
_SFC_IDENTITY: Tuple[float, float] = (1.0, 0.0)   # GFS and ERA5 units agree
_SFC_MM_TO_M: Tuple[float, float] = (1e-3, 0.0)   # kg/m² (≡ mm of water) → m

# ERA5 canonical name → (ecCodes shortName, (scale, offset)).
# The shortName is the value handed to cfgrib's ``filter_by_keys``.
SURFACE_VARS: Dict[str, Tuple[str, Tuple[float, float]]] = {
    "2m_temperature": ("2t", _SFC_IDENTITY),
    "10m_u_component_of_wind": ("10u", _SFC_IDENTITY),
    "10m_v_component_of_wind": ("10v", _SFC_IDENTITY),
    "mean_sea_level_pressure": ("prmsl", _SFC_IDENTITY),
    "total_precipitation_6hr": ("tp", _SFC_MM_TO_M),
}


# ---------------------------------------------------------------------------
# Pressure-level variables (3D: time × level × lat × lon)
# ---------------------------------------------------------------------------
#
# GFS units on pressure levels:
#   t:   K       (matches ERA5)
#   q:   kg/kg   (matches ERA5)
#   u:   m/s     (matches ERA5)
#   v:   m/s     (matches ERA5)
#   w:   Pa/s    (matches ERA5 vertical_velocity)
#   gh:  gpm     (geopotential HEIGHT in meters — ERA5 has geopotential in m²/s²)
#                → multiply by g = 9.80665 to get geopotential.

# (scale, offset) pairs used by the pressure-level table, applied as
# gfs_value * scale + offset.
_PL_IDENTITY: Tuple[float, float] = (1.0, 0.0)  # GFS and ERA5 units agree
# Geopotential height (gpm) times g = 9.80665 gives geopotential (m²/s²).
_PL_GPM_TO_GEOPOTENTIAL: Tuple[float, float] = (9.80665, 0.0)

# ERA5 canonical name → (GFS short-name, (scale, offset)).
PRESSURE_LEVEL_VARS: Dict[str, Tuple[str, Tuple[float, float]]] = {
    "temperature": ("t", _PL_IDENTITY),
    "specific_humidity": ("q", _PL_IDENTITY),
    "u_component_of_wind": ("u", _PL_IDENTITY),
    "v_component_of_wind": ("v", _PL_IDENTITY),
    "vertical_velocity": ("w", _PL_IDENTITY),
    "geopotential": ("gh", _PL_GPM_TO_GEOPOTENTIAL),
}


# ---------------------------------------------------------------------------
# Static variables (no time dim)
# ---------------------------------------------------------------------------
#
# These aren't in standard GFS pgrb2 files. They're loaded from a small ERA5
# snapshot cached in the repo. See static_vars.py.

# ERA5 names of the time-independent fields.
STATIC_VARS: Tuple[str, ...] = ("geopotential_at_surface", "land_sea_mask")


# ---------------------------------------------------------------------------
# Pressure levels
# ---------------------------------------------------------------------------
#
# GraphCast operational uses 13 levels. GenCast 1.0° uses the same 13 (the
# checkpoint's task_config lists them explicitly). Both models' task_configs
# carry the level list, so the fetcher never hard-codes it — but the levels
# here are what we select DOWN to from GFS's richer level set.
#
# GFS pgrb2.0p25 natively provides: 10, 20, 30, 40, 50, 70, 100, 150, 200,
# 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 925,
# 950, 975, 1000 hPa — all 13 GraphCast levels are in there.

# Levels in hPa, in ascending order of pressure.
GRAPHCAST_PRESSURE_LEVELS: Tuple[int, ...] = (
    # 50 hPa spacing
    50, 100, 150, 200, 250, 300,
    # 100 hPa spacing
    400, 500, 600, 700,
    # irregular spacing down to 1000 hPa
    850, 925, 1000,
)


# ---------------------------------------------------------------------------
# Convenience helpers
# ---------------------------------------------------------------------------

def gfs_short_name(era5_name: str) -> str:
    """Look up the GFS short-name mapped to an ERA5 variable name.

    The surface table is consulted first, then the pressure-level table;
    the short-name is the first element of the matching table entry.

    Args:
        era5_name: ERA5 canonical variable name, e.g. ``"2m_temperature"``.

    Returns:
        The short-name stored for ``era5_name``.

    Raises:
        KeyError: if ``era5_name`` is in neither table.
    """
    for table in (SURFACE_VARS, PRESSURE_LEVEL_VARS):
        try:
            entry = table[era5_name]
        except KeyError:
            # Not in this table; fall through to the next one.
            continue
        return entry[0]
    raise KeyError(f"No GFS mapping for ERA5 variable {era5_name!r}")


def unit_convert(era5_name: str, gfs_values):
    """Convert GFS values to ERA5 units using the stored (scale, offset) pair.

    Args:
        era5_name: ERA5 canonical variable name; selects which conversion
            to apply. The surface table takes precedence over the
            pressure-level table.
        gfs_values: a value or array supporting ``*`` and ``+`` with floats.

    Returns:
        ``gfs_values * scale + offset`` for the conversion registered under
        ``era5_name``.

    Raises:
        KeyError: if ``era5_name`` is in neither table.
    """
    # First table that knows this variable, or None when neither does.
    owning_table = next(
        (
            table
            for table in (SURFACE_VARS, PRESSURE_LEVEL_VARS)
            if era5_name in table
        ),
        None,
    )
    if owning_table is None:
        raise KeyError(f"No unit conversion for {era5_name!r}")

    factor, shift = owning_table[era5_name][1]
    return gfs_values * factor + shift


def all_era5_names() -> Tuple[str, ...]:
    """Return every ERA5 canonical name covered by the mapping tables.

    Surface names come first, followed by pressure-level names, each group
    in the insertion order of its table.
    """
    return (*SURFACE_VARS, *PRESSURE_LEVEL_VARS)