# %%
import xarray as xr
import geopandas as gpd
from shapely.geometry import box
import rioxarray as rxr  # requires rioxarray (pip install rioxarray)
import numpy as np
import ibis
ibis.options.interactive = True

# %%
# In-memory DuckDB connection with the spatial extension installed and loaded
# before the point parquet (which has a geometry column) is read through it.
con = ibis.duckdb.connect()
con.raw_sql('INSTALL spatial;')
con.raw_sql('LOAD spatial;')

# %% [markdown]
# - The .rio accessor: https://corteva.github.io/rioxarray/html/rioxarray.html#rioxarray-rio-accessors
#
# - Affine(pixel_width, 0, top_left_x_coord,
#          0, -pixel_height, top_left_y_coord)
#
# - Rasterio Affine docs (https://affine.readthedocs.io/en/latest/)

# %%
# Open the temperature raster through rioxarray so CRS and affine transform
# are available via the .rio accessor.
filename = 'era5land_era5.nc'
sat_im = rxr.open_rasterio(filename)
#sat_im = sat_im.rio.reproject("EPSG:3031")
transform = sat_im.rio.transform()
print(sat_im)

# %%
print(sat_im.rio.crs)

# %%
tab = con.read_parquet('punkter_til_CNN.parquet')

# %%
# CRS of the point coordinates. The POINT geometries and the EAST/NORTH columns
# hold values of order 1e6, i.e. projected metres, so they must not be labelled
# EPSG:4326 (degrees) as the previous version of this cell did.
# NOTE(review): EPSG:3031 (Antarctic Polar Stereographic) is assumed here --
# confirm against the CRS stored in the parquet metadata.
POINTS_CRS = "EPSG:3031"

data = tab

# Materialise the table locally, attach the CRS and build the spatial index.
gdf = data.to_pandas()
gdf = gdf.set_crs(POINTS_CRS, allow_override=True)
gdf_sindex = gdf.sindex

# %%
# The parquet read above was originally produced with:
#gdf.to_parquet("punkter_til_CNN.parquet")

# %%
# Show a preview only; rendering the full frame bloats the notebook.
gdf.head()

# %%
len(gdf)
# %%
import xarray as xr
import numpy as np
import geopandas as gpd
from shapely.geometry import Point


def nearest_index(grid, values, chunk=1024):
    """Return, for every entry of `values`, the index of the closest `grid` coordinate.

    Parameters
    ----------
    grid : array-like, 1-D
        Coordinate values of one grid axis (any ordering).
    values : array-like, 1-D
        Query positions along that axis.
    chunk : int, default 1024
        Number of queries compared against the grid at once; bounds the size
        of the temporary (chunk, len(grid)) distance matrix.

    Returns
    -------
    numpy.ndarray of int
        One index per query. Ties resolve to the lowest index, exactly as
        ``np.argmin(np.abs(grid - value))`` does for a single value.
    """
    grid = np.asarray(grid)
    values = np.asarray(values)
    nearest = np.empty(values.shape[0], dtype=np.intp)
    for start in range(0, values.shape[0], chunk):
        block = values[start:start + chunk]
        distance = np.abs(grid[np.newaxis, :] - block[:, np.newaxis])
        nearest[start:start + chunk] = distance.argmin(axis=1)
    return nearest


# %%
# Open the temperature data (plain xarray, no rioxarray needed).
ds = xr.open_dataset("era5land_era5.nc")
# Force (y, x) axis order so every patch is cut as rows=y, cols=x; this raises
# if t2m carries any dimension other than y and x.
t2m = ds["t2m"].transpose("y", "x")
# Read the grid into memory once instead of hitting the file for every point.
t2m_grid = t2m.values

# Load the points and reproject them to geographic coordinates so they can be
# matched against the lon/lat axes of the temperature grid.
gdf = gpd.read_parquet("punkter_til_CNN.parquet")
gdf = gdf.to_crs("EPSG:4326")

# Patch geometry: 27 pixels cover 2.7 deg x 2.7 deg.
size = 27
half = size // 2
pixel_deg = 0.1  # resolution per pixel in degrees

scalar_feats = ['THICK', 'vx', 'vy', 'v', 'smb', 'z', 's', 'temp', 'ith_bm', 'gridCellId']

# Nearest grid cell for every point, computed for all points at once.
x_idx = nearest_index(ds["x"].values, gdf.geometry.x.to_numpy())
y_idx = nearest_index(ds["y"].values, gdf.geometry.y.to_numpy())

# A point is usable only if the window of 2*half+1 pixels centred on its cell
# lies entirely inside the grid. Dataset.sizes is the name->length mapping
# (Dataset.dims is deprecated for that purpose).
n_x, n_y = ds.sizes["x"], ds.sizes["y"]
inside = (
    (x_idx - half >= 0) & (x_idx + half + 1 <= n_x)
    & (y_idx - half >= 0) & (y_idx + half + 1 <= n_y)
)

# One (y, x) patch per usable point, in the same order as the rows of gdf.
images = [
    t2m_grid[row - half:row + half + 1, col - half:col + half + 1]
    for row, col in zip(y_idx[inside].tolist(), x_idx[inside].tolist())
]
# Scalar features of the usable points, aligned with `images`.
im_data = {feat: gdf[feat].to_numpy()[inside] for feat in scalar_feats}

used = int(inside.sum())
skipped = int(len(gdf) - used)

print(f"Brugte {used} punkter, skippede {skipped}.")

# %%
if not images:
    raise ValueError("No point had a full patch inside the temperature grid; nothing to save.")

# Stack the patches into one (sample, y, x) array.
image_array = np.stack(images)

# Feature stored as the training label.
label_feature = "THICK"  # change this to use another feature as label

sample_coord = np.arange(image_array.shape[0])

# Image variable. Axis names follow the order the patches were cut in (y, x),
# and the pixel coordinates are derived from the array shape.
images_da = xr.DataArray(
    image_array,
    dims=["sample", "y", "x"],
    coords={
        "sample": sample_coord,
        "y": np.arange(image_array.shape[1]),
        "x": np.arange(image_array.shape[2]),
    },
    name="images",
)

# One 1-D variable per scalar feature.
scalar_data = {
    feat: xr.DataArray(
        np.asarray(im_data[feat]),
        dims=["sample"],
        coords={"sample": sample_coord},
        name=feat,
    )
    for feat in scalar_feats
}

# The label is stored as its own variable (a copy of `label_feature`).
labels_da = xr.DataArray(
    np.asarray(im_data[label_feature]),
    dims=["sample"],
    coords={"sample": sample_coord},
    name="labels",
)

# Combine everything into a single dataset.
final_ds = xr.Dataset(
    data_vars={
        "images": images_da,
        "labels": labels_da,
        **scalar_data,
    },
    attrs={
        "description": f"CNN data med temperaturbilleder og '{label_feature}' som labels."
    },
)

# Save as NetCDF.
final_ds.to_netcdf("conv_temp_ithbm_4326.nc")
print("✅ Gemte dataset med labels som 'conv_temp_ithbm_4326.nc'")

# %%
# Was `training_data_ds`, a name never defined in this notebook (NameError on a
# fresh kernel); the dataset built above is `final_ds`.
final_ds

# %%
final_ds.to_netcdf('conv_temp_1_01.nc')

# %%
# Read back the file written in the previous cell as a round-trip check.
# This used to open 'conv_temp.nc', which this notebook never writes.
test_import = xr.open_dataset('conv_temp_1_01.nc')

# %%
test_import