{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "ae6d5585", "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "from matplotlib import pyplot as plt\n", "import numpy as np\n", "from sklearn.decomposition import PCA\n", "from skimage.measure import regionprops_table\n", "import os\n", "import numpy as np\n", "import rasterio\n", "from skimage.segmentation import quickshift\n", "import pandas as pd\n", "import tqdm " ] }, { "cell_type": "code", "execution_count": null, "id": "68a3d89e", "metadata": {}, "outputs": [], "source": [ "def normalize_band(img, mean, std):\n", " \"\"\"Min-max normalization using mean ± 2sigma .\"\"\"\n", " min_v = mean - 2 * std\n", " max_v = mean + 2 * std\n", " img = (img - min_v) / (max_v - min_v + 1e-6)\n", " return np.clip(img, 0, 1).astype(np.float32)\n", "\n", "band_stats_file = \"band_stats.json\"\n", "\n", "if os.path.exists(band_stats_file):\n", " with open(band_stats_file, \"r\") as f:\n", " band_stats = json.load(f)\n", "\n", "def nomralize_image(s1, s2, dem, band_stats):\n", " \"\"\"Normalize each band of the image using precomputed statistics.\"\"\"\n", " normalized_bands = []\n", " for i in range(s1.shape[2]):\n", " band_mean = band_stats['S1']['mean'][i]\n", " band_std = band_stats['S1']['std'][i]\n", " normalized_band = normalize_band(s1[:, :, i], band_mean, band_std)\n", " normalized_bands.append(normalized_band)\n", "\n", " for i in range(s2.shape[2]):\n", " band_mean = band_stats['S2']['mean'][i]\n", " band_std = band_stats['S2']['std'][i]\n", " normalized_band = normalize_band(s2[:, :, i], band_mean, band_std)\n", " normalized_bands.append(normalized_band)\n", "\n", " for i in range(dem.shape[2]):\n", " band_mean = band_stats['DEM']['mean'][i]\n", " band_std = band_stats['DEM']['std'][i]\n", " normalized_band = normalize_band(dem[:, :, i], band_mean, band_std)\n", " normalized_bands.append(normalized_band)\n", "\n", " return np.stack(normalized_bands, axis=-1)\n", "\n", "\n", 
"def load_image(location, date, folder):\n", " s1 = f\"{folder}/{location}/{date}_{location}_s1.tif\"\n", " s2 = f\"{folder}/{location}/{date}_{location}_s2.tif\"\n", " dem = f\"{folder}/{location}/{location}_dem.tif\"\n", " \n", " if not os.path.exists(s1):\n", " print(f\"The file {s1} does not exist.\")\n", " return None\n", " if not os.path.exists(s2):\n", " print(f\"The file {s2} does not exist.\")\n", " return None\n", " if not os.path.exists(dem):\n", " print(f\"The file {dem} does not exist.\")\n", " return None\n", " \n", " s1 = rasterio.open(s1).read()\n", " s1 = np.moveaxis(s1, 0, -1)\n", " s2 = rasterio.open(s2).read()\n", " s2 = np.moveaxis(s2, 0, -1)\n", " dem = rasterio.open(dem).read()\n", " dem = np.moveaxis(dem, 0, -1)\n", "\n", " bands = []\n", " for i in range(s1.shape[2]):\n", " bands.append(s1[:, :, i])\n", " for i in range(s2.shape[2]):\n", " bands.append(s2[:, :, i])\n", " for i in range(dem.shape[2]):\n", " bands.append(dem[:, :, i])\n", "\n", " normalized_image = nomralize_image(s1, s2, dem, band_stats)\n", "\n", " # normalized_image = np.stack(bands, axis=-1)\n", " return normalized_image" ] }, { "cell_type": "code", "execution_count": null, "id": "647d0096", "metadata": {}, "outputs": [], "source": [ "\n", "image = load_image(\"Aletsch\", \"20230618\", \"../dataset\")\n", "# image = load_image(\"Allalin\", \"20230720\", \"../dataset\")\n", "# image = load_image(\"Zmutt\", \"20230720\", \"../dataset\")\n", "image = load_image(\"Saas-Tal\", \"20231016\", \"../dataset\")\n", "fig, ax = plt.subplots(1, 1, figsize=(10, 10))\n", "ax.imshow(image[:, :, [8, 4, 12]])\n", "ax.set_title('Superpixel Boundaries on Original Image')\n", "ax.axis('off')\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "34be574b", "metadata": {}, "outputs": [], "source": [ "\n", "# remove nan values\n", "image = np.nan_to_num(image)\n", "\n", "H, W, B = image.shape \n", "X = image.reshape(-1, B).astype(np.float32)\n", 
"\n", "pca = PCA(n_components=3, svd_solver='randomized', whiten=False)\n", "X_pca = pca.fit_transform(X)\n", "image_pca = X_pca.reshape(H, W, 3)\n", "\n", "\n", "def normalize(img):\n", " img_min = img.min(axis=(0,1), keepdims=True)\n", " img_max = img.max(axis=(0,1), keepdims=True)\n", " return (img - img_min) / (img_max - img_min + 1e-8)\n", "\n", "image_pca_norm = normalize(image_pca)\n", "\n", "fig, ax = plt.subplots(1, 1, figsize=(10, 10))\n", "ax.imshow(image_pca_norm)\n", "ax.set_title('Superpixel Boundaries on PCA Image')\n", "ax.axis('off')\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "34deeb23", "metadata": {}, "outputs": [], "source": [ "print(\"Explained variance ratio:\", pca.explained_variance_ratio_)\n", "print(\"Total variance retained:\", pca.explained_variance_ratio_.sum())" ] }, { "cell_type": "code", "execution_count": null, "id": "47e1d4a8", "metadata": {}, "outputs": [], "source": [ "def compute_pca(image, n_components=3):\n", " H, W, B = image.shape \n", " X = image.reshape(-1, B).astype(np.float32)\n", "\n", " pca = PCA(n_components=n_components, svd_solver='randomized', whiten=False)\n", " X_pca = pca.fit_transform(X)\n", " image_pca = X_pca.reshape(H, W, n_components)\n", " \n", " return image_pca" ] }, { "cell_type": "code", "execution_count": null, "id": "92fa64b6", "metadata": {}, "outputs": [], "source": [ "chanels = [\n", " image[:, :, [4,3,2]],\n", " image[:, :, [8, 4, 12]],\n", " compute_pca(image, n_components=3)\n", "]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "670feb88", "metadata": {}, "outputs": [], "source": [ "\n", "def extract_segment_features(image_s2, segments, mask):\n", " num_bands = image_s2.shape[2]\n", " rows = []\n", "\n", " props_table = regionprops_table(\n", " segments,\n", " properties=('label', 'perimeter', 'eccentricity', 'solidity')\n", " )\n", " props_df = pd.DataFrame(props_table).set_index('label')\n", "\n", " for segment_id in 
np.unique(segments):\n", "        segment_mask = segments == segment_id\n", "\n", "        # Skip empty segments defensively\n", "        if not np.any(segment_mask):\n", "            continue\n", "\n", "        segment_data = {'segment_id': segment_id}\n", "\n", "        # ---- Spectral features ----\n", "        for band in range(num_bands):\n", "            band_data = image_s2[:, :, band][segment_mask]\n", "\n", "            if band_data.size == 0:\n", "                continue\n", "\n", "            # First-order statistics of this band's values inside the segment.\n", "            segment_data[f'b{band+1}_mean'] = band_data.mean()\n", "            segment_data[f'b{band+1}_median'] = np.median(band_data)\n", "            segment_data[f'b{band+1}_std'] = band_data.std()\n", "            segment_data[f'b{band+1}_min'] = band_data.min()\n", "            segment_data[f'b{band+1}_max'] = band_data.max()\n", "\n", "            # pandas skew/kurtosis (bias-corrected; NaN for very small segments).\n", "            s = pd.Series(band_data)\n", "            segment_data[f'b{band+1}_skew'] = s.skew()\n", "            segment_data[f'b{band+1}_kurtosis'] = s.kurtosis()\n", "\n", "            segment_data[f'b{band+1}_energy'] = np.mean(band_data ** 2)\n", "\n", "            # Shannon entropy of a 256-bin histogram; range=(0, 1) assumes the\n", "            # normalized stack built upstream -- values outside are dropped.\n", "            hist, _ = np.histogram(band_data, bins=256, range=(0, 1), density=True)\n", "            hist = hist[hist > 0]\n", "            segment_data[f'b{band+1}_entropy'] = -np.sum(hist * np.log2(hist)) if hist.size else 0.0\n", "\n", "        # ---- Shape features (once per segment) ----\n", "        # Fallback zeros for ids absent from props_df (e.g. label 0, which\n", "        # regionprops treats as background).\n", "        if segment_id in props_df.index:\n", "            row = props_df.loc[segment_id]\n", "            segment_data['perimeter'] = row.perimeter\n", "            segment_data['eccentricity'] = row.eccentricity\n", "            segment_data['solidity'] = row.solidity\n", "        else:\n", "            segment_data['perimeter'] = 0.0\n", "            segment_data['eccentricity'] = 0.0\n", "            segment_data['solidity'] = 0.0\n", "\n", "        # ---- Mask label ----\n", "        # Majority vote of the ground-truth mask inside the segment, binarized.\n", "        mask_data = mask[segment_mask]\n", "        if mask_data.size > 0:\n", "            majority = np.bincount(mask_data).argmax()\n", "            segment_data['mask_value'] = 0 if majority == 0 else 1\n", "        else:\n", "            segment_data['mask_value'] = 0\n", "\n", "        rows.append(segment_data)\n", "\n", "    return pd.DataFrame(rows)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "75f7c343", "metadata": {}, "outputs": [], "source": [ "\n", "train_locs = 
[\"Aletsch\", \"Rhone\", \"Gorner\", \"Allalin\", \"Anzere\", \"Diablerets\", \"Gorbassiere\", \"Moiry\", \"Saas-Tal\"]\n", "val_locs = [\"PleineMorte\", \"Zmutt\"]\n", "\n", "def create_dataset(locations, folder=\"\"):\n", " df = pd.DataFrame()\n", "\n", " for location in locations:\n", " dates = os.listdir(f\"{folder}/{location}/\")\n", " dates = [date.split(\"_\")[0] for date in dates if date.endswith(\"_lake_mask.tif\")]\n", " print(f\"Processing location: {location}\")\n", " for date in tqdm.tqdm(dates): \n", " mask_pth = f\"{folder}/{location}/{date}_{location}_lake_mask.tif\"\n", " image = load_image(location, date, folder)\n", " \n", " image = np.nan_to_num(image)\n", " image_superpixel = image[:, :, [8, 4, 12]]\n", " mask = rasterio.open(mask_pth).read(1)\n", " mask = mask > 1\n", " mask = mask.astype(np.uint8) \n", " segements = quickshift(image_superpixel, kernel_size=3, max_dist=6, ratio=0.5)\n", " print(f\"Number of segments: {len(np.unique(segements))}\")\n", " df_location_date = extract_segment_features(image, segements, mask)\n", " df_location_date['location'] = location\n", " df_location_date['date'] = date\n", " df = pd.concat([df, df_location_date], ignore_index=True)\n", " print(f\"Number of lakes in {location} on {date}: {df_location_date['mask_value'].sum()}\")\n", " return df\n", "\n", "df = create_dataset(train_locs, folder=\"../dataset/\")\n", "validation_df = create_dataset(val_locs, folder=\"../dataset/\")" ] }, { "cell_type": "code", "execution_count": null, "id": "17970e63", "metadata": {}, "outputs": [], "source": [ "# save the dataframes\n", "df.to_csv(\"train_segments_features.csv\", index=False)\n", "validation_df.to_csv(\"validation_segments_features.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 5 }