{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "ae6d5585", "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "from matplotlib import pyplot as plt\n", "import numpy as np\n", "from sklearn.decomposition import PCA\n", "from skimage.measure import regionprops_table\n", "import os\n", "import numpy as np\n", "import rasterio\n", "from skimage.segmentation import quickshift\n", "import pandas as pd\n", "import tqdm " ] }, { "cell_type": "code", "execution_count": null, "id": "68a3d89e", "metadata": {}, "outputs": [], "source": [ "def normalize_band(img, mean, std):\n", " \"\"\"Min-max normalization using mean ± 2sigma .\"\"\"\n", " min_v = mean - 2 * std\n", " max_v = mean + 2 * std\n", " img = (img - min_v) / (max_v - min_v + 1e-6)\n", " return np.clip(img, 0, 1).astype(np.float32)\n", "\n", "band_stats_file = \"band_stats.json\"\n", "\n", "if os.path.exists(band_stats_file):\n", " with open(band_stats_file, \"r\") as f:\n", " band_stats = json.load(f)\n", "\n", "def nomralize_image(s1, s2, dem, band_stats):\n", " \"\"\"Normalize each band of the image using precomputed statistics.\"\"\"\n", " normalized_bands = []\n", " for i in range(s1.shape[2]):\n", " band_mean = band_stats['S1']['mean'][i]\n", " band_std = band_stats['S1']['std'][i]\n", " normalized_band = normalize_band(s1[:, :, i], band_mean, band_std)\n", " normalized_bands.append(normalized_band)\n", "\n", " for i in range(s2.shape[2]):\n", " band_mean = band_stats['S2']['mean'][i]\n", " band_std = band_stats['S2']['std'][i]\n", " normalized_band = normalize_band(s2[:, :, i], band_mean, band_std)\n", " normalized_bands.append(normalized_band)\n", "\n", " for i in range(dem.shape[2]):\n", " band_mean = band_stats['DEM']['mean'][i]\n", " band_std = band_stats['DEM']['std'][i]\n", " normalized_band = normalize_band(dem[:, :, i], band_mean, band_std)\n", " normalized_bands.append(normalized_band)\n", "\n", " return np.stack(normalized_bands, axis=-1)\n", "\n", "\n", 
"def load_image(location, date, folder):\n", " s1 = f\"{folder}/{location}/{date}_{location}_s1.tif\"\n", " s2 = f\"{folder}/{location}/{date}_{location}_s2.tif\"\n", " dem = f\"{folder}/{location}/{location}_dem.tif\"\n", " \n", " if not os.path.exists(s1):\n", " print(f\"The file {s1} does not exist.\")\n", " return None\n", " if not os.path.exists(s2):\n", " print(f\"The file {s2} does not exist.\")\n", " return None\n", " if not os.path.exists(dem):\n", " print(f\"The file {dem} does not exist.\")\n", " return None\n", " \n", " s1 = rasterio.open(s1).read()\n", " s1 = np.moveaxis(s1, 0, -1)\n", " s2 = rasterio.open(s2).read()\n", " s2 = np.moveaxis(s2, 0, -1)\n", " dem = rasterio.open(dem).read()\n", " dem = np.moveaxis(dem, 0, -1)\n", "\n", " bands = []\n", " for i in range(s1.shape[2]):\n", " bands.append(s1[:, :, i])\n", " for i in range(s2.shape[2]):\n", " bands.append(s2[:, :, i])\n", " for i in range(dem.shape[2]):\n", " bands.append(dem[:, :, i])\n", "\n", " normalized_image = nomralize_image(s1, s2, dem, band_stats)\n", "\n", " # normalized_image = np.stack(bands, axis=-1)\n", " return normalized_image" ] }, { "cell_type": "code", "execution_count": null, "id": "647d0096", "metadata": {}, "outputs": [], "source": [ "\n", "image = load_image(\"Aletsch\", \"20230618\", \"../dataset\")\n", "# image = load_image(\"Allalin\", \"20230720\", \"../dataset\")\n", "# image = load_image(\"Zmutt\", \"20230720\", \"../dataset\")\n", "image = load_image(\"Saas-Tal\", \"20231016\", \"../dataset\")\n", "fig, ax = plt.subplots(1, 1, figsize=(10, 10))\n", "ax.imshow(image[:, :, [8, 4, 12]])\n", "ax.set_title('Superpixel Boundaries on Original Image')\n", "ax.axis('off')\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "34be574b", "metadata": {}, "outputs": [], "source": [ "\n", "# remove nan values\n", "image = np.nan_to_num(image)\n", "\n", "H, W, B = image.shape \n", "X = image.reshape(-1, B).astype(np.float32)\n", 
"\n", "pca = PCA(n_components=3, svd_solver='randomized', whiten=False)\n", "X_pca = pca.fit_transform(X)\n", "image_pca = X_pca.reshape(H, W, 3)\n", "\n", "\n", "def normalize(img):\n", " img_min = img.min(axis=(0,1), keepdims=True)\n", " img_max = img.max(axis=(0,1), keepdims=True)\n", " return (img - img_min) / (img_max - img_min + 1e-8)\n", "\n", "image_pca_norm = normalize(image_pca)\n", "\n", "fig, ax = plt.subplots(1, 1, figsize=(10, 10))\n", "ax.imshow(image_pca_norm)\n", "ax.set_title('Superpixel Boundaries on PCA Image')\n", "ax.axis('off')\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "34deeb23", "metadata": {}, "outputs": [], "source": [ "print(\"Explained variance ratio:\", pca.explained_variance_ratio_)\n", "print(\"Total variance retained:\", pca.explained_variance_ratio_.sum())" ] }, { "cell_type": "code", "execution_count": null, "id": "47e1d4a8", "metadata": {}, "outputs": [], "source": [ "def compute_pca(image, n_components=3):\n", " H, W, B = image.shape \n", " X = image.reshape(-1, B).astype(np.float32)\n", "\n", " pca = PCA(n_components=n_components, svd_solver='randomized', whiten=False)\n", " X_pca = pca.fit_transform(X)\n", " image_pca = X_pca.reshape(H, W, n_components)\n", " \n", " return image_pca" ] }, { "cell_type": "code", "execution_count": null, "id": "92fa64b6", "metadata": {}, "outputs": [], "source": [ "chanels = [\n", " image[:, :, [4,3,2]],\n", " image[:, :, [8, 4, 12]],\n", " compute_pca(image, n_components=3)\n", "]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "670feb88", "metadata": {}, "outputs": [], "source": [ "\n", "def extract_segment_features(image_s2, segments, mask):\n", " num_bands = image_s2.shape[2]\n", " rows = []\n", "\n", " props_table = regionprops_table(\n", " segments,\n", " properties=('label', 'perimeter', 'eccentricity', 'solidity')\n", " )\n", " props_df = pd.DataFrame(props_table).set_index('label')\n", "\n", " for segment_id in 
np.unique(segments):\n", "        segment_mask = segments == segment_id\n", "\n", "        # Skip empty segments defensively\n", "        if not np.any(segment_mask):\n", "            continue\n", "\n", "        segment_data = {'segment_id': segment_id}\n", "\n", "        # ---- Spectral features ----\n", "        for band in range(num_bands):\n", "            band_data = image_s2[:, :, band][segment_mask]\n", "\n", "            if band_data.size == 0:\n", "                continue\n", "\n", "            # First-order statistics of this band's values inside the segment.\n", "            segment_data[f'b{band+1}_mean'] = band_data.mean()\n", "            segment_data[f'b{band+1}_median'] = np.median(band_data)\n", "            segment_data[f'b{band+1}_std'] = band_data.std()\n", "            segment_data[f'b{band+1}_min'] = band_data.min()\n", "            segment_data[f'b{band+1}_max'] = band_data.max()\n", "\n", "            # pandas skew/kurtosis (bias-corrected; NaN for very small segments).\n", "            s = pd.Series(band_data)\n", "            segment_data[f'b{band+1}_skew'] = s.skew()\n", "            segment_data[f'b{band+1}_kurtosis'] = s.kurtosis()\n", "\n", "            segment_data[f'b{band+1}_energy'] = np.mean(band_data ** 2)\n", "\n", "            # Shannon entropy of a 256-bin histogram; range=(0, 1) assumes the\n", "            # normalized stack built upstream -- values outside are dropped.\n", "            hist, _ = np.histogram(band_data, bins=256, range=(0, 1), density=True)\n", "            hist = hist[hist > 0]\n", "            segment_data[f'b{band+1}_entropy'] = -np.sum(hist * np.log2(hist)) if hist.size else 0.0\n", "\n", "        # ---- Shape features (once per segment) ----\n", "        # Fallback zeros for ids absent from props_df (e.g. label 0, which\n", "        # regionprops treats as background).\n", "        if segment_id in props_df.index:\n", "            row = props_df.loc[segment_id]\n", "            segment_data['perimeter'] = row.perimeter\n", "            segment_data['eccentricity'] = row.eccentricity\n", "            segment_data['solidity'] = row.solidity\n", "        else:\n", "            segment_data['perimeter'] = 0.0\n", "            segment_data['eccentricity'] = 0.0\n", "            segment_data['solidity'] = 0.0\n", "\n", "        # ---- Mask label ----\n", "        # Majority vote of the ground-truth mask inside the segment, binarized.\n", "        mask_data = mask[segment_mask]\n", "        if mask_data.size > 0:\n", "            majority = np.bincount(mask_data).argmax()\n", "            segment_data['mask_value'] = 0 if majority == 0 else 1\n", "        else:\n", "            segment_data['mask_value'] = 0\n", "\n", "        rows.append(segment_data)\n", "\n", "    return pd.DataFrame(rows)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "75f7c343", "metadata": {}, "outputs": [], "source": [ "\n", "train_locs = 
[\"Aletsch\", \"Rhone\", \"Gorner\", \"Allalin\", \"Anzere\", \"Diablerets\", \"Gorbassiere\", \"Moiry\", \"Saas-Tal\"]\n", "val_locs = [\"PleineMorte\", \"Zmutt\"]\n", "\n", "def create_dataset(locations, folder=\"\"):\n", " df = pd.DataFrame()\n", "\n", " for location in locations:\n", " dates = os.listdir(f\"{folder}/{location}/\")\n", " dates = [date.split(\"_\")[0] for date in dates if date.endswith(\"_lake_mask.tif\")]\n", " print(f\"Processing location: {location}\")\n", " for date in tqdm.tqdm(dates): \n", " mask_pth = f\"{folder}/{location}/{date}_{location}_lake_mask.tif\"\n", " image = load_image(location, date, folder)\n", " \n", " image = np.nan_to_num(image)\n", " image_superpixel = image[:, :, [8, 4, 12]]\n", " mask = rasterio.open(mask_pth).read(1)\n", " mask = mask > 1\n", " mask = mask.astype(np.uint8) \n", " segements = quickshift(image_superpixel, kernel_size=3, max_dist=6, ratio=0.5)\n", " print(f\"Number of segments: {len(np.unique(segements))}\")\n", " df_location_date = extract_segment_features(image, segements, mask)\n", " df_location_date['location'] = location\n", " df_location_date['date'] = date\n", " df = pd.concat([df, df_location_date], ignore_index=True)\n", " print(f\"Number of lakes in {location} on {date}: {df_location_date['mask_value'].sum()}\")\n", " return df\n", "\n", "df = create_dataset(train_locs, folder=\"../dataset/\")\n", "validation_df = create_dataset(val_locs, folder=\"../dataset/\")" ] }, { "cell_type": "code", "execution_count": null, "id": "17970e63", "metadata": {}, "outputs": [], "source": [ "# save the dataframes\n", "df.to_csv(\"train_segments_features.csv\", index=False)\n", "validation_df.to_csv(\"validation_segments_features.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 5 }