{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae6d5585",
   "metadata": {},
   "outputs": [],
   "source": [
    "import cv2\n",
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "import rasterio\n",
    "from skimage.segmentation import mark_boundaries, slic, felzenszwalb, quickshift\n",
    "from skimage.color import label2rgb\n",
    "import pandas as pd\n",
    "from skimage.measure import regionprops_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68a3d89e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize_band(img, mean, std):\n",
    "    \"\"\"Min-max normalize a single band into [0, 1] using mean +/- 2*std as the value range.\"\"\"\n",
    "    min_v = mean - 2 * std\n",
    "    max_v = mean + 2 * std\n",
    "    # Epsilon avoids division by zero for constant bands (std == 0).\n",
    "    img = (img - min_v) / (max_v - min_v + 1e-6)\n",
    "    return np.clip(img, 0, 1).astype(np.float32)\n",
    "\n",
    "\n",
    "band_stats_file = \"band_stats.json\"\n",
    "\n",
    "# Fail fast: without the precomputed per-band statistics the normalization below cannot run.\n",
    "if not os.path.exists(band_stats_file):\n",
    "    raise FileNotFoundError(f\"Band statistics file '{band_stats_file}' not found.\")\n",
    "with open(band_stats_file, \"r\") as f:\n",
    "    band_stats = json.load(f)\n",
    "\n",
    "\n",
    "def normalize_image(s1, s2, dem, band_stats):\n",
    "    \"\"\"Normalize each band of the stacked S1/S2/DEM image using precomputed statistics.\n",
    "\n",
    "    Each input is an (H, W, bands) array; the result is a single (H, W, total_bands)\n",
    "    float32 array with all bands clipped to [0, 1].\n",
    "    \"\"\"\n",
    "    normalized_bands = []\n",
    "    for key, img in ((\"S1\", s1), (\"S2\", s2), (\"DEM\", dem)):\n",
    "        for i in range(img.shape[2]):\n",
    "            band_mean = band_stats[key]['mean'][i]\n",
    "            band_std = band_stats[key]['std'][i]\n",
    "            normalized_bands.append(normalize_band(img[:, :, i], band_mean, band_std))\n",
    "    return np.stack(normalized_bands, axis=-1)\n",
    "\n",
    "\n",
    "def load_image(location, date, folder, load_mask=False):\n",
    "    \"\"\"Load, stack and normalize the S1/S2/DEM rasters for one location and date.\n",
    "\n",
    "    Returns the normalized (H, W, bands) image, plus the binary lake mask when\n",
    "    load_mask is True. Returns None if any required file is missing.\n",
    "    \"\"\"\n",
    "    s1_path = f\"{folder}/{location}/{date}_{location}_s1.tif\"\n",
    "    s2_path = f\"{folder}/{location}/{date}_{location}_s2.tif\"\n",
    "    dem_path = f\"{folder}/{location}/{location}_dem.tif\"\n",
    "    mask_path = f\"{folder}/{location}/{date}_{location}_lake_mask.tif\" if load_mask else None\n",
    "\n",
    "    required = [s1_path, s2_path, dem_path] + ([mask_path] if load_mask else [])\n",
    "    for path in required:\n",
    "        if not os.path.exists(path):\n",
    "            print(f\"The file {path} does not exist.\")\n",
    "            return None\n",
    "\n",
    "    def read_hwc(path):\n",
    "        # Context manager closes the dataset; bands are moved to the last axis (H, W, C).\n",
    "        with rasterio.open(path) as src:\n",
    "            return np.moveaxis(src.read(), 0, -1)\n",
    "\n",
    "    normalized_image = normalize_image(read_hwc(s1_path), read_hwc(s2_path), read_hwc(dem_path), band_stats)\n",
    "\n",
    "    if load_mask:\n",
    "        with rasterio.open(mask_path) as src:\n",
    "            mask = src.read(1)\n",
    "        # Any positive value counts as lake; works for masks coded 0/1 as well as 0/255.\n",
    "        mask = (mask > 0).astype(np.uint8)\n",
    "        return normalized_image, mask\n",
    "\n",
    "    return normalized_image\n",
    "\n",
    "\n",
    "def extract_segment_features(image_s2, segments, mask):\n",
    "    \"\"\"Compute per-segment spectral, shape and label features.\n",
    "\n",
    "    image_s2: (H, W, bands) normalized image; segments: integer label image from a\n",
    "    superpixel algorithm; mask: binary ground-truth lake mask. Returns a DataFrame\n",
    "    with one row per segment.\n",
    "    \"\"\"\n",
    "    num_bands = image_s2.shape[2]\n",
    "    rows = []\n",
    "\n",
    "    # Shape descriptors for all segments in one vectorized pass.\n",
    "    props_table = regionprops_table(\n",
    "        segments,\n",
    "        properties=('label', 'perimeter', 'eccentricity', 'solidity')\n",
    "    )\n",
    "    props_df = pd.DataFrame(props_table).set_index('label')\n",
    "\n",
    "    for segment_id in np.unique(segments):\n",
    "        segment_mask = segments == segment_id\n",
    "\n",
    "        # Skip empty segments defensively\n",
    "        if not np.any(segment_mask):\n",
    "            continue\n",
    "\n",
    "        segment_data = {'segment_id': segment_id}\n",
    "\n",
    "        # ---- Spectral features ----\n",
    "        for band in range(num_bands):\n",
    "            band_data = image_s2[:, :, band][segment_mask]\n",
    "\n",
    "            if band_data.size == 0:\n",
    "                continue\n",
    "\n",
    "            segment_data[f'b{band+1}_mean'] = band_data.mean()\n",
    "            segment_data[f'b{band+1}_median'] = np.median(band_data)\n",
    "            segment_data[f'b{band+1}_std'] = band_data.std()\n",
    "            segment_data[f'b{band+1}_min'] = band_data.min()\n",
    "            segment_data[f'b{band+1}_max'] = band_data.max()\n",
    "\n",
    "            s = pd.Series(band_data)\n",
    "            segment_data[f'b{band+1}_skew'] = s.skew()\n",
    "            segment_data[f'b{band+1}_kurtosis'] = s.kurtosis()\n",
    "\n",
    "            segment_data[f'b{band+1}_energy'] = np.mean(band_data ** 2)\n",
    "\n",
    "            # Shannon entropy of the normalized histogram (bands are in [0, 1]).\n",
    "            hist, _ = np.histogram(band_data, bins=256, range=(0, 1), density=True)\n",
    "            hist = hist[hist > 0]\n",
    "            segment_data[f'b{band+1}_entropy'] = -np.sum(hist * np.log2(hist)) if hist.size else 0.0\n",
    "\n",
    "        # ---- Shape features (once per segment) ----\n",
    "        if segment_id in props_df.index:\n",
    "            row = props_df.loc[segment_id]\n",
    "            segment_data['perimeter'] = row.perimeter\n",
    "            segment_data['eccentricity'] = row.eccentricity\n",
    "            segment_data['solidity'] = row.solidity\n",
    "        else:\n",
    "            segment_data['perimeter'] = 0.0\n",
    "            segment_data['eccentricity'] = 0.0\n",
    "            segment_data['solidity'] = 0.0\n",
    "\n",
    "        # ---- Mask label: majority vote of the ground-truth pixels in the segment ----\n",
    "        mask_data = mask[segment_mask]\n",
    "        if mask_data.size > 0:\n",
    "            majority = np.bincount(mask_data).argmax()\n",
    "            segment_data['mask_value'] = 0 if majority == 0 else 1\n",
    "        else:\n",
    "            segment_data['mask_value'] = 0\n",
    "\n",
    "        rows.append(segment_data)\n",
    "\n",
    "    return pd.DataFrame(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "647d0096",
   "metadata": {},
   "outputs": [],
   "source": [
    "image, mask = load_image(\"Allalin\", \"20230720\", \"../dataset\", load_mask=True)\n",
    "\n",
    "fig, ax = plt.subplots(1, 1, figsize=(10, 10))\n",
    "# Bands 8/4/12 = B08 (NIR), B04 (red), NDWI -- see the band table at the bottom of the notebook.\n",
    "ax.imshow(image[:, :, [8, 4, 12]])\n",
    "ax.set_title('False-color composite (NIR / Red / NDWI)')\n",
    "ax.axis('off')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5a75173",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"train_segments_features.csv\")\n",
    "validation_df = pd.read_csv(\"validation_segments_features.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "369519bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a balanced training set with equal numbers of lake and non-lake segments.\n",
    "# Replace NaN (e.g. undefined skew/kurtosis on tiny segments) with 0.\n",
    "df = df.fillna(0)\n",
    "validation_df = validation_df.fillna(0)\n",
    "\n",
    "lake_df = df[df['mask_value'] == 1]\n",
    "non_lake_df = df[df['mask_value'] == 0]\n",
    "non_lake_df_sampled = non_lake_df.sample(n=len(lake_df), random_state=42)\n",
    "balanced_df = pd.concat([lake_df, non_lake_df_sampled], ignore_index=True)\n",
    "balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
    "\n",
    "# Columns that identify a segment rather than describe it.\n",
    "NON_FEATURE_COLS = ['segment_id', 'location', 'date', 'mask_value']\n",
    "\n",
    "X = balanced_df[[col for col in balanced_df.columns if col not in NON_FEATURE_COLS]]\n",
    "y = balanced_df['mask_value']\n",
    "\n",
    "val_X = validation_df[[col for col in validation_df.columns if col not in NON_FEATURE_COLS]]\n",
    "val_y = validation_df['mask_value']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dba1ada5",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Lake segments: {len(lake_df)}, Non-lake segments (sampled): {len(non_lake_df_sampled)}, Total balanced segments: {len(balanced_df)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64ba7546",
   "metadata": {},
   "outputs": [],
   "source": [
    "from imblearn.over_sampling import SMOTE\n",
    "\n",
    "# NOTE: this replaces the undersampled X/y from the cell above with a SMOTE-oversampled set.\n",
    "sm = SMOTE(random_state=42)\n",
    "X, y = sm.fit_resample(df.drop(columns=['segment_id', 'location', 'date', 'mask_value']), df['mask_value'])\n",
    "\n",
    "# SMOTE appends the synthetic minority samples at the end, so shuffle before\n",
    "# truncating -- taking the first 10000 rows unshuffled would keep almost only\n",
    "# the original (imbalanced) rows.\n",
    "resampled = pd.concat([X, y], axis=1).sample(frac=1, random_state=42).head(10000)\n",
    "X = resampled.drop(columns=['mask_value'])\n",
    "y = resampled['mask_value']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be1b30ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train several classifiers to predict mask_value from the segment features\n",
    "# and evaluate each one on the validation split.\n",
    "from sklearn.metrics import classification_report, confusion_matrix\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn import svm\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score\n",
    "\n",
    "# Seed the stochastic estimators so results are reproducible across runs.\n",
    "clfs = {\n",
    "    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),\n",
    "    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),\n",
    "    'GaussianNB': GaussianNB(),\n",
    "    'SVM': svm.SVC(random_state=42),\n",
    "    'mlp': MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=42)\n",
    "}\n",
    "\n",
    "scores = {}\n",
    "\n",
    "def score(y_true, y_pred):\n",
    "    \"\"\"Return the binary classification metrics used to compare classifiers.\"\"\"\n",
    "    return {\n",
    "        'f1_score': f1_score(y_true, y_pred, average='binary'),\n",
    "        'accuracy': accuracy_score(y_true, y_pred),\n",
    "        'precision': precision_score(y_true, y_pred, average='binary'),\n",
    "        'recall': recall_score(y_true, y_pred, average='binary'),\n",
    "    }\n",
    "\n",
    "# Sanity check: report any remaining NaNs in the feature matrix.\n",
    "for col in X.columns:\n",
    "    num_nan = X[col].isna().sum()\n",
    "    if num_nan > 0:\n",
    "        print(f\"{col}: {num_nan}\")\n",
    "\n",
    "for name, clf in clfs.items():\n",
    "    clf.fit(X, y)\n",
    "    val_y_pred = clf.predict(val_X)\n",
    "\n",
    "    scores[name] = score(val_y, val_y_pred)\n",
    "\n",
    "    print(f\"Classifier: {name}\")\n",
    "    print(\"Confusion Matrix:\")\n",
    "    print(confusion_matrix(val_y, val_y_pred))\n",
    "    print(\"Classification Report:\")\n",
    "    print(classification_report(val_y, val_y_pred))\n",
    "    print(\"-\" * 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd477409",
   "metadata": {},
   "outputs": [],
   "source": [
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94360582",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Number of lake segments in full training set: {df['mask_value'].sum()} - {len(df)} total segments\")\n",
    "print(f\"Number of lake segments in balanced training set: {balanced_df['mask_value'].sum()} - {len(balanced_df)} total segments\")\n",
    "print(f\"Number of lake segments in validation set: {validation_df['mask_value'].sum()} - {len(validation_df)} total segments\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3972150d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the 10 most important features according to the random forest.\n",
    "rf_clf = clfs['RandomForest']\n",
    "importances = rf_clf.feature_importances_\n",
    "feature_names = X.columns\n",
    "\n",
    "print(\"Feature Importances:\")\n",
    "ordered_indices = np.argsort(importances)[::-1]\n",
    "for idx in ordered_indices[:10]:\n",
    "    print(f\"{feature_names[idx]}: {importances[idx]:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf5afbee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Segment an unseen scene, predict a per-segment lake label and overlay the result.\n",
    "image, mask = load_image(\"PleineMorte\", \"20230618\", \"../dataset\", True)\n",
    "\n",
    "# Segment on a NIR / Red / NDWI composite (see the band table below).\n",
    "all_segments = quickshift(image[:, :, [8, 4, 12]], kernel_size=3, max_dist=6, ratio=0.5)\n",
    "\n",
    "features = extract_segment_features(image, all_segments, mask)\n",
    "features = features.fillna(0)\n",
    "\n",
    "gb_clf = clfs['GradientBoosting']\n",
    "features_X = features[[col for col in features.columns if col not in ['segment_id', 'mask_value']]]\n",
    "features['predicted_mask_value'] = gb_clf.predict(features_X)\n",
    "\n",
    "print(f\"Get number of lake segments in test image: {features['mask_value'].sum()} out of {len(features)} segments\")\n",
    "\n",
    "# Paint every segment with its predicted label.\n",
    "predicted_mask_segment = np.zeros((image.shape[0], image.shape[1]), dtype=np.int8)\n",
    "for seg_id in np.unique(all_segments):\n",
    "    seg_mask = all_segments == seg_id\n",
    "    if seg_mask.sum() == 0:\n",
    "        continue\n",
    "    segment_row = features[features['segment_id'] == seg_id]\n",
    "    if segment_row.empty:\n",
    "        continue\n",
    "    predicted_mask_segment[seg_mask] = segment_row['predicted_mask_value'].values[0]\n",
    "\n",
    "# Blend the RGB composite with a red overlay of the predicted lake segments.\n",
    "empty_img = np.zeros((image.shape[0], image.shape[1]), dtype=np.int8)\n",
    "plot_img = (image[:, :, [4, 3, 2]] * 255).astype(np.uint8)\n",
    "plot_img = plot_img * 0.7 + np.stack([predicted_mask_segment * 255, empty_img, empty_img], axis=-1) * 0.3\n",
    "plot_img = plot_img.astype(np.uint8)\n",
    "\n",
    "fig, ax = plt.subplots(1, 1, figsize=(10, 10))\n",
    "ax.imshow(plot_img)\n",
    "ax.set_title('Predicted Mask Segments')\n",
    "ax.axis('off')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2af0989b",
   "metadata": {},
   "source": [
    "Band index reference for the normalized image stack:\n",
    "\n",
    "| Index | Band |\n",
    "|-------|------|\n",
    "| 0 | vv |\n",
    "| 1 | vh |\n",
    "| 2 | b02 |\n",
    "| 3 | b03 |\n",
    "| 4 | b04 |\n",
    "| 5 | b05 |\n",
    "| 6 | b06 |\n",
    "| 7 | b07 |\n",
    "| 8 | b08 |\n",
    "| 9 | b08a |\n",
    "| 10 | b11 |\n",
    "| 11 | b12 |\n",
    "| 12 | ndwi |\n",
    "| 13 | ndsi |\n",
    "| 14 | dem |"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}