Upload 2 files

Browse files

Files changed (2) hide show

dataset_generation.ipynb +750 -0
model_training.ipynb +0 -0

dataset_generation.ipynb ADDED Viewed

	@@ -0,0 +1,750 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Remove the rows with a brown background color from the Excel files\n",
+    "The Excel files are located in the `Data/Classification/labeled_data` folder of the MESCnn repository."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openpyxl import Workbook, load_workbook\n",
+    "import os \n",
+    "\n",
+    "path_to_excel = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n",
+    "\n",
+    "# Function to get the RGB value of a color\n",
+    "def get_rgb(color):\n",
+    "    return tuple(int(color[i:i+2], 16) for i in (0, 2, 4))\n",
+    "\n",
+    "# Rebuild every workbook, keeping only the rows in which at least one cell\n",
+    "# has a fill color stored as an explicit RGB value.\n",
+    "# NOTE(review): the rows that get dropped are presumably the ones whose fill\n",
+    "# uses a theme/indexed color instead of an RGB value - confirm on the files.\n",
+    "for file_name in os.listdir(path_to_excel):\n",
+    "    # Accept the extension in any letter case (.xlsx, .XLSX, ...)\n",
+    "    if not file_name.lower().endswith(\".xlsx\"):\n",
+    "        continue\n",
+    "    file_path = os.path.join(path_to_excel, file_name)\n",
+    "\n",
+    "    # Load the workbook and select its active sheet\n",
+    "    workbook = load_workbook(file_path)\n",
+    "    sheet = workbook.active\n",
+    "\n",
+    "    # Create the workbook that receives the kept rows\n",
+    "    new_workbook = Workbook()\n",
+    "    new_sheet = new_workbook.active\n",
+    "\n",
+    "    # Copy the cell values of every row that has at least one RGB-typed fill\n",
+    "    for row in sheet.iter_rows():\n",
+    "        if any(cell.fill.start_color.type == 'rgb' for cell in row):\n",
+    "            new_sheet.append([cell.value for cell in row])\n",
+    "\n",
+    "    # Overwrite the original file: only cell values are written back,\n",
+    "    # so the fill colors are gone after this step\n",
+    "    new_workbook.save(file_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Extract labeled data from excel files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "C1104066_JGI.XLSX\n",
+      "C1105034_JGI.XLSX\n",
+      "C1110748_JGI.xlsx\n",
+      "C1112141_JGI.XLSX\n",
+      "C1105798_JGI.xlsx\n",
+      "C1117893_JGI.xlsx\n",
+      "C1107892_JGI.xlsx\n",
+      "C1107752_JGI.xlsx\n",
+      "C1105642_JGI.XLSX\n",
+      "                                          Patch names          M    E    S  \\\n",
+      "0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg          0    0    1   \n",
+      "1   glomerulus C1104066 [142336, 49680, 744, 640]....          0    0  GGS   \n",
+      "2   glomerulus C1104066 [142772, 48280, 1100, 864]...          1    0    0   \n",
+      "3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg          0    0  GGS   \n",
+      "4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg          0    0    1   \n",
+      "..                                                ...        ...  ...  ...   \n",
+      "47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg  nan_label  noE  GGS   \n",
+      "48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg          1    0    1   \n",
+      "49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg  nan_label  noE  GGS   \n",
+      "50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg  nan_label  noE  GGS   \n",
+      "51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg  nan_label  noE  GGS   \n",
+      "\n",
+      "      C  \n",
+      "0     0  \n",
+      "1     0  \n",
+      "2     0  \n",
+      "3     0  \n",
+      "4     0  \n",
+      "..  ...  \n",
+      "47  noC  \n",
+      "48    0  \n",
+      "49  noC  \n",
+      "50  noC  \n",
+      "51  noC  \n",
+      "\n",
+      "[470 rows x 5 columns]\n",
+      "(470, 5)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "   \n",
+    "# Set the path to the labeled data directory\n",
+    "labeled_data_dir = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n",
+    "\n",
+    "# Get the list of excel files in the labeled data directory\n",
+    "excel_files = [file for file in os.listdir(labeled_data_dir) if file.endswith(\".xlsx\") or file.endswith(\".XLSX\")]\n",
+    "\n",
+    "# First-column values that mark the row right above the labels to keep,\n",
+    "# tried in this order; \"filename\" is the last resort.\n",
+    "corrected_markers = (\"CORRECTED\", \"Corrected\", \"CORRECTED JGI\", \"filename\")\n",
+    "\n",
+    "# One dataframe per excel file, concatenated once after the loop\n",
+    "frames = []\n",
+    "\n",
+    "# Iterate over the excel files\n",
+    "for file in excel_files:\n",
+    "    print(file)\n",
+    "    # Read the excel file\n",
+    "    df = pd.read_excel(os.path.join(labeled_data_dir, file))\n",
+    "\n",
+    "    if file == \"C1107752_JGI.xlsx\":\n",
+    "        # Hard-coded: per the original author this file raises an error\n",
+    "        # otherwise (cause unknown). TODO: investigate and remove.\n",
+    "        corrected_index = 61\n",
+    "    else:\n",
+    "        first_column = df.iloc[:, 0]\n",
+    "        for marker in corrected_markers:\n",
+    "            matches = df[first_column == marker].index\n",
+    "            if len(matches) > 0:\n",
+    "                corrected_index = matches[0]\n",
+    "                break\n",
+    "        else:\n",
+    "            # Fail with the file name instead of an opaque IndexError\n",
+    "            raise ValueError(f\"No marker row {corrected_markers} found in the first column of {file}\")\n",
+    "\n",
+    "    # Skip the rows up to and including the marker row\n",
+    "    df = df.iloc[corrected_index + 1:]\n",
+    "\n",
+    "    # Keep only the part of each patch name after the last '\\'\n",
+    "    patch_names = [name.split('\\\\')[-1] for name in df[\"filename\"].values]\n",
+    "\n",
+    "    # Create a dataframe for the current file\n",
+    "    frames.append(pd.DataFrame({\n",
+    "        \"Patch names\": patch_names,\n",
+    "        \"M\": df[\"M\"].values,\n",
+    "        \"E\": df[\"E\"].values,\n",
+    "        \"S\": df[\"S\"].values,\n",
+    "        \"C\": df[\"C\"].values\n",
+    "    }))\n",
+    "\n",
+    "# Concatenate once: pd.concat inside the loop re-copies all previous rows on\n",
+    "# every iteration. Row labels still restart at 0 for each file, as before.\n",
+    "combined_columns = [\"Patch names\", \"M\", \"E\", \"S\", \"C\"]\n",
+    "df_combined = pd.concat(frames) if frames else pd.DataFrame(columns=combined_columns)\n",
+    "\n",
+    "# Print the combined dataframe\n",
+    "print(df_combined)\n",
+    "print(df_combined.shape)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                          Patch names          M    E     S  \\\n",
+      "0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg        noM  noE   SGS   \n",
+      "1   glomerulus C1104066 [142336, 49680, 744, 640]....        noM  noE   GGS   \n",
+      "2   glomerulus C1104066 [142772, 48280, 1100, 864]...       yesM  noE  NoGS   \n",
+      "3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg        noM  noE   GGS   \n",
+      "4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg        noM  noE   SGS   \n",
+      "..                                                ...        ...  ...   ...   \n",
+      "47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg  nan_label  noE   GGS   \n",
+      "48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg       yesM  noE   SGS   \n",
+      "49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg  nan_label  noE   GGS   \n",
+      "50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg  nan_label  noE   GGS   \n",
+      "51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg  nan_label  noE   GGS   \n",
+      "\n",
+      "      C  \n",
+      "0   noC  \n",
+      "1   noC  \n",
+      "2   noC  \n",
+      "3   noC  \n",
+      "4   noC  \n",
+      "..  ...  \n",
+      "47  noC  \n",
+      "48  noC  \n",
+      "49  noC  \n",
+      "50  noC  \n",
+      "51  noC  \n",
+      "\n",
+      "[470 rows x 5 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Class name for every raw label value, per lesion type\n",
+    "mesc_def = {\n",
+    "    \"M\": {0: \"noM\", 1: \"yesM\"},\n",
+    "    \"E\": {0: \"noE\", 1: \"yesE\"},\n",
+    "    \"S\": {\"GGS\": \"GGS\", 0: \"NoGS\", 1: \"SGS\"},\n",
+    "    \"C\": {0: \"noC\", 1: \"yesC\"},\n",
+    "}\n",
+    "\n",
+    "# Translate each lesion column with its own mapping; values that are not\n",
+    "# keys of the mapping are left untouched\n",
+    "for lesion, mapping in mesc_def.items():\n",
+    "    df_combined[lesion] = df_combined[lesion].replace(mapping)\n",
+    "print(df_combined)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['yesE', 'noM', 'noE', 'NoGS', 10, 'yesC', 'noC', 'yesM', 'SGS', 'GGS', nan, 'nan_label']\n",
+      "                                          Patch names     M    E     S    C\n",
+      "0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg   noM  noE   SGS  noC\n",
+      "1   glomerulus C1104066 [142336, 49680, 744, 640]....   NaN  NaN   GGS  NaN\n",
+      "2   glomerulus C1104066 [142772, 48280, 1100, 864]...  yesM  noE  NoGS  noC\n",
+      "3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg   NaN  NaN   GGS  NaN\n",
+      "4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg   noM  noE   SGS  noC\n",
+      "..                                                ...   ...  ...   ...  ...\n",
+      "47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg   NaN  NaN   GGS  NaN\n",
+      "48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg  yesM  noE   SGS  noC\n",
+      "49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg   NaN  NaN   GGS  NaN\n",
+      "50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg   NaN  NaN   GGS  NaN\n",
+      "51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg   NaN  NaN   GGS  NaN\n",
+      "\n",
+      "[470 rows x 5 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "# Show which distinct values occur in the four label columns\n",
+    "labels = df_combined[['M', 'E', 'S', 'C']].values.flatten()\n",
+    "distinct_labels = list(set(labels))\n",
+    "print(distinct_labels)\n",
+    "\n",
+    "possible_labels = [\"noM\", \"yesM\", \"noE\", \"yesE\", \"GGS\", \"NoGS\", \"SGS\", \"noC\", \"yesC\", \"nan_label\"]\n",
+    "\n",
+    "\n",
+    "def keep_possible_labels(column):\n",
+    "    \"\"\"Return the column values, with NaN wherever the value is not a possible label.\"\"\"\n",
+    "    return np.where(column.isin(possible_labels), column, np.nan)\n",
+    "\n",
+    "\n",
+    "# Blank out every value of the M..C columns that is not a possible label\n",
+    "df_combined.loc[:, 'M':'C'] = df_combined.loc[:, 'M':'C'].apply(keep_possible_labels)\n",
+    "\n",
+    "# When S is \"GGS\", the M, E and C values of that row are discarded\n",
+    "is_ggs = df_combined[\"S\"] == \"GGS\"\n",
+    "df_combined.loc[is_ggs, [\"M\", \"E\", \"C\"]] = np.nan\n",
+    "\n",
+    "# Print the updated dataframe\n",
+    "print(df_combined)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                          Patch names     M    E     S    C\n",
+      "1   glomerulus C1104066 [142336, 49680, 744, 640]....   NaN  NaN   GGS  NaN\n",
+      "3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg   NaN  NaN   GGS  NaN\n",
+      "7    glomerulus C1104066 [8044, 62252, 752, 796].jpeg   NaN  NaN   GGS  NaN\n",
+      "15  glomerulus C1104066 [94652, 48228, 636, 644].jpeg   NaN  NaN   GGS  NaN\n",
+      "17  glomerulus C1105034 [150832, 29052, 600, 496]....   NaN  NaN   GGS  NaN\n",
+      "9   glomerulus C1110748 [129452, 5728, 708, 512].jpeg   NaN  NaN   GGS  NaN\n",
+      "19  glomerulus C1110748 [134904, 7652, 776, 692].jpeg   NaN  NaN   GGS  NaN\n",
+      "22  glomerulus C1110748 [136192, 55140, 788, 688]....   NaN  NaN   GGS  NaN\n",
+      "25  glomerulus C1110748 [145592, 41936, 740, 640]....   NaN  NaN   GGS  NaN\n",
+      "40  glomerulus C1110748 [154628, 24972, 804, 684]....   NaN  NaN   GGS  NaN\n",
+      "41  glomerulus C1110748 [155592, 25764, 648, 612]....   NaN  NaN   GGS  NaN\n",
+      "46  glomerulus C1110748 [156748, 71428, 812, 692]....   NaN  NaN   GGS  NaN\n",
+      "48  glomerulus C1110748 [157812, 72180, 600, 536]....   NaN  NaN   GGS  NaN\n",
+      "36  glomerulus C1112141 [78580, 16560, 656, 788].jpeg   NaN  NaN   GGS  NaN\n",
+      "43  glomerulus C1112141 [82724, 17252, 860, 808].jpeg   NaN  NaN   GGS  NaN\n",
+      "46  glomerulus C1112141 [83852, 19840, 884, 944].jpeg  yesM  NaN  NoGS  noC\n",
+      "48  glomerulus C1112141 [86140, 60432, 720, 776].jpeg   NaN  NaN   GGS  NaN\n",
+      "50  glomerulus C1112141 [87964, 20760, 672, 732].jpeg   NaN  NaN   GGS  NaN\n",
+      "55  glomerulus C1112141 [90196, 61504, 848, 804].jpeg   NaN  NaN   GGS  NaN\n",
+      "58  glomerulus C1112141 [95092, 65612, 680, 668].jpeg   NaN  NaN   GGS  NaN\n",
+      "4   glomerulus C1105798 [118952, 9668, 980, 896].jpeg   NaN  NaN   GGS  NaN\n",
+      "6   glomerulus C1105798 [120488, 15428, 684, 516]....   NaN  NaN   GGS  NaN\n",
+      "14  glomerulus C1105798 [129104, 54064, 708, 576]....   NaN  NaN   GGS  NaN\n",
+      "54  glomerulus C1105798 [76196, 61668, 740, 968].jpeg   NaN  NaN   GGS  NaN\n",
+      "28  glomerulus C1117893 [26068, 32092, 724, 708].jpeg   NaN  NaN   GGS  NaN\n",
+      "32  glomerulus C1117893 [31252, 77564, 700, 696].jpeg   NaN  NaN   GGS  NaN\n",
+      "33  glomerulus C1117893 [65224, 17120, 528, 544].jpeg   NaN  NaN   GGS  NaN\n",
+      "11  glomerulus C1107892 [126480, 27244, 588, 564]....   NaN  NaN   GGS  NaN\n",
+      "43  glomerulus C1107892 [75916, 26668, 564, 572].jpeg   NaN  NaN   GGS  NaN\n",
+      "44  glomerulus C1107892 [76200, 75040, 508, 576].jpeg   NaN  NaN   GGS  NaN\n",
+      "48  glomerulus C1107892 [77772, 25272, 740, 760].jpeg   NaN  NaN   GGS  NaN\n",
+      "49  glomerulus C1107892 [77980, 73584, 732, 724].jpeg   NaN  NaN   GGS  NaN\n",
+      "55  glomerulus C1107892 [80568, 69696, 616, 644].jpeg   NaN  NaN   GGS  NaN\n",
+      "56  glomerulus C1107892 [80608, 21544, 624, 660].jpeg   NaN  NaN   GGS  NaN\n",
+      "11  glomerulus C1105642 [136108, 72452, 612, 532]....   NaN  NaN   GGS  NaN\n",
+      "12  glomerulus C1105642 [136892, 73056, 596, 540]....   NaN  NaN   GGS  NaN\n",
+      "13  glomerulus C1105642 [137860, 71816, 640, 728]....   NaN  NaN   GGS  NaN\n",
+      "18  glomerulus C1105642 [140788, 20956, 616, 548]....   NaN  NaN   GGS  NaN\n",
+      "19  glomerulus C1105642 [141656, 21460, 620, 576]....   NaN  NaN   GGS  NaN\n",
+      "20  glomerulus C1105642 [142460, 20320, 540, 512]....   NaN  NaN   GGS  NaN\n",
+      "22  glomerulus C1105642 [14640, 21940, 524, 584].jpeg   NaN  NaN   GGS  NaN\n",
+      "29  glomerulus C1105642 [64876, 12060, 596, 648].jpeg   NaN  NaN   GGS  NaN\n",
+      "33  glomerulus C1105642 [67600, 62876, 656, 680].jpeg   NaN  NaN   GGS  NaN\n",
+      "35  glomerulus C1105642 [68388, 15580, 644, 604].jpeg   NaN  NaN   GGS  NaN\n",
+      "40  glomerulus C1105642 [70972, 66596, 652, 628].jpeg   NaN  NaN   GGS  NaN\n",
+      "41  glomerulus C1105642 [71324, 17312, 560, 556].jpeg   NaN  NaN   GGS  NaN\n",
+      "46  glomerulus C1105642 [72752, 20572, 620, 524].jpeg   NaN  NaN   GGS  NaN\n",
+      "47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg   NaN  NaN   GGS  NaN\n",
+      "49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg   NaN  NaN   GGS  NaN\n",
+      "50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg   NaN  NaN   GGS  NaN\n",
+      "51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg   NaN  NaN   GGS  NaN\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Rows in which at least one column is missing\n",
+    "has_missing_value = df_combined.isnull().any(axis=1)\n",
+    "nan_rows = df_combined[has_missing_value]\n",
+    "print(nan_rows)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Patch names</th>\n",
+       "      <th>M</th>\n",
+       "      <th>E</th>\n",
+       "      <th>S</th>\n",
+       "      <th>C</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>glomerulus C1107752 [130360, 32956, 1020, 1008...</td>\n",
+       "      <td>yesM</td>\n",
+       "      <td>yesE</td>\n",
+       "      <td>NoGS</td>\n",
+       "      <td>yesC</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>glomerulus C1107752 [135308, 69504, 1012, 1004...</td>\n",
+       "      <td>yesM</td>\n",
+       "      <td>noE</td>\n",
+       "      <td>NoGS</td>\n",
+       "      <td>yesC</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>glomerulus C1107752 [137584, 31764, 836, 872]....</td>\n",
+       "      <td>yesM</td>\n",
+       "      <td>noE</td>\n",
+       "      <td>NoGS</td>\n",
+       "      <td>yesC</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39</th>\n",
+       "      <td>glomerulus C1107752 [87436, 35528, 724, 844].jpeg</td>\n",
+       "      <td>yesM</td>\n",
+       "      <td>noE</td>\n",
+       "      <td>NoGS</td>\n",
+       "      <td>yesC</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>glomerulus C1105642 [120200, 56808, 1304, 1140...</td>\n",
+       "      <td>yesM</td>\n",
+       "      <td>noE</td>\n",
+       "      <td>SGS</td>\n",
+       "      <td>yesC</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                          Patch names     M     E     S     C\n",
+       "1   glomerulus C1107752 [130360, 32956, 1020, 1008...  yesM  yesE  NoGS  yesC\n",
+       "6   glomerulus C1107752 [135308, 69504, 1012, 1004...  yesM   noE  NoGS  yesC\n",
+       "10  glomerulus C1107752 [137584, 31764, 836, 872]....  yesM   noE  NoGS  yesC\n",
+       "39  glomerulus C1107752 [87436, 35528, 724, 844].jpeg  yesM   noE  NoGS  yesC\n",
+       "2   glomerulus C1105642 [120200, 56808, 1304, 1140...  yesM   noE   SGS  yesC"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Show the rows whose C column is yesC\n",
+    "is_yesC = df_combined[\"C\"] == \"yesC\"\n",
+    "yesC_rows = df_combined[is_yesC]\n",
+    "yesC_rows"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Split the patches into train and val sets\n",
+    "A separate test set should be added, but we did not have enough data, so the validation set is also used as the test set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Seed is -828\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WSI images have been split into train and val folders.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import random\n",
+    "import shutil\n",
+    "import sys\n",
+    "\n",
+    "# Set the path to the Crop-256 folder\n",
+    "crop256_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Crops\"\n",
+    "\n",
+    "# Set the path to the Data/Classification folder\n",
+    "dataset_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification\"\n",
+    "\n",
+    "# Set the train and val ratio\n",
+    "train_ratio = 0.7\n",
+    "val_ratio = 0.3  # informational: val receives every patch not taken by train\n",
+    "\n",
+    "# Create the train and val folders\n",
+    "train_folder = os.path.join(dataset_folder, \"train\")\n",
+    "val_folder = os.path.join(dataset_folder, \"val\")\n",
+    "os.makedirs(train_folder, exist_ok=True)\n",
+    "os.makedirs(val_folder, exist_ok=True)\n",
+    "\n",
+    "# If the train or val folder already has content, ask before wiping both\n",
+    "if os.listdir(train_folder) or os.listdir(val_folder):\n",
+    "    response = input(\"The train and val folders are not empty. Do you want to overwrite the folders? (yes/no): \")\n",
+    "    if response.lower() != \"yes\":\n",
+    "        print(\"Exiting the script.\")\n",
+    "        sys.exit()\n",
+    "    for folder in (train_folder, val_folder):\n",
+    "        # Remove the existing folder and create it again, empty\n",
+    "        shutil.rmtree(folder)\n",
+    "        os.makedirs(folder, exist_ok=True)\n",
+    "\n",
+    "# Get the WSI folders in the Crop-256 folder. Listings are sorted because\n",
+    "# os.listdir returns entries in arbitrary order, which would make the\n",
+    "# shuffle below differ between runs even with the same seed.\n",
+    "wsi_folders = sorted(\n",
+    "    wsi for wsi in os.listdir(crop256_folder)\n",
+    "    if os.path.isdir(os.path.join(crop256_folder, wsi))\n",
+    ")\n",
+    "\n",
+    "# Map every image file name to its source path. Images are copied straight\n",
+    "# from here to train/val, without staging them in the dataset folder.\n",
+    "src_paths = {}\n",
+    "for wsi in wsi_folders:\n",
+    "    for image in sorted(os.listdir(os.path.join(crop256_folder, wsi))):\n",
+    "        src_paths[image] = os.path.join(crop256_folder, wsi, image)\n",
+    "imgs = list(src_paths)\n",
+    "\n",
+    "# Fixed seed so that the split is the same on every run. -828 is the seed\n",
+    "# printed by the run whose output is saved in this notebook (the split may\n",
+    "# still differ from that run, whose listing order was not fixed).\n",
+    "seed = -828\n",
+    "print(f\"Seed is {seed}\")\n",
+    "random.seed(seed)\n",
+    "random.shuffle(imgs)\n",
+    "\n",
+    "# Split the image names into train and val sets\n",
+    "# NOTE(review): the split is done per patch, so patches of the same WSI can\n",
+    "# end up in both train and val - confirm this is intended.\n",
+    "train_size = int(train_ratio * len(imgs))\n",
+    "train_imgs = imgs[:train_size]\n",
+    "val_imgs = imgs[train_size:]\n",
+    "\n",
+    "# Copy the images to their split folder\n",
+    "for folder, images in ((train_folder, train_imgs), (val_folder, val_imgs)):\n",
+    "    for image in images:\n",
+    "        shutil.copy(src_paths[image], os.path.join(folder, image))\n",
+    "\n",
+    "print(\"WSI images have been split into train and val folders.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Sort the patches into their respective classes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the path to the train and val folders\n",
+    "train_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/train\"\n",
+    "val_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/val\"\n",
+    "\n",
+    "# Create new subdirectories for the labels in the train and val folders\n",
+    "for label in possible_labels:\n",
+    "    os.makedirs(os.path.join(train_folder, label), exist_ok=True)\n",
+    "    os.makedirs(os.path.join(val_folder, label), exist_ok=True)\n",
+    "\n",
+    "# Labeled patches that are in neither split folder\n",
+    "missing_patches = []\n",
+    "\n",
+    "# Iterate over the rows in the df_combined dataframe\n",
+    "for _, row in df_combined.iterrows():\n",
+    "    # Get the labels and the name of the current patch\n",
+    "    labels = row[[\"M\", \"E\", \"S\", \"C\"]]\n",
+    "    patch_name = row[\"Patch names\"]\n",
+    "\n",
+    "    # Find the split folder (train first, then val) that holds the patch\n",
+    "    for split_folder in (train_folder, val_folder):\n",
+    "        source_path = os.path.join(split_folder, patch_name)\n",
+    "        if os.path.isfile(source_path):\n",
+    "            break\n",
+    "    else:\n",
+    "        # Skip the patch: falling through here would reuse the source path of\n",
+    "        # the previous row and file that image under this row's labels\n",
+    "        missing_patches.append(patch_name)\n",
+    "        continue\n",
+    "\n",
+    "    # Copy the image into the folder of each of its labels, in the same split\n",
+    "    for label in labels:\n",
+    "        if label in possible_labels:\n",
+    "            dest_path = os.path.join(split_folder, label, patch_name)\n",
+    "            if not os.path.exists(dest_path):\n",
+    "                shutil.copy(source_path, dest_path)\n",
+    "\n",
+    "if missing_patches:\n",
+    "    print(f\"{len(missing_patches)} labeled patches were not found in train/val and were skipped.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove the files that sit directly inside train/ and val/;\n",
+    "# sub-directories and their content are left alone\n",
+    "for split_folder in (train_folder, val_folder):\n",
+    "    for entry in os.listdir(split_folder):\n",
+    "        entry_path = os.path.join(split_folder, entry)\n",
+    "        if os.path.isfile(entry_path):\n",
+    "            os.remove(entry_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Labels that belong to each type of lesion\n",
+    "lesion_folders = [\"M\", \"E\", \"S\", \"C\"]\n",
+    "lesion_labels_dict = {\n",
+    "    \"M\": [\"nan_label\", \"noM\", \"yesM\"],\n",
+    "    \"E\": [\"noE\", \"yesE\"],\n",
+    "    \"S\": [\"GGS\", \"NoGS\", \"SGS\"],\n",
+    "    \"C\": [\"noC\", \"yesC\"]\n",
+    "}\n",
+    "\n",
+    "# Create <dataset>/<lesion>/<step>/<label> for every combination\n",
+    "for lesion in lesion_folders:\n",
+    "    for step in [\"train\", \"val\"]:\n",
+    "        for label in lesion_labels_dict[lesion]:\n",
+    "            os.makedirs(os.path.join(dataset_folder, lesion, step, label), exist_ok=True)\n",
+    "\n",
+    "# Offer to empty every label folder that already contains something\n",
+    "for lesion in lesion_folders:\n",
+    "    for step in [\"train\", \"val\"]:\n",
+    "        for label in lesion_labels_dict[lesion]:\n",
+    "            label_folder = os.path.join(dataset_folder, lesion, step, label)\n",
+    "            if len(os.listdir(label_folder)) == 0:\n",
+    "                continue\n",
+    "            response = input(f\"The {lesion}/{step}/{label} folder is not empty. Do you want to empty the folder? (yes/no): \")\n",
+    "            if response.lower() == \"yes\":\n",
+    "                shutil.rmtree(label_folder)\n",
+    "                os.makedirs(label_folder, exist_ok=True)\n",
+    "\n",
+    "# Move the images from <dataset>/<step>/<label> to <dataset>/<lesion>/<step>/<label>,\n",
+    "# removing each source folder once it has been emptied\n",
+    "for lesion, lesion_labels in lesion_labels_dict.items():\n",
+    "    for step in [\"train\", \"val\"]:\n",
+    "        for label in lesion_labels:\n",
+    "            source_folder = os.path.join(dataset_folder, step, label)\n",
+    "            destination_folder = os.path.join(dataset_folder, lesion, step, label)\n",
+    "            for image in os.listdir(source_folder):\n",
+    "                shutil.move(os.path.join(source_folder, image), os.path.join(destination_folder, image))\n",
+    "            os.rmdir(source_folder)\n",
+    "\n",
+    "# The split folders are expected to be empty by now; os.rmdir fails otherwise\n",
+    "os.rmdir(train_folder)\n",
+    "os.rmdir(val_folder)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "M: 416 images\n",
+      "E: 414 images\n",
+      "S: 465 images\n",
+      "C: 415 images\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Count the images of each lesion type, over both steps and all its labels\n",
+    "for lesion in lesion_folders:\n",
+    "    num_images = sum(\n",
+    "        len(os.listdir(os.path.join(dataset_folder, lesion, step, label)))\n",
+    "        for step in [\"train\", \"val\"]\n",
+    "        for label in lesion_labels_dict[lesion]\n",
+    "    )\n",
+    "    print(f\"{lesion}: {num_images} images\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "segmentation",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

model_training.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff