{ "cells": [ { "cell_type": "code", "execution_count": 53, "id": "d3a1f52b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "from dotenv import load_dotenv\n", "from pathlib import Path\n", "import json\n", "\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 54, "id": "bca20bcc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ImageIdEncodedPixelsHeightWidthClassIdAttributesIds
000000663ed1ff0c4e0132b9b9ac53f6e6068157 7 6073371 20 6078584 34 6083797 48 608...521436766115,136,143,154,230,295,316,317
100000663ed1ff0c4e0132b9b9ac53f6e6323163 11 6328356 32 6333549 53 6338742 75 63...521436760115,136,142,146,225,295,316,317
200000663ed1ff0c4e0132b9b9ac53f6e8521389 10 8526585 30 8531789 42 8537002 46 85...5214367628163
300000663ed1ff0c4e0132b9b9ac53f6e12903854 2 12909064 7 12914275 10 12919485 15 ...5214367631160,204
400000663ed1ff0c4e0132b9b9ac53f6e10837337 5 10842542 14 10847746 24 10852951 33...5214367632219
\n", "
" ], "text/plain": [ " ImageId \\\n", "0 00000663ed1ff0c4e0132b9b9ac53f6e \n", "1 00000663ed1ff0c4e0132b9b9ac53f6e \n", "2 00000663ed1ff0c4e0132b9b9ac53f6e \n", "3 00000663ed1ff0c4e0132b9b9ac53f6e \n", "4 00000663ed1ff0c4e0132b9b9ac53f6e \n", "\n", " EncodedPixels Height Width ClassId \\\n", "0 6068157 7 6073371 20 6078584 34 6083797 48 608... 5214 3676 6 \n", "1 6323163 11 6328356 32 6333549 53 6338742 75 63... 5214 3676 0 \n", "2 8521389 10 8526585 30 8531789 42 8537002 46 85... 5214 3676 28 \n", "3 12903854 2 12909064 7 12914275 10 12919485 15 ... 5214 3676 31 \n", "4 10837337 5 10842542 14 10847746 24 10852951 33... 5214 3676 32 \n", "\n", " AttributesIds \n", "0 115,136,143,154,230,295,316,317 \n", "1 115,136,142,146,225,295,316,317 \n", "2 163 \n", "3 160,204 \n", "4 219 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get project root (one level up from notebooks/ if running from notebooks directory)\n", "current_dir = Path.cwd()\n", "PROJECT_ROOT = current_dir.parent if current_dir.name == \"notebooks\" else current_dir\n", "DATA_PATH = PROJECT_ROOT / \"data\"\n", "\n", "fashion_df = pd.read_csv(DATA_PATH / \"train.csv\")\n", "fashion_df.head()\n" ] }, { "cell_type": "code", "execution_count": 55, "id": "58f4f7b4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamesupercategorylevel
00shirt, blouseupperbody2
11top, t-shirt, sweatshirtupperbody2
22sweaterupperbody2
33cardiganupperbody2
44jacketupperbody2
55vestupperbody2
66pantslowerbody2
77shortslowerbody2
88skirtlowerbody2
99coatwholebody2
1010dresswholebody2
1111jumpsuitwholebody2
1212capewholebody2
1313glasseshead2
1414hathead2
1515headband, head covering, hair accessoryhead2
1616tieneck2
1717glovearms and hands2
1818watcharms and hands2
1919beltwaist2
2020leg warmerlegs and feet2
2121tights, stockingslegs and feet2
2222socklegs and feet2
2323shoelegs and feet2
2424bag, walletothers2
2525scarfothers2
2626umbrellaothers2
2727hoodgarment parts2
2828collargarment parts2
2929lapelgarment parts2
3030epaulettegarment parts2
3131sleevegarment parts2
3232pocketgarment parts2
3333necklinegarment parts2
3434buckleclosures2
3535zipperclosures2
3636appliquedecorations2
3737beaddecorations2
3838bowdecorations2
3939flowerdecorations2
4040fringedecorations2
4141ribbondecorations2
4242rivetdecorations2
4343ruffledecorations2
4444sequindecorations2
4545tasseldecorations2
\n", "
" ], "text/plain": [ " id name supercategory level\n", "0 0 shirt, blouse upperbody 2\n", "1 1 top, t-shirt, sweatshirt upperbody 2\n", "2 2 sweater upperbody 2\n", "3 3 cardigan upperbody 2\n", "4 4 jacket upperbody 2\n", "5 5 vest upperbody 2\n", "6 6 pants lowerbody 2\n", "7 7 shorts lowerbody 2\n", "8 8 skirt lowerbody 2\n", "9 9 coat wholebody 2\n", "10 10 dress wholebody 2\n", "11 11 jumpsuit wholebody 2\n", "12 12 cape wholebody 2\n", "13 13 glasses head 2\n", "14 14 hat head 2\n", "15 15 headband, head covering, hair accessory head 2\n", "16 16 tie neck 2\n", "17 17 glove arms and hands 2\n", "18 18 watch arms and hands 2\n", "19 19 belt waist 2\n", "20 20 leg warmer legs and feet 2\n", "21 21 tights, stockings legs and feet 2\n", "22 22 sock legs and feet 2\n", "23 23 shoe legs and feet 2\n", "24 24 bag, wallet others 2\n", "25 25 scarf others 2\n", "26 26 umbrella others 2\n", "27 27 hood garment parts 2\n", "28 28 collar garment parts 2\n", "29 29 lapel garment parts 2\n", "30 30 epaulette garment parts 2\n", "31 31 sleeve garment parts 2\n", "32 32 pocket garment parts 2\n", "33 33 neckline garment parts 2\n", "34 34 buckle closures 2\n", "35 35 zipper closures 2\n", "36 36 applique decorations 2\n", "37 37 bead decorations 2\n", "38 38 bow decorations 2\n", "39 39 flower decorations 2\n", "40 40 fringe decorations 2\n", "41 41 ribbon decorations 2\n", "42 42 rivet decorations 2\n", "43 43 ruffle decorations 2\n", "44 44 sequin decorations 2\n", "45 45 tassel decorations 2" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label_descriptions = json.load(open(DATA_PATH / \"label_descriptions.json\"))\n", "\n", "categories_df = pd.DataFrame(label_descriptions[\"categories\"])\n", "categories_df" ] }, { "cell_type": "code", "execution_count": 56, "id": "48d7ab2b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamesupercategorylevel
00classic (t-shirt)nickname1
11polo (shirt)nickname1
22undershirtnickname1
33henley (shirt)nickname1
44ringer (t-shirt)nickname1
...............
289336peacockanimal2
290337zebraanimal2
291338giraffeanimal2
292339toile de jouytextile pattern1
293340planttextile pattern1
\n", "

294 rows × 4 columns

\n", "
" ], "text/plain": [ " id name supercategory level\n", "0 0 classic (t-shirt) nickname 1\n", "1 1 polo (shirt) nickname 1\n", "2 2 undershirt nickname 1\n", "3 3 henley (shirt) nickname 1\n", "4 4 ringer (t-shirt) nickname 1\n", ".. ... ... ... ...\n", "289 336 peacock animal 2\n", "290 337 zebra animal 2\n", "291 338 giraffe animal 2\n", "292 339 toile de jouy textile pattern 1\n", "293 340 plant textile pattern 1\n", "\n", "[294 rows x 4 columns]" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "attributes_df = pd.DataFrame(label_descriptions[\"attributes\"])\n", "attributes_df\n" ] }, { "cell_type": "code", "execution_count": 57, "id": "b178ce00", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 0 potentially SUGGESTIVE categories\n", "Found 4 potentially SUGGESTIVE attributes\n", "\n", "============================================================\n", "POTENTIALLY SUGGESTIVE CATEGORIES:\n", "============================================================\n", "Empty DataFrame\n", "Columns: [id, name, supercategory]\n", "Index: []\n", "\n", "============================================================\n", "POTENTIALLY SUGGESTIVE ATTRIBUTES:\n", "============================================================\n", " id name supercategory\n", " 51 booty (shorts) nickname\n", "106 bodycon (dress) nickname\n", "148 micro (length) length\n", "192 plunging (neckline) neckline type\n" ] } ], "source": [ "# ULTRA-RESTRICTIVE criteria for UNCERTAIN SUGGESTIVE content\n", "# Only items that are clearly and unambiguously SUGGESTIVE/revealing\n", "# Removed: crop tops, halter tops, tube tops, mini length, tight fit, etc. 
# ULTRA-RESTRICTIVE criteria for UNCERTAIN SUGGESTIVE content.
# Only items that are clearly and unambiguously SUGGESTIVE/revealing are kept.
# Removed: crop tops, halter tops, tube tops, mini length, tight fit, etc.
# - too many false positives.

# Keyword lists checked against the item name regardless of supercategory.
revealing_keywords = {
    'categories': [
        # No inherently revealing categories remain - all removed as
        # false-positive prone; 'booty'/'bodycon' are handled via attributes.
    ],
    'attributes': [
        # ONLY the most clearly SUGGESTIVE items.
        'booty (shorts)',   # very specific and clearly SUGGESTIVE
        'bodycon (dress)',  # form-fitting, often revealing
        # Removed: crop (top), halter (top), tube (top), camisole,
        # slip (dress) - too many false positives.
    ],
}

# Patterns that only count when the item belongs to the given supercategory.
# Single source of truth: the matcher below reads this dict generically
# (the original duplicated these values as inline hard-coded checks).
revealing_patterns = {
    'length': [
        # Removed: mini (length) - too broad, many modest mini skirts.
        'micro (length)',  # only very short - but still might have false positives
    ],
    'neckline type': [
        'plunging (neckline)',  # only the most revealing neckline
        # Removed: off-the-shoulder, one shoulder - can be modest.
    ],
    'silhouette': [
        # Removed: tight (fit) - way too broad, many normal clothes are tight.
    ],
    'nickname': [
        # Only the most clearly SUGGESTIVE.
        'booty (shorts)',
        'bodycon (dress)',
    ],
}


def is_potentially_SUGGESTIVE(name: str, supercategory: str = None) -> bool:
    """Check if a category or attribute name suggests potentially SUGGESTIVE content.

    ULTRA-RESTRICTIVE: only matches clearly SUGGESTIVE items to avoid false
    positives. Supercategory-scoped patterns come from ``revealing_patterns``;
    the global keyword lists are checked afterwards.
    """
    name_lower = name.lower()

    # Special case: 'booty' is specific enough to match in any context.
    if 'booty' in name_lower:
        return True

    # Supercategory-scoped patterns (length / neckline type / nickname / ...).
    # Generic lookup replaces the previous per-supercategory if/elif chain,
    # which duplicated the dict contents inline.
    if supercategory:
        for pattern in revealing_patterns.get(supercategory, []):
            if pattern.lower() in name_lower:
                return True

    # Supercategory-independent keyword lists.
    for keyword in revealing_keywords['categories'] + revealing_keywords['attributes']:
        if keyword.lower() in name_lower:
            return True

    return False


# Filter categories. Supercategory is now passed too, for consistency with the
# attribute check (category supercategories never appear in revealing_patterns,
# so the result is unchanged).
SUGGESTIVE_categories = categories_df[
    categories_df.apply(
        lambda row: is_potentially_SUGGESTIVE(row['name'], row['supercategory']),
        axis=1,
    )
].copy()

# Filter attributes.
SUGGESTIVE_attributes = attributes_df[
    attributes_df.apply(
        lambda row: is_potentially_SUGGESTIVE(row['name'], row['supercategory']),
        axis=1,
    )
].copy()

print(f"Found {len(SUGGESTIVE_categories)} potentially SUGGESTIVE categories")
print(f"Found {len(SUGGESTIVE_attributes)} potentially SUGGESTIVE attributes")
print("\n" + "="*60)
print("POTENTIALLY SUGGESTIVE CATEGORIES:")
print("="*60)
print(SUGGESTIVE_categories[['id', 'name', 'supercategory']].to_string(index=False))
print("\n" + "="*60)
print("POTENTIALLY SUGGESTIVE ATTRIBUTES:")
print("="*60)
print(SUGGESTIVE_attributes[['id', 'name', 'supercategory']].to_string(index=False))
"Attributes by supercategory:\n", "supercategory\n", "nickname 2\n", "length 1\n", "neckline type 1\n", "dtype: int64\n", "\n", "Detailed attribute breakdown:\n", "\n", "nickname:\n", " - booty (shorts) (id: 51)\n", " - bodycon (dress) (id: 106)\n", "\n", "length:\n", " - micro (length) (id: 148)\n", "\n", "neckline type:\n", " - plunging (neckline) (id: 192)\n", "\n", "============================================================\n", "SUMMARY DATAFRAME (for export):\n", "============================================================\n", " type id name supercategory\n", "0 attribute 51 booty (shorts) nickname\n", "1 attribute 106 bodycon (dress) nickname\n", "2 attribute 148 micro (length) length\n", "3 attribute 192 plunging (neckline) neckline type\n" ] } ], "source": [ "# Create a detailed breakdown by supercategory\n", "print(\"=\"*60)\n", "print(\"BREAKDOWN BY SUPERCATEGORY:\")\n", "print(\"=\"*60)\n", "\n", "if len(SUGGESTIVE_attributes) > 0:\n", " print(\"\\nAttributes by supercategory:\")\n", " print(SUGGESTIVE_attributes.groupby('supercategory').size().sort_values(ascending=False))\n", " \n", " print(\"\\nDetailed attribute breakdown:\")\n", " for supercat in SUGGESTIVE_attributes['supercategory'].unique():\n", " print(f\"\\n{supercat}:\")\n", " subset = SUGGESTIVE_attributes[SUGGESTIVE_attributes['supercategory'] == supercat]\n", " for _, row in subset.iterrows():\n", " print(f\" - {row['name']} (id: {row['id']})\")\n", "\n", "# Create summary DataFrames for export\n", "SUGGESTIVE_summary = {\n", " 'type': ['category'] * len(SUGGESTIVE_categories) + ['attribute'] * len(SUGGESTIVE_attributes),\n", " 'id': list(SUGGESTIVE_categories['id']) + list(SUGGESTIVE_attributes['id']),\n", " 'name': list(SUGGESTIVE_categories['name']) + list(SUGGESTIVE_attributes['name']),\n", " 'supercategory': list(SUGGESTIVE_categories['supercategory']) + list(SUGGESTIVE_attributes['supercategory'])\n", "}\n", "\n", "SUGGESTIVE_summary_df = pd.DataFrame(SUGGESTIVE_summary)\n", 
"print(\"\\n\" + \"=\"*60)\n", "print(\"SUMMARY DATAFRAME (for export):\")\n", "print(\"=\"*60)\n", "print(SUGGESTIVE_summary_df)\n", "\n", "# Optionally save to CSV\n", "# SUGGESTIVE_summary_df.to_csv(DATA_PATH / \"SUGGESTIVE_labels.csv\", index=False)\n" ] }, { "cell_type": "code", "execution_count": 59, "id": "a46d616c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SUGGESTIVE category IDs: set()\n", "SUGGESTIVE attribute IDs: {192, 106, 51, 148}\n", "\n", "Total unique SUGGESTIVE category IDs: 0\n", "Total unique SUGGESTIVE attribute IDs: 4\n", "\n", "============================================================\n", "FILTERING RESULTS:\n", "============================================================\n", "Total images in fashion_df: 333401\n", "Images with SUGGESTIVE content: 5218\n", "Percentage: 1.57%\n", "\n", "Breakdown:\n", " - Matched by category only: 0\n", " - Matched by attribute only: 5218\n", " - Matched by both: 0\n", "\n", "============================================================\n", "SAMPLE OF SUGGESTIVE IMAGES (first 10 rows):\n", "============================================================\n", " ImageId ClassId \\\n", "49 000b3a87508b0fa185fbd53ecbe2e4c6 33 \n", "147 001a66b16b12f12dc45e2bba40e04683 10 \n", "180 00211c06b1fe730097dde122cd4d3f8c 7 \n", "304 003ae3da258f7ba7267af5f159dd3502 10 \n", "369 0048f6c47de85cc4dc263912bd0ff6f5 33 \n", "372 0048f6c47de85cc4dc263912bd0ff6f5 7 \n", "445 005380bd939eb68085af3f804d387824 10 \n", "456 0054564ae183ad9a1b152eef0bc11e1d 10 \n", "465 0055347a114b215f8f469fec9e38c272 10 \n", "526 005e9b75edcee7d655c390ea5416641d 33 \n", "\n", " AttributesIds \n", "49 192 \n", "147 106,115,127,142,149,229,295,316 \n", "180 50,115,136,142,148,230,295,300,317 \n", "304 106,127,141,150,295,316,317 \n", "369 192 \n", "372 50,115,136,142,148,317 \n", "445 106,114,127,142,150,229,295,311,317 \n", "456 106,115,127,142,149,229,295,316,317 \n", "465 
# Get IDs of SUGGESTIVE categories and attributes
SUGGESTIVE_category_ids = set(SUGGESTIVE_categories['id'].tolist())
SUGGESTIVE_attribute_ids = set(SUGGESTIVE_attributes['id'].tolist())

print(f"SUGGESTIVE category IDs: {SUGGESTIVE_category_ids}")
print(f"SUGGESTIVE attribute IDs: {SUGGESTIVE_attribute_ids}")
print(f"\nTotal unique SUGGESTIVE category IDs: {len(SUGGESTIVE_category_ids)}")
print(f"Total unique SUGGESTIVE attribute IDs: {len(SUGGESTIVE_attribute_ids)}")


def has_SUGGESTIVE_attribute(attributes_str: str) -> bool:
    """Check if the comma-separated attributes string contains any SUGGESTIVE attribute ID.

    Returns False for NaN/empty values and for strings that do not parse as
    comma-separated integers.
    """
    if pd.isna(attributes_str) or attributes_str == '':
        return False
    # Parse comma-separated string and convert to integers.
    try:
        attr_ids = {int(x.strip()) for x in str(attributes_str).split(',')}
    except (ValueError, AttributeError):
        return False
    # isdisjoint avoids materializing the intersection just to test truthiness.
    return not SUGGESTIVE_attribute_ids.isdisjoint(attr_ids)


# An annotation is SUGGESTIVE if:
# 1. Its ClassId matches a SUGGESTIVE category, OR
# 2. Its AttributesIds contains any SUGGESTIVE attribute ID
#
# Compute each mask exactly ONCE and reuse it below — the original re-ran the
# expensive .apply over every row three times (mask + breakdown counts).
category_mask = fashion_df['ClassId'].isin(SUGGESTIVE_category_ids)
attribute_mask = fashion_df['AttributesIds'].apply(has_SUGGESTIVE_attribute)

SUGGESTIVE_mask = category_mask | attribute_mask
SUGGESTIVE_fashion_df = fashion_df[SUGGESTIVE_mask].copy()

print("\n" + "="*60)
print("FILTERING RESULTS:")
print("="*60)
print(f"Total images in fashion_df: {len(fashion_df)}")
print(f"Images with SUGGESTIVE content: {len(SUGGESTIVE_fashion_df)}")
print(f"Percentage: {len(SUGGESTIVE_fashion_df) / len(fashion_df) * 100:.2f}%")

# Show breakdown by type of match (reusing the cached masks).
category_matches = category_mask.sum()
attribute_matches = attribute_mask.sum()
both_matches = (category_mask & attribute_mask).sum()

print(f"\nBreakdown:")
print(f"  - Matched by category only: {category_matches - both_matches}")
print(f"  - Matched by attribute only: {attribute_matches - both_matches}")
print(f"  - Matched by both: {both_matches}")

# Show sample of SUGGESTIVE images
print("\n" + "="*60)
print("SAMPLE OF SUGGESTIVE IMAGES (first 10 rows):")
print("="*60)
print(SUGGESTIVE_fashion_df[['ImageId', 'ClassId', 'AttributesIds']].head(10))
# DIAGNOSTIC: Show what's actually being matched
# This helps identify which attributes/categories are causing matches

print("="*60)
print("DIAGNOSTIC: BREAKDOWN OF MATCHES")
print("="*60)

# Map ids -> human-readable names for both label tables.
attr_id_to_name = dict(zip(attributes_df['id'], attributes_df['name']))
cat_id_to_name = dict(zip(categories_df['id'], categories_df['name']))

# 1) Category-level matches inside the filtered frame.
print("\n1. Matches by Category (ClassId):")
category_matches = SUGGESTIVE_fashion_df[SUGGESTIVE_fashion_df['ClassId'].isin(SUGGESTIVE_category_ids)]
if len(category_matches) > 0:
    cat_counts = category_matches['ClassId'].value_counts()
    for cat_id, count in cat_counts.items():
        cat_name = cat_id_to_name.get(cat_id, f"Unknown (id: {cat_id})")
        print(f"  - {cat_name} (id: {cat_id}): {count} matches")
else:
    print("   No category matches")

# 2) Attribute-level matches: count how often each SUGGESTIVE attribute id
#    appears among the filtered rows.
print("\n2. Matches by Attribute (AttributesIds):")
matching_attributes = {}
for idx, row in SUGGESTIVE_fashion_df.iterrows():
    if pd.notna(row['AttributesIds']) and row['AttributesIds'] != '':
        # Narrowed exception handling: only malformed integer lists are
        # skipped — the original bare `except: pass` would also have hidden
        # real bugs (KeyError, NameError, ...).
        try:
            attr_ids = [int(x.strip()) for x in str(row['AttributesIds']).split(',')]
        except (ValueError, AttributeError):
            continue
        matching_attr_ids = SUGGESTIVE_attribute_ids.intersection(set(attr_ids))
        for attr_id in matching_attr_ids:
            matching_attributes[attr_id] = matching_attributes.get(attr_id, 0) + 1

if matching_attributes:
    for attr_id, count in sorted(matching_attributes.items(), key=lambda x: x[1], reverse=True):
        attr_name = attr_id_to_name.get(attr_id, f"Unknown (id: {attr_id})")
        print(f"  - {attr_name} (id: {attr_id}): {count} matches")
else:
    print("   No attribute matches")

# 3) Small sample with the attributes that triggered each match.
print("\n3. Sample rows with their matched attributes/categories:")
print("   (First 5 rows showing ImageId, ClassId, and matched attributes)")
for idx, row in SUGGESTIVE_fashion_df.head(5).iterrows():
    print(f"\n  ImageId: {row['ImageId']}")
    print(f"  ClassId: {row['ClassId']} -> {cat_id_to_name.get(row['ClassId'], 'Unknown')}")
    if pd.notna(row['AttributesIds']) and row['AttributesIds'] != '':
        try:
            attr_ids = [int(x.strip()) for x in str(row['AttributesIds']).split(',')]
        except (ValueError, AttributeError):
            continue
        matching_attr_ids = SUGGESTIVE_attribute_ids.intersection(set(attr_ids))
        if matching_attr_ids:
            print(f"  Matched Attributes: {[attr_id_to_name.get(aid, f'id:{aid}') for aid in matching_attr_ids]}")
annotations\n", " - shirt, blouse (id: 0): 317 annotations\n", " - coat (id: 9): 215 annotations\n", " - skirt (id: 8): 123 annotations\n", " - cardigan (id: 3): 83 annotations\n", " - sweater (id: 2): 79 annotations\n" ] } ], "source": [ "# Get unique image IDs (since same image can have multiple annotations)\n", "unique_SUGGESTIVE_image_ids = SUGGESTIVE_fashion_df['ImageId'].unique()\n", "unique_total_image_ids = fashion_df['ImageId'].unique()\n", "\n", "print(\"=\"*60)\n", "print(\"UNIQUE IMAGE ANALYSIS:\")\n", "print(\"=\"*60)\n", "print(f\"Total unique images in dataset: {len(unique_total_image_ids)}\")\n", "print(f\"Unique images with SUGGESTIVE content: {len(unique_SUGGESTIVE_image_ids)}\")\n", "print(f\"Percentage of unique images: {len(unique_SUGGESTIVE_image_ids) / len(unique_total_image_ids) * 100:.2f}%\")\n", "\n", "# Count how many annotations per SUGGESTIVE image\n", "annotations_per_image = SUGGESTIVE_fashion_df.groupby('ImageId').size().sort_values(ascending=False)\n", "print(f\"\\nAverage annotations per SUGGESTIVE image: {annotations_per_image.mean():.2f}\")\n", "print(f\"Max annotations for a single image: {annotations_per_image.max()}\")\n", "print(f\"Min annotations for a single image: {annotations_per_image.min()}\")\n", "\n", "# Show distribution of SUGGESTIVE categories in the filtered data\n", "print(\"\\n\" + \"=\"*60)\n", "print(\"DISTRIBUTION OF SUGGESTIVE CATEGORIES IN FILTERED DATA:\")\n", "print(\"=\"*60)\n", "category_counts = SUGGESTIVE_fashion_df['ClassId'].value_counts()\n", "print(category_counts)\n", "\n", "# Map category IDs to names for better readability\n", "category_id_to_name = dict(zip(categories_df['id'], categories_df['name']))\n", "print(\"\\nTop SUGGESTIVE categories by count:\")\n", "for cat_id, count in category_counts.head(10).items():\n", " cat_name = category_id_to_name.get(cat_id, f\"Unknown (id: {cat_id})\")\n", " print(f\" - {cat_name} (id: {cat_id}): {count} annotations\")\n", "\n", "# Save the filtered 
DataFrame\n", "# SUGGESTIVE_fashion_df.to_csv(DATA_PATH / \"SUGGESTIVE_train.csv\", index=False)\n", "# pd.Series(unique_SUGGESTIVE_image_ids).to_csv(DATA_PATH / \"SUGGESTIVE_image_ids.csv\", index=False, header=['ImageId'])\n" ] }, { "cell_type": "code", "execution_count": 62, "id": "a4736cf0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "============================================================\n", "CREATING NEW DATASET\n", "============================================================\n", "Total unique SUGGESTIVE image IDs: 5079\n", "\n", "Created folder: /Users/youniss/Documents/GitHub/haram-police/data/new_dataset\n", "\n", "Copying images from /Users/youniss/Documents/GitHub/haram-police/data/train and /Users/youniss/Documents/GitHub/haram-police/data/test...\n", "\n", "✓ Successfully copied: 5079 images\n", "\n", "✓ Saved DataFrame to: /Users/youniss/Documents/GitHub/haram-police/data/SUGGESTIVE_fashion.csv\n", " Total rows: 5218\n", "\n", "============================================================\n", "SUMMARY:\n", "============================================================\n", " - Images folder: /Users/youniss/Documents/GitHub/haram-police/data/new_dataset\n", " - Images copied: 5079\n", " - CSV file: /Users/youniss/Documents/GitHub/haram-police/data/SUGGESTIVE_fashion.csv\n", " - CSV rows: 5218\n", " - Unique images: 5079\n" ] } ], "source": [ "# Create new_dataset folder and copy all SUGGESTIVE images from train and test\n", "import shutil\n", "\n", "# Get unique image IDs from SUGGESTIVE_fashion_df\n", "unique_SUGGESTIVE_image_ids = set(SUGGESTIVE_fashion_df['ImageId'].unique())\n", "\n", "print(\"=\"*60)\n", "print(\"CREATING NEW DATASET\")\n", "print(\"=\"*60)\n", "print(f\"Total unique SUGGESTIVE image IDs: {len(unique_SUGGESTIVE_image_ids)}\")\n", "\n", "# Create new_dataset folder\n", "NEW_DATASET_PATH = DATA_PATH / \"new_dataset\"\n", "NEW_DATASET_PATH.mkdir(exist_ok=True)\n", "print(f\"\\nCreated 
folder: {NEW_DATASET_PATH}\")\n", "\n", "# Paths to source folders\n", "TRAIN_IMAGE_PATH = DATA_PATH / \"train\"\n", "TEST_IMAGE_PATH = DATA_PATH / \"test\"\n", "\n", "# Copy images from train and test folders\n", "copied_count = 0\n", "not_found_count = 0\n", "not_found_ids = []\n", "\n", "print(f\"\\nCopying images from {TRAIN_IMAGE_PATH} and {TEST_IMAGE_PATH}...\")\n", "\n", "for image_id in unique_SUGGESTIVE_image_ids:\n", " image_filename = f\"{image_id}.jpg\"\n", " source_path = None\n", " \n", " # Try train folder first\n", " train_path = TRAIN_IMAGE_PATH / image_filename\n", " if train_path.exists():\n", " source_path = train_path\n", " else:\n", " # Try test folder\n", " test_path = TEST_IMAGE_PATH / image_filename\n", " if test_path.exists():\n", " source_path = test_path\n", " \n", " if source_path:\n", " dest_path = NEW_DATASET_PATH / image_filename\n", " shutil.copy2(source_path, dest_path)\n", " copied_count += 1\n", " else:\n", " not_found_count += 1\n", " not_found_ids.append(image_id)\n", "\n", "print(f\"\\n✓ Successfully copied: {copied_count} images\")\n", "if not_found_count > 0:\n", " print(f\"⚠ Not found: {not_found_count} images\")\n", " print(f\" First 10 missing IDs: {not_found_ids[:10]}\")\n", "\n", "# Save the SUGGESTIVE_fashion_df to CSV\n", "csv_path = DATA_PATH / \"SUGGESTIVE_fashion.csv\"\n", "SUGGESTIVE_fashion_df.to_csv(csv_path, index=False)\n", "print(f\"\\n✓ Saved DataFrame to: {csv_path}\")\n", "print(f\" Total rows: {len(SUGGESTIVE_fashion_df)}\")\n", "\n", "print(\"\\n\" + \"=\"*60)\n", "print(\"SUMMARY:\")\n", "print(\"=\"*60)\n", "print(f\" - Images folder: {NEW_DATASET_PATH}\")\n", "print(f\" - Images copied: {copied_count}\")\n", "print(f\" - CSV file: {csv_path}\")\n", "print(f\" - CSV rows: {len(SUGGESTIVE_fashion_df)}\")\n", "print(f\" - Unique images: {len(unique_SUGGESTIVE_image_ids)}\")\n" ] }, { "cell_type": "code", "execution_count": 63, "id": "a4bdc53c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ImageIdEncodedPixelsHeightWidthClassIdAttributesIds
49000b3a87508b0fa185fbd53ecbe2e4c6457283 2 458562 6 459841 9 461120 13 462400 15...128085233192
147001a66b16b12f12dc45e2bba40e0468364049 3 64548 10 65048 17 65548 23 65754 36 66...50037510106,115,127,142,149,229,295,316
18000211c06b1fe730097dde122cd4d3f8c296470 1 297469 3 298468 5 299467 8 300466 10 ...1000665750,115,136,142,148,230,295,300,317
304003ae3da258f7ba7267af5f159dd3502129565 3 130583 9 131602 14 132621 19 133641 2...102468310106,127,141,150,295,316,317
3690048f6c47de85cc4dc263912bd0ff6f54777361 1 4781320 3 4785279 5 4789239 7 479319...3960264033192
3720048f6c47de85cc4dc263912bd0ff6f53982550 2 3986509 8 3990469 13 3994429 18 3998...39602640750,115,136,142,148,317
445005380bd939eb68085af3f804d3878242317673 15 2320644 45 2323624 67 2326613 79 23...3000200110106,114,127,142,150,229,295,311,317
4560054564ae183ad9a1b152eef0bc11e1d195071 2 196093 5 197115 8 198134 13 199151 20...102468310106,115,127,142,149,229,295,316,317
4650055347a114b215f8f469fec9e38c272236337 20 237832 26 239327 33 240823 38 242320...1500100010106,115,127,142,149,229,295,316,317
526005e9b75edcee7d655c390ea5416641d480863 2 481943 3 483023 4 484102 6 485182 7 4...1080108033192
593006bb85ca0935680110f4ce67d88b4ee2461619 6 2463717 17 2465814 24 2467912 23 247...2096300010106,115,127,142,149,295,316,317
802009447b79fce7da1ee19a54401517cde23802163 9 23807451 27 23812746 37 23818049 40...53047952750,115,136,142,148,230,295,298,317
88400af8f65bb93f4131499dc9807129a241313044 41 1315994 123 1318943 207 1321893 289...30002000417,115,135,145,148,225,281,311,317
120100f7d06a8db722b86961d911fb9f1d9654821 5 55316 16 55811 27 56307 37 56802 48 57...50037510106,115,127,142,151,229,283,311
121700f843a44365248e179ad2a4897349131690956 5 1693572 11 1696188 15 1698804 18 170...2617150033192
126001098396b79639e29db8de146c2d0064959078 8 962065 22 965051 38 968037 54 971024 ...30002000750,148,234,295,316,317
1274010db49ecc226102e63815fcf56273191205717 31 1208132 69 1210571 82 1213011 94 12...2448244810112,115,119,145,148,229,295,306,323,325
13110116a12304c7f94686978f86100076f332646 31 33110 78 33587 109 34065 136 34542 15...49235410106,115,127,142,150,229,295,316,317
1356011afec5e443599a79261ece1a662043629287 14 631022 16 632757 18 634492 21 636227...17371157417,148,225,281,311,317
1371011c59f7c25d18027f4f9b2b1cffd44a168345 3 169142 7 169938 12 170735 17 171531 2...80080010101,115,129,145,148,289,301,317
\n", "
" ], "text/plain": [ " ImageId \\\n", "49 000b3a87508b0fa185fbd53ecbe2e4c6 \n", "147 001a66b16b12f12dc45e2bba40e04683 \n", "180 00211c06b1fe730097dde122cd4d3f8c \n", "304 003ae3da258f7ba7267af5f159dd3502 \n", "369 0048f6c47de85cc4dc263912bd0ff6f5 \n", "372 0048f6c47de85cc4dc263912bd0ff6f5 \n", "445 005380bd939eb68085af3f804d387824 \n", "456 0054564ae183ad9a1b152eef0bc11e1d \n", "465 0055347a114b215f8f469fec9e38c272 \n", "526 005e9b75edcee7d655c390ea5416641d \n", "593 006bb85ca0935680110f4ce67d88b4ee \n", "802 009447b79fce7da1ee19a54401517cde \n", "884 00af8f65bb93f4131499dc9807129a24 \n", "1201 00f7d06a8db722b86961d911fb9f1d96 \n", "1217 00f843a44365248e179ad2a489734913 \n", "1260 01098396b79639e29db8de146c2d0064 \n", "1274 010db49ecc226102e63815fcf5627319 \n", "1311 0116a12304c7f94686978f86100076f3 \n", "1356 011afec5e443599a79261ece1a662043 \n", "1371 011c59f7c25d18027f4f9b2b1cffd44a \n", "\n", " EncodedPixels Height Width \\\n", "49 457283 2 458562 6 459841 9 461120 13 462400 15... 1280 852 \n", "147 64049 3 64548 10 65048 17 65548 23 65754 36 66... 500 375 \n", "180 296470 1 297469 3 298468 5 299467 8 300466 10 ... 1000 665 \n", "304 129565 3 130583 9 131602 14 132621 19 133641 2... 1024 683 \n", "369 4777361 1 4781320 3 4785279 5 4789239 7 479319... 3960 2640 \n", "372 3982550 2 3986509 8 3990469 13 3994429 18 3998... 3960 2640 \n", "445 2317673 15 2320644 45 2323624 67 2326613 79 23... 3000 2001 \n", "456 195071 2 196093 5 197115 8 198134 13 199151 20... 1024 683 \n", "465 236337 20 237832 26 239327 33 240823 38 242320... 1500 1000 \n", "526 480863 2 481943 3 483023 4 484102 6 485182 7 4... 1080 1080 \n", "593 2461619 6 2463717 17 2465814 24 2467912 23 247... 2096 3000 \n", "802 23802163 9 23807451 27 23812746 37 23818049 40... 5304 7952 \n", "884 1313044 41 1315994 123 1318943 207 1321893 289... 3000 2000 \n", "1201 54821 5 55316 16 55811 27 56307 37 56802 48 57... 500 375 \n", "1217 1690956 5 1693572 11 1696188 15 1698804 18 170... 
2617 1500 \n", "1260 959078 8 962065 22 965051 38 968037 54 971024 ... 3000 2000 \n", "1274 1205717 31 1208132 69 1210571 82 1213011 94 12... 2448 2448 \n", "1311 32646 31 33110 78 33587 109 34065 136 34542 15... 492 354 \n", "1356 629287 14 631022 16 632757 18 634492 21 636227... 1737 1157 \n", "1371 168345 3 169142 7 169938 12 170735 17 171531 2... 800 800 \n", "\n", " ClassId AttributesIds \n", "49 33 192 \n", "147 10 106,115,127,142,149,229,295,316 \n", "180 7 50,115,136,142,148,230,295,300,317 \n", "304 10 106,127,141,150,295,316,317 \n", "369 33 192 \n", "372 7 50,115,136,142,148,317 \n", "445 10 106,114,127,142,150,229,295,311,317 \n", "456 10 106,115,127,142,149,229,295,316,317 \n", "465 10 106,115,127,142,149,229,295,316,317 \n", "526 33 192 \n", "593 10 106,115,127,142,149,295,316,317 \n", "802 7 50,115,136,142,148,230,295,298,317 \n", "884 4 17,115,135,145,148,225,281,311,317 \n", "1201 10 106,115,127,142,151,229,283,311 \n", "1217 33 192 \n", "1260 7 50,148,234,295,316,317 \n", "1274 10 112,115,119,145,148,229,295,306,323,325 \n", "1311 10 106,115,127,142,150,229,295,316,317 \n", "1356 4 17,148,225,281,311,317 \n", "1371 10 101,115,129,145,148,289,301,317 " ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "SUGGESTIVE_fashion_df.head(20)" ] }, { "cell_type": "code", "execution_count": 64, "id": "a9e2133d", "metadata": {}, "outputs": [], "source": [ "# Display the image belonging to the first row of SUGGESTIVE_fashion_df\n", "from PIL import Image\n", "\n", "# Get the first image ID from the SUGGESTIVE_fashion_df\n", "first_image_id = SUGGESTIVE_fashion_df.iloc[0]['ImageId']\n", "\n", "# Load from new_dataset/: the cell above copied every SUGGESTIVE image there,\n", "# whether it originated in data/train/ or data/test/ (data/train/ alone may miss it)\n", "image = Image.open(NEW_DATASET_PATH / f\"{first_image_id}.jpg\")\n", "\n", "# Bare last expression renders the image inline in the notebook\n", "# (image.show() would pop an external OS viewer and leave no output in the file)\n", "image" ] }, { "cell_type": "code", "execution_count": null, "id": "2d6351ab", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.5" } }, "nbformat": 4, "nbformat_minor": 5 }