{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# 🎓 Smart Study Planner — Notebook 1: Data Creation\n",
        "### AI for Big Data Management | Group Project\n",
        "\n",
        "This notebook scrapes quotes from quotes.toscrape.com and keeps the motivational ones, loads the real student dataset, cleans it, generates synthetic variables, creates synthetic student comments, and saves everything as CSV files ready for Notebook 2."
      ],
      "metadata": {
        "id": "zDj8RQ79Jhmf"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "id": "J1qr8Xb3JaYG",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "34a30072-720e-48f7-a8b8-860667bab690"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
            "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
            "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2026.1)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
          ]
        }
      ],
      "source": [
        "%pip install -q pandas numpy requests beautifulsoup4"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import random\n",
        "import warnings\n",
        "\n",
        "# Hide every warning category so cell output stays uncluttered.\n",
        "warnings.simplefilter(\"ignore\")\n",
        "\n",
        "# Seed both the stdlib RNG and NumPy's legacy global RNG with the same value.\n",
        "for seed_fn in (random.seed, np.random.seed):\n",
        "    seed_fn(2025)\n",
        "\n",
        "print(\"✅ Libraries loaded\")"
      ],
      "metadata": {
        "id": "0qli7ZjdJmUE",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "c050aa84-f7b1-4b69-adc2-c5d9643d9326"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "✅ Libraries loaded\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ============================================================\n",
        "# REAL-WORLD DATA: Web Scraping — Quotes to Scrape\n",
        "# Source: https://quotes.toscrape.com\n",
        "# Purpose: Collect motivational quotes to enrich student comments\n",
        "# ============================================================\n",
        "\n",
        "import requests\n",
        "from bs4 import BeautifulSoup\n",
        "import random\n",
        "\n",
        "# Re-seed the stdlib RNG with the same value as the setup cell. Nothing in this\n",
        "# cell draws from `random`; presumably kept so re-running the cell resets the\n",
        "# sequence for later cells — TODO confirm.\n",
        "random.seed(2025)\n",
        "\n",
        "def scrape_motivational_quotes(max_pages=5, timeout=10):\n",
        "    \"\"\"Scrape quotes from the paginated listing at quotes.toscrape.com.\n",
        "\n",
        "    Pages 1..max_pages are requested in order. Scraping stops at the first\n",
        "    page that cannot be fetched (network error, timeout, or a non-200\n",
        "    response); quotes collected up to that point are still returned.\n",
        "    No filtering happens here: every quote found on a page is returned.\n",
        "\n",
        "    Args:\n",
        "        max_pages: Number of listing pages to request, starting at page 1.\n",
        "        timeout: Seconds to wait for each HTTP response before giving up.\n",
        "\n",
        "    Returns:\n",
        "        A list of dicts with keys \"quote\" (str), \"author\" (str) and\n",
        "        \"tags\" (list of str), one per well-formed quote block.\n",
        "    \"\"\"\n",
        "    quotes = []\n",
        "    for page in range(1, max_pages + 1):\n",
        "        url = f\"https://quotes.toscrape.com/page/{page}/\"\n",
        "        try:\n",
        "            # Without a timeout a stalled connection would hang the notebook.\n",
        "            response = requests.get(url, timeout=timeout)\n",
        "        except requests.RequestException as exc:\n",
        "            print(f\"⚠️ Could not reach page {page}: {exc}\")\n",
        "            break\n",
        "        if response.status_code != 200:\n",
        "            print(f\"⚠️ Could not reach page {page}\")\n",
        "            break\n",
        "        soup = BeautifulSoup(response.text, \"html.parser\")\n",
        "        for q in soup.find_all(\"div\", class_=\"quote\"):\n",
        "            text_el = q.find(\"span\", class_=\"text\")\n",
        "            author_el = q.find(\"small\", class_=\"author\")\n",
        "            if text_el is None or author_el is None:\n",
        "                # Skip malformed blocks rather than failing on None.get_text().\n",
        "                continue\n",
        "            tags = [t.get_text(strip=True) for t in q.find_all(\"a\", class_=\"tag\")]\n",
        "            quotes.append({\n",
        "                \"quote\": text_el.get_text(strip=True),\n",
        "                \"author\": author_el.get_text(strip=True),\n",
        "                \"tags\": tags,\n",
        "            })\n",
        "    return quotes\n",
        "\n",
        "raw_quotes = scrape_motivational_quotes(max_pages=5)\n",
        "# Name the columns explicitly so an empty scrape (e.g. site unreachable) still\n",
        "# yields a frame with a \"tags\" column instead of raising KeyError below.\n",
        "df_quotes = pd.DataFrame(raw_quotes, columns=[\"quote\", \"author\", \"tags\"])\n",
        "\n",
        "# Keep only quotes that carry at least one of these tags (exact tag match)\n",
        "keywords = [\"life\", \"study\", \"success\", \"learning\", \"work\", \"knowledge\", \"education\", \"mind\", \"time\"]\n",
        "# Cast to bool so the column is a valid boolean mask even when there are no rows.\n",
        "df_quotes[\"is_motivational\"] = df_quotes[\"tags\"].apply(\n",
        "    lambda tags: any(k in tags for k in keywords)\n",
        ").astype(bool)\n",
        "df_motivational = df_quotes[df_quotes[\"is_motivational\"]].reset_index(drop=True)\n",
        "\n",
        "print(f\"✅ Total quotes scraped: {len(df_quotes)}\")\n",
        "print(f\"✅ Motivational quotes kept: {len(df_motivational)}\")\n",
        "df_motivational.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 242
        },
        "id": "rrLCaBYF8UZw",
        "outputId": "9bc6c999-b01f-4c26-b04c-a82c1e3c47e4"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "✅ Total quotes scraped: 50\n",
            "✅ Motivational quotes kept: 11\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                               quote           author  \\\n",
              "0  “There are only two ways to live your life. On...  Albert Einstein   \n",
              "1  “Try not to become a man of success. Rather be...  Albert Einstein   \n",
              "2  “It is better to be hated for what you are tha...       André Gide   \n",
              "3  “This life is what you make it. No matter what...   Marilyn Monroe   \n",
              "4  “I may not have gone where I intended to go, b...    Douglas Adams   \n",
              "\n",
              "                                                tags  is_motivational  \n",
              "0     [inspirational, life, live, miracle, miracles]             True  \n",
              "1                        [adulthood, success, value]             True  \n",
              "2                                       [life, love]             True  \n",
              "3  [friends, heartbreak, inspirational, life, lov...             True  \n",
              "4                                 [life, navigation]             True  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-eecd31c1-9ad8-4572-8b0c-7ec5f8f892e7\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>quote</th>\n",
              "      <th>author</th>\n",
              "      <th>tags</th>\n",
              "      <th>is_motivational</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>“There are only two ways to live your life. On...</td>\n",
              "      <td>Albert Einstein</td>\n",
              "      <td>[inspirational, life, live, miracle, miracles]</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>“Try not to become a man of success. Rather be...</td>\n",
              "      <td>Albert Einstein</td>\n",
              "      <td>[adulthood, success, value]</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>“It is better to be hated for what you are tha...</td>\n",
              "      <td>André Gide</td>\n",
              "      <td>[life, love]</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>“This life is what you make it. No matter what...</td>\n",
              "      <td>Marilyn Monroe</td>\n",
              "      <td>[friends, heartbreak, inspirational, life, lov...</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>“I may not have gone where I intended to go, b...</td>\n",
              "      <td>Douglas Adams</td>\n",
              "      <td>[life, navigation]</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-eecd31c1-9ad8-4572-8b0c-7ec5f8f892e7')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-eecd31c1-9ad8-4572-8b0c-7ec5f8f892e7 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-eecd31c1-9ad8-4572-8b0c-7ec5f8f892e7');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df_motivational",
              "summary": "{\n  \"name\": \"df_motivational\",\n  \"rows\": 11,\n  \"fields\": [\n    {\n      \"column\": \"quote\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 11,\n        \"samples\": [\n          \"\\u201cGood friends, good books, and a sleepy conscience: this is the ideal life.\\u201d\",\n          \"\\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\\u201d\",\n          \"\\u201cAny fool can know. The point is to understand.\\u201d\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"author\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"Albert Einstein\",\n          \"Andr\\u00e9 Gide\",\n          \"Allen Saunders\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"tags\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"is_motivational\",\n      \"properties\": {\n        \"dtype\": \"boolean\",\n        \"num_unique_values\": 1,\n        \"samples\": [\n          true\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Write the full scrape and the filtered subset to CSV, omitting the index.\n",
        "for frame, csv_name in (\n",
        "    (df_quotes, \"scraped_quotes.csv\"),\n",
        "    (df_motivational, \"scraped_motivational_quotes.csv\"),\n",
        "):\n",
        "    frame.to_csv(csv_name, index=False)\n",
        "print(\"✅ Scraped quotes saved!\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4eb2pnPZ8bPq",
        "outputId": "b8ea9db3-5359-4d30-e35b-917a4969e643"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "✅ Scraped quotes saved!\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Read the student dataset from the working directory.\n",
        "df = pd.read_csv(filepath_or_buffer=\"student_dataset.csv\")\n",
        "\n",
        "# Report (rows, columns), then preview the first five rows.\n",
        "print(f\"Dataset shape: {df.shape}\")\n",
        "df.head(n=5)"
      ],
      "metadata": {
        "id": "TQYv4EDbJmgi",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 244
        },
        "outputId": "214a531e-9bfe-47f7-a613-88c937df9e69"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Dataset shape: (6607, 11)\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \\\n",
              "0           23.5        84.2                  Low                 Low   \n",
              "1           19.7        63.6               Medium                High   \n",
              "2           24.4        98.4               Medium              Medium   \n",
              "3           29.6        88.6               Medium                High   \n",
              "4           19.1        91.5                 High              Medium   \n",
              "\n",
              "  Extracurricular_Activities  Sleep_Hours  Previous_Scores Motivation_Level  \\\n",
              "0                        Yes          7.4             50.3           Medium   \n",
              "1                        Yes          6.7             83.1              Low   \n",
              "2                        Yes          8.0             63.5           Medium   \n",
              "3                        Yes          6.3             57.5           Medium   \n",
              "4                        Yes          6.2             66.0           Medium   \n",
              "\n",
              "  Internet_Access  Tutoring_Sessions  Final_Exam_Score  \n",
              "0             Yes                  1              54.3  \n",
              "1             Yes                  2              59.4  \n",
              "2             Yes                  0              63.8  \n",
              "3             Yes                  2              61.0  \n",
              "4             Yes                  2              60.7  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-e808c188-ebdd-4e32-ac5d-1fa81f1c7cd6\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Hours_Studied</th>\n",
              "      <th>Attendance</th>\n",
              "      <th>Parental_Involvement</th>\n",
              "      <th>Access_to_Resources</th>\n",
              "      <th>Extracurricular_Activities</th>\n",
              "      <th>Sleep_Hours</th>\n",
              "      <th>Previous_Scores</th>\n",
              "      <th>Motivation_Level</th>\n",
              "      <th>Internet_Access</th>\n",
              "      <th>Tutoring_Sessions</th>\n",
              "      <th>Final_Exam_Score</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>23.5</td>\n",
              "      <td>84.2</td>\n",
              "      <td>Low</td>\n",
              "      <td>Low</td>\n",
              "      <td>Yes</td>\n",
              "      <td>7.4</td>\n",
              "      <td>50.3</td>\n",
              "      <td>Medium</td>\n",
              "      <td>Yes</td>\n",
              "      <td>1</td>\n",
              "      <td>54.3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>19.7</td>\n",
              "      <td>63.6</td>\n",
              "      <td>Medium</td>\n",
              "      <td>High</td>\n",
              "      <td>Yes</td>\n",
              "      <td>6.7</td>\n",
              "      <td>83.1</td>\n",
              "      <td>Low</td>\n",
              "      <td>Yes</td>\n",
              "      <td>2</td>\n",
              "      <td>59.4</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>24.4</td>\n",
              "      <td>98.4</td>\n",
              "      <td>Medium</td>\n",
              "      <td>Medium</td>\n",
              "      <td>Yes</td>\n",
              "      <td>8.0</td>\n",
              "      <td>63.5</td>\n",
              "      <td>Medium</td>\n",
              "      <td>Yes</td>\n",
              "      <td>0</td>\n",
              "      <td>63.8</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>29.6</td>\n",
              "      <td>88.6</td>\n",
              "      <td>Medium</td>\n",
              "      <td>High</td>\n",
              "      <td>Yes</td>\n",
              "      <td>6.3</td>\n",
              "      <td>57.5</td>\n",
              "      <td>Medium</td>\n",
              "      <td>Yes</td>\n",
              "      <td>2</td>\n",
              "      <td>61.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>19.1</td>\n",
              "      <td>91.5</td>\n",
              "      <td>High</td>\n",
              "      <td>Medium</td>\n",
              "      <td>Yes</td>\n",
              "      <td>6.2</td>\n",
              "      <td>66.0</td>\n",
              "      <td>Medium</td>\n",
              "      <td>Yes</td>\n",
              "      <td>2</td>\n",
              "      <td>60.7</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e808c188-ebdd-4e32-ac5d-1fa81f1c7cd6')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-e808c188-ebdd-4e32-ac5d-1fa81f1c7cd6 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-e808c188-ebdd-4e32-ac5d-1fa81f1c7cd6');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 6607,\n  \"fields\": [\n    {\n      \"column\": \"Hours_Studied\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 5.988729009131041,\n        \"min\": 1.1,\n        \"max\": 44.0,\n        \"num_unique_values\": 352,\n        \"samples\": [\n          35.9,\n          23.0,\n          28.6\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Attendance\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 11.54771705393973,\n        \"min\": 60.0,\n        \"max\": 100.0,\n        \"num_unique_values\": 401,\n        \"samples\": [\n          74.6,\n          87.9,\n          60.5\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Parental_Involvement\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"Low\",\n          \"Medium\",\n          \"High\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Access_to_Resources\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"Low\",\n          \"High\",\n          \"Medium\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Extracurricular_Activities\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 2,\n        \"samples\": [\n          \"No\",\n          \"Yes\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Sleep_Hours\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1.290241270992569,\n        \"min\": 4.0,\n        
\"max\": 10.0,\n        \"num_unique_values\": 61,\n        \"samples\": [\n          7.4,\n          4.8\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Previous_Scores\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 14.451962385719332,\n        \"min\": 50.0,\n        \"max\": 100.0,\n        \"num_unique_values\": 501,\n        \"samples\": [\n          65.9,\n          73.8\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Motivation_Level\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"Medium\",\n          \"Low\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Internet_Access\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 2,\n        \"samples\": [\n          \"No\",\n          \"Yes\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Tutoring_Sessions\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1,\n        \"min\": 0,\n        \"max\": 8,\n        \"num_unique_values\": 9,\n        \"samples\": [\n          8,\n          2\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Final_Exam_Score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 8.739400303272008,\n        \"min\": 29.7,\n        \"max\": 95.2,\n        \"num_unique_values\": 477,\n        \"samples\": [\n          79.1,\n          58.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# DataFrame.info() prints its report itself and returns None, so it is called\n",
        "# directly; wrapping it in print() added a stray \"None\" line to the output.\n",
        "df.info()\n",
        "# Per-column count of missing values.\n",
        "print(\"\\nMissing values:\\n\", df.isnull().sum())\n",
        "# describe() summarises the numeric columns only by default.\n",
        "print(\"\\nSummary statistics:\\n\", df.describe())"
      ],
      "metadata": {
        "id": "LpIp_ewBJmwo",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "43a86d15-11b5-4de8-afb2-85a4e5b7cad5"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 6607 entries, 0 to 6606\n",
            "Data columns (total 11 columns):\n",
            " #   Column                      Non-Null Count  Dtype  \n",
            "---  ------                      --------------  -----  \n",
            " 0   Hours_Studied               6607 non-null   float64\n",
            " 1   Attendance                  6607 non-null   float64\n",
            " 2   Parental_Involvement        6607 non-null   object \n",
            " 3   Access_to_Resources         6607 non-null   object \n",
            " 4   Extracurricular_Activities  6607 non-null   object \n",
            " 5   Sleep_Hours                 6607 non-null   float64\n",
            " 6   Previous_Scores             6607 non-null   float64\n",
            " 7   Motivation_Level            6607 non-null   object \n",
            " 8   Internet_Access             6607 non-null   object \n",
            " 9   Tutoring_Sessions           6607 non-null   int64  \n",
            " 10  Final_Exam_Score            6607 non-null   float64\n",
            "dtypes: float64(5), int64(1), object(5)\n",
            "memory usage: 567.9+ KB\n",
            "None\n",
            "\n",
            "Missing values:\n",
            " Hours_Studied                 0\n",
            "Attendance                    0\n",
            "Parental_Involvement          0\n",
            "Access_to_Resources           0\n",
            "Extracurricular_Activities    0\n",
            "Sleep_Hours                   0\n",
            "Previous_Scores               0\n",
            "Motivation_Level              0\n",
            "Internet_Access               0\n",
            "Tutoring_Sessions             0\n",
            "Final_Exam_Score              0\n",
            "dtype: int64\n",
            "\n",
            "Summary statistics:\n",
            "        Hours_Studied   Attendance  Sleep_Hours  Previous_Scores  \\\n",
            "count    6607.000000  6607.000000  6607.000000      6607.000000   \n",
            "mean       20.472726    79.979068     7.185273        74.716059   \n",
            "std         5.988729    11.547717     1.290241        14.451962   \n",
            "min         1.100000    60.000000     4.000000        50.000000   \n",
            "25%        16.500000    70.000000     6.300000        62.100000   \n",
            "50%        20.500000    79.800000     7.200000        74.600000   \n",
            "75%        24.500000    90.200000     8.100000        87.400000   \n",
            "max        44.000000   100.000000    10.000000       100.000000   \n",
            "\n",
            "       Tutoring_Sessions  Final_Exam_Score  \n",
            "count        6607.000000       6607.000000  \n",
            "mean            1.488119         60.400333  \n",
            "std             1.216175          8.739400  \n",
            "min             0.000000         29.700000  \n",
            "25%             1.000000         54.300000  \n",
            "50%             1.000000         60.400000  \n",
            "75%             2.000000         66.600000  \n",
            "max             8.000000         95.200000  \n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "df.columns = df.columns.str.lower().str.replace(\" \", \"_\")\n",
        "\n",
        "ordinal_map = {\"Low\": 0, \"Medium\": 1, \"High\": 2}\n",
        "\n",
        "df[\"motivation_level\"] = df[\"motivation_level\"].map(ordinal_map)\n",
        "df[\"parental_involvement\"] = df[\"parental_involvement\"].map(ordinal_map)\n",
        "df[\"access_to_resources\"] = df[\"access_to_resources\"].map(ordinal_map)\n",
        "\n",
        "df[\"internet_access\"] = df[\"internet_access\"].map({\"Yes\": 1, \"No\": 0})\n",
        "df[\"extracurricular_activities\"] = df[\"extracurricular_activities\"].map({\"Yes\": 1, \"No\": 0})\n",
        "\n",
        "print(\"Duplicates:\", df.duplicated().sum())\n",
        "print(\"Missing values:\\n\", df.isnull().sum())\n",
        "\n",
        "df.head()"
      ],
      "metadata": {
        "id": "5_FCq_J4Jm9P",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 476
        },
        "outputId": "85f421cd-2bab-4599-cb31-631498c30e3e"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Duplicates: 0\n",
            "Missing values:\n",
            " hours_studied                 0\n",
            "attendance                    0\n",
            "parental_involvement          0\n",
            "access_to_resources           0\n",
            "extracurricular_activities    0\n",
            "sleep_hours                   0\n",
            "previous_scores               0\n",
            "motivation_level              0\n",
            "internet_access               0\n",
            "tutoring_sessions             0\n",
            "final_exam_score              0\n",
            "dtype: int64\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   hours_studied  attendance  parental_involvement  access_to_resources  \\\n",
              "0           23.5        84.2                     0                    0   \n",
              "1           19.7        63.6                     1                    2   \n",
              "2           24.4        98.4                     1                    1   \n",
              "3           29.6        88.6                     1                    2   \n",
              "4           19.1        91.5                     2                    1   \n",
              "\n",
              "   extracurricular_activities  sleep_hours  previous_scores  motivation_level  \\\n",
              "0                           1          7.4             50.3                 1   \n",
              "1                           1          6.7             83.1                 0   \n",
              "2                           1          8.0             63.5                 1   \n",
              "3                           1          6.3             57.5                 1   \n",
              "4                           1          6.2             66.0                 1   \n",
              "\n",
              "   internet_access  tutoring_sessions  final_exam_score  \n",
              "0                1                  1              54.3  \n",
              "1                1                  2              59.4  \n",
              "2                1                  0              63.8  \n",
              "3                1                  2              61.0  \n",
              "4                1                  2              60.7  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-b5369bd4-3e33-4d88-9f4d-e28537fe14ed\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>hours_studied</th>\n",
              "      <th>attendance</th>\n",
              "      <th>parental_involvement</th>\n",
              "      <th>access_to_resources</th>\n",
              "      <th>extracurricular_activities</th>\n",
              "      <th>sleep_hours</th>\n",
              "      <th>previous_scores</th>\n",
              "      <th>motivation_level</th>\n",
              "      <th>internet_access</th>\n",
              "      <th>tutoring_sessions</th>\n",
              "      <th>final_exam_score</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>23.5</td>\n",
              "      <td>84.2</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>7.4</td>\n",
              "      <td>50.3</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>54.3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>19.7</td>\n",
              "      <td>63.6</td>\n",
              "      <td>1</td>\n",
              "      <td>2</td>\n",
              "      <td>1</td>\n",
              "      <td>6.7</td>\n",
              "      <td>83.1</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>2</td>\n",
              "      <td>59.4</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>24.4</td>\n",
              "      <td>98.4</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>8.0</td>\n",
              "      <td>63.5</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>63.8</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>29.6</td>\n",
              "      <td>88.6</td>\n",
              "      <td>1</td>\n",
              "      <td>2</td>\n",
              "      <td>1</td>\n",
              "      <td>6.3</td>\n",
              "      <td>57.5</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>2</td>\n",
              "      <td>61.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>19.1</td>\n",
              "      <td>91.5</td>\n",
              "      <td>2</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>6.2</td>\n",
              "      <td>66.0</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>2</td>\n",
              "      <td>60.7</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b5369bd4-3e33-4d88-9f4d-e28537fe14ed')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-b5369bd4-3e33-4d88-9f4d-e28537fe14ed button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-b5369bd4-3e33-4d88-9f4d-e28537fe14ed');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 6607,\n  \"fields\": [\n    {\n      \"column\": \"hours_studied\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 5.988729009131041,\n        \"min\": 1.1,\n        \"max\": 44.0,\n        \"num_unique_values\": 352,\n        \"samples\": [\n          35.9,\n          23.0,\n          28.6\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"attendance\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 11.54771705393973,\n        \"min\": 60.0,\n        \"max\": 100.0,\n        \"num_unique_values\": 401,\n        \"samples\": [\n          74.6,\n          87.9,\n          60.5\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"parental_involvement\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 0,\n        \"max\": 2,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          0,\n          1,\n          2\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"access_to_resources\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 0,\n        \"max\": 2,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          0,\n          2,\n          1\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"extracurricular_activities\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 0,\n        \"max\": 1,\n        \"num_unique_values\": 2,\n        \"samples\": [\n          0,\n          1\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"sleep_hours\",\n 
     \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1.290241270992569,\n        \"min\": 4.0,\n        \"max\": 10.0,\n        \"num_unique_values\": 61,\n        \"samples\": [\n          7.4,\n          4.8\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"previous_scores\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 14.451962385719332,\n        \"min\": 50.0,\n        \"max\": 100.0,\n        \"num_unique_values\": 501,\n        \"samples\": [\n          65.9,\n          73.8\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"motivation_level\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 0,\n        \"max\": 2,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          1,\n          0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"internet_access\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 0,\n        \"max\": 1,\n        \"num_unique_values\": 2,\n        \"samples\": [\n          0,\n          1\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"tutoring_sessions\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1,\n        \"min\": 0,\n        \"max\": 8,\n        \"num_unique_values\": 9,\n        \"samples\": [\n          8,\n          2\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"final_exam_score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 8.739400303272008,\n        \"min\": 29.7,\n        \"max\": 95.2,\n        \"num_unique_values\": 477,\n        \"samples\": [\n 
         79.1,\n          58.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "df[\"stress_score\"] = (10 - df[\"sleep_hours\"]) + (2 - df[\"motivation_level\"])\n",
        "df[\"estimated_study_hours\"] = 10 - (df[\"previous_scores\"] / 10)\n",
        "\n",
        "df[\"burnout_risk\"] = ((df[\"stress_score\"] > 7) & (df[\"hours_studied\"] < 4)).astype(int)\n",
        "\n",
        "df[\"procrastination_risk\"] = 1 - (df[\"hours_studied\"] / df[\"hours_studied\"].max())\n",
        "\n",
        "df[\"focus_level\"] = df[\"motivation_level\"] * (df[\"sleep_hours\"] / df[\"sleep_hours\"].max())\n",
        "\n",
        "df[\"risk_level\"] = df[\"final_exam_score\"].apply(lambda x: \"high\" if x < 60 else \"low\")\n",
        "\n",
        "print(\"✅ Synthetic variables added\")"
      ],
      "metadata": {
        "id": "0SJisroHJnO3",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "58d5b523-89a3-4442-9549-6eeaeb5db328"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "✅ Synthetic variables added\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Use scraped quotes to add a motivational message per student\n",
        "motivational_pool = df_motivational[\"quote\"].tolist()\n",
        "\n",
        "# Fallback in case scraping returned nothing\n",
        "if not motivational_pool:\n",
        "    motivational_pool = [\n",
        "        \"\\u201cThe secret of getting ahead is getting started.\\u201d — Mark Twain\",\n",
        "        \"\\u201cIt always seems impossible until it\\u2019s done.\\u201d — Nelson Mandela\"\n",
        "    ]\n",
        "\n",
        "df[\"motivational_quote\"] = [random.choice(motivational_pool) for _ in range(len(df))]\n",
        "print(\"✅ Motivational quotes assigned to students!\")\n",
        "df[[\"hours_studied\", \"stress_score\", \"risk_level\", \"motivational_quote\"]].head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 224
        },
        "id": "TKJ4JPF78q5v",
        "outputId": "1a4cfa57-c8e6-4fc8-f709-377814e6bc56"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "✅ Motivational quotes assigned to students!\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   hours_studied  stress_score risk_level  \\\n",
              "0           23.5           3.6       high   \n",
              "1           19.7           5.3       high   \n",
              "2           24.4           3.0        low   \n",
              "3           29.6           4.7        low   \n",
              "4           19.1           4.8        low   \n",
              "\n",
              "                                  motivational_quote  \n",
              "0  “The more that you read, the more things you w...  \n",
              "1  “Try not to become a man of success. Rather be...  \n",
              "2  “Life is like riding a bicycle. To keep your b...  \n",
              "3  “Today you are You, that is truer than true. T...  \n",
              "4  “It is better to be hated for what you are tha...  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-68ab98b9-08d7-4240-863a-2c8bfb25c92e\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>hours_studied</th>\n",
              "      <th>stress_score</th>\n",
              "      <th>risk_level</th>\n",
              "      <th>motivational_quote</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>23.5</td>\n",
              "      <td>3.6</td>\n",
              "      <td>high</td>\n",
              "      <td>“The more that you read, the more things you w...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>19.7</td>\n",
              "      <td>5.3</td>\n",
              "      <td>high</td>\n",
              "      <td>“Try not to become a man of success. Rather be...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>24.4</td>\n",
              "      <td>3.0</td>\n",
              "      <td>low</td>\n",
              "      <td>“Life is like riding a bicycle. To keep your b...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>29.6</td>\n",
              "      <td>4.7</td>\n",
              "      <td>low</td>\n",
              "      <td>“Today you are You, that is truer than true. T...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>19.1</td>\n",
              "      <td>4.8</td>\n",
              "      <td>low</td>\n",
              "      <td>“It is better to be hated for what you are tha...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-68ab98b9-08d7-4240-863a-2c8bfb25c92e')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-68ab98b9-08d7-4240-863a-2c8bfb25c92e button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-68ab98b9-08d7-4240-863a-2c8bfb25c92e');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"df[[\\\"hours_studied\\\", \\\"stress_score\\\", \\\"risk_level\\\", \\\"motivational_quote\\\"]]\",\n  \"rows\": 5,\n  \"fields\": [\n    {\n      \"column\": \"hours_studied\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 4.228829625321882,\n        \"min\": 19.1,\n        \"max\": 29.6,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          19.7,\n          19.1,\n          24.4\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"stress_score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.9471008394041259,\n        \"min\": 3.0,\n        \"max\": 5.3,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          5.3,\n          4.8,\n          3.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"risk_level\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 2,\n        \"samples\": [\n          \"low\",\n          \"high\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"motivational_quote\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"\\u201cTry not to become a man of success. Rather become a man of value.\\u201d\",\n          \"\\u201cIt is better to be hated for what you are than to be loved for what you are not.\\u201d\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 14
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "df.to_csv(\"student_dataset_enriched.csv\", index=False)\n",
        "print(\"Saved!\")"
      ],
      "metadata": {
        "id": "G2YEZjDWJuxv",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "20298b07-27da-4212-973b-706d42d0ba6c"
      },
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saved!\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# --- Comment templates ---\n",
        "# Three pools of canned student comments, one per stress band.\n",
        "# assign_comment() returns one randomly chosen entry from the pool matching a row's stress_score.\n",
        "\n",
        "# Pool used when stress_score > 7\n",
        "high_stress_comments = [\n",
        "    \"I feel completely overwhelmed this week. Two exams and three deadlines.\",\n",
        "    \"I have two deadlines tomorrow and I haven't started either of them.\",\n",
        "    \"I can't focus at all. I keep getting distracted and wasting time.\",\n",
        "    \"I'm exhausted. I've been studying non-stop and I'm barely keeping up.\",\n",
        "    \"I procrastinated all week. Now I have to cram everything in one night.\"\n",
        "]\n",
        "\n",
        "# Pool used when 4 < stress_score <= 7\n",
        "medium_stress_comments = [\n",
        "    \"It's been a busy week but I'm managing to keep up.\",\n",
        "    \"A bit stressed about the upcoming exam but still in control.\",\n",
        "    \"Some subjects are harder than others but I'm getting through it.\",\n",
        "    \"I could be more organised but things are mostly on track.\",\n",
        "    \"Feeling the pressure a little but nothing I can't handle.\"\n",
        "]\n",
        "\n",
        "# Pool used for every remaining row (stress_score <= 4, or a score that fails both comparisons, e.g. NaN)\n",
        "low_stress_comments = [\n",
        "    \"Today was productive. I finished all my assignments ahead of schedule.\",\n",
        "    \"I'm not too worried about my exams, I think I'm well prepared.\",\n",
        "    \"Everything is fine this week. Light workload and manageable deadlines.\",\n",
        "    \"I feel on top of things. Good energy and focus today.\",\n",
        "    \"Feeling motivated and ready.\"\n",
        "]\n",
        "\n",
        "# --- Function to assign comments ---\n",
        "# Dedicated generator with a fixed seed. It is rebuilt every time this cell runs,\n",
        "# so the generated comments no longer depend on the global `random` state\n",
        "# (i.e. on which cells were executed before, or how many times).\n",
        "_comment_rng = random.Random(2025)\n",
        "\n",
        "def assign_comment(row, rng=None):\n",
        "    \"\"\"Pick one canned comment for a student based on their stress score.\n",
        "\n",
        "    Args:\n",
        "        row: Mapping-like record (e.g. a DataFrame row) with a \"stress_score\" entry.\n",
        "        rng: Optional object exposing ``choice(seq)``. Defaults to the seeded\n",
        "            generator created alongside this function.\n",
        "\n",
        "    Returns:\n",
        "        A string drawn from high_stress_comments (score > 7),\n",
        "        medium_stress_comments (4 < score <= 7) or low_stress_comments (otherwise).\n",
        "        A NaN score fails both comparisons and therefore gets a low-stress comment.\n",
        "    \"\"\"\n",
        "    picker = _comment_rng if rng is None else rng\n",
        "    score = row[\"stress_score\"]\n",
        "    if score > 7:\n",
        "        pool = high_stress_comments\n",
        "    elif score > 4:\n",
        "        pool = medium_stress_comments\n",
        "    else:\n",
        "        pool = low_stress_comments\n",
        "    return picker.choice(pool)\n",
        "\n",
        "# --- Create sample of students ---\n",
        "# Fixed-seed draw of 200 rows; the index is rebuilt so it counts up from 0 in sampled order,\n",
        "# student_id is that index shifted to start at 1, and every row then receives one comment.\n",
        "df_sample = (\n",
        "    df.sample(n=200, random_state=2025)\n",
        "    .reset_index(drop=True)\n",
        "    .assign(\n",
        "        student_id=lambda frame: frame.index + 1,\n",
        "        comment=lambda frame: frame.apply(assign_comment, axis=1),\n",
        "    )\n",
        ")\n",
        "\n",
        "# Final dataset: an independent copy holding only the review columns\n",
        "df_reviews = df_sample.loc[\n",
        "    :, [\"student_id\", \"comment\", \"stress_score\", \"risk_level\"]\n",
        "].copy()\n",
        "\n",
        "print(\"✅ Comments generated!\")\n",
        "df_reviews.head()"
      ],
      "metadata": {
        "id": "CUPO07rgJu9H",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 224
        },
        "outputId": "e002f896-f068-4386-d60e-24179902f3e1"
      },
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "✅ Comments generated!\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   student_id                                            comment  \\\n",
              "0           1  It's been a busy week but I'm managing to keep...   \n",
              "1           2  I'm not too worried about my exams, I think I'...   \n",
              "2           3  Today was productive. I finished all my assign...   \n",
              "3           4  I could be more organised but things are mostl...   \n",
              "4           5  Everything is fine this week. Light workload a...   \n",
              "\n",
              "   stress_score risk_level  \n",
              "0           4.6        low  \n",
              "1           3.7        low  \n",
              "2           2.3       high  \n",
              "3           5.0       high  \n",
              "4           3.3        low  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-3b8a5fc3-1a57-4d01-b227-5047f12d10ce\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>student_id</th>\n",
              "      <th>comment</th>\n",
              "      <th>stress_score</th>\n",
              "      <th>risk_level</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>It's been a busy week but I'm managing to keep...</td>\n",
              "      <td>4.6</td>\n",
              "      <td>low</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>2</td>\n",
              "      <td>I'm not too worried about my exams, I think I'...</td>\n",
              "      <td>3.7</td>\n",
              "      <td>low</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3</td>\n",
              "      <td>Today was productive. I finished all my assign...</td>\n",
              "      <td>2.3</td>\n",
              "      <td>high</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>4</td>\n",
              "      <td>I could be more organised but things are mostl...</td>\n",
              "      <td>5.0</td>\n",
              "      <td>high</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>5</td>\n",
              "      <td>Everything is fine this week. Light workload a...</td>\n",
              "      <td>3.3</td>\n",
              "      <td>low</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-3b8a5fc3-1a57-4d01-b227-5047f12d10ce')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-3b8a5fc3-1a57-4d01-b227-5047f12d10ce button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-3b8a5fc3-1a57-4d01-b227-5047f12d10ce');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df_reviews",
              "summary": "{\n  \"name\": \"df_reviews\",\n  \"rows\": 200,\n  \"fields\": [\n    {\n      \"column\": \"student_id\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 57,\n        \"min\": 1,\n        \"max\": 200,\n        \"num_unique_values\": 200,\n        \"samples\": [\n          96,\n          16,\n          31\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"comment\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 13,\n        \"samples\": [\n          \"I feel completely overwhelmed this week. Two exams and three deadlines.\",\n          \"Feeling the pressure a little but nothing I can't handle.\",\n          \"It's been a busy week but I'm managing to keep up.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"stress_score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1.5300921245296766,\n        \"min\": 0.0,\n        \"max\": 8.0,\n        \"num_unique_values\": 65,\n        \"samples\": [\n          8.0,\n          7.0,\n          4.6\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"risk_level\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 2,\n        \"samples\": [\n          \"high\",\n          \"low\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Write the synthetic comments to disk; index=False keeps the row index out of the file\n",
        "df_reviews.to_csv(path_or_buf=\"synthetic_student_reviews.csv\", index=False)\n",
        "print(\"Saved!\")"
      ],
      "metadata": {
        "id": "zLUI4EZTJvIp",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "65162dd3-9414-48fe-9b00-9eb9c14a4c35"
      },
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saved!\n"
          ]
        }
      ]
    }
  ]
}