{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [
"# 🎓 Smart Study Planner — Notebook 1: Data Creation\n",
"### AI for Big Data Management | Group Project\n",
"\n",
"This notebook scrapes motivational quotes from quotes.toscrape.com, loads the real student dataset, cleans it, generates synthetic variables, assigns a scraped quote to each student, creates synthetic student comments, and saves everything as CSV files ready for Notebook 2."
], "metadata": { "id": "zDj8RQ79Jhmf" } }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "J1qr8Xb3JaYG", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "34a30072-720e-48f7-a8b8-860667bab690" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2026.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" ] } ], "source": [
"# requests and beautifulsoup4 are needed by the scraping cell below; %pip targets the kernel's own environment\n",
"%pip install pandas numpy requests beautifulsoup4"
] }, { "cell_type": "code", "source": [
"import pandas as pd\n",
"import numpy as np\n",
"import random\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"random.seed(2025)\n",
"np.random.seed(2025)\n",
"\n",
"print(\"✅ Libraries loaded\")"
], "metadata": { "id": "0qli7ZjdJmUE", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c050aa84-f7b1-4b69-adc2-c5d9643d9326" }, "execution_count": 7, 
"outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✅ Libraries loaded\n" ] } ] }, { "cell_type": "code", "source": [
"# ============================================================\n",
"# REAL-WORLD DATA: Web Scraping — Quotes to Scrape\n",
"# Source: https://quotes.toscrape.com\n",
"# Purpose: Collect motivational quotes to enrich student comments\n",
"# ============================================================\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import random\n",
"\n",
"random.seed(2025)\n",
"\n",
"def scrape_motivational_quotes(max_pages=5, timeout=10):\n",
"    \"\"\"Scrape quotes from quotes.toscrape.com, one listing page at a time.\n",
"\n",
"    Args:\n",
"        max_pages: Number of listing pages to request, starting at page 1.\n",
"        timeout: Seconds to wait for each HTTP response before giving up.\n",
"\n",
"    Returns:\n",
"        A list of dicts with the keys quote, author and tags (a list of tag strings).\n",
"        Scraping stops at the first page that cannot be fetched, so the list can be\n",
"        shorter than expected, or empty.\n",
"    \"\"\"\n",
"    quotes = []\n",
"    for page in range(1, max_pages + 1):\n",
"        url = f\"https://quotes.toscrape.com/page/{page}/\"\n",
"        try:\n",
"            response = requests.get(url, timeout=timeout)\n",
"        except requests.RequestException as exc:\n",
"            # Network failure: keep what was collected so far instead of crashing the notebook\n",
"            print(f\"⚠️ Could not reach page {page}: {exc}\")\n",
"            break\n",
"        if response.status_code != 200:\n",
"            print(f\"⚠️ Could not reach page {page}\")\n",
"            break\n",
"        soup = BeautifulSoup(response.text, \"html.parser\")\n",
"        for q in soup.find_all(\"div\", class_=\"quote\"):\n",
"            text = q.find(\"span\", class_=\"text\").get_text(strip=True)\n",
"            author = q.find(\"small\", class_=\"author\").get_text(strip=True)\n",
"            tags = [t.get_text(strip=True) for t in q.find_all(\"a\", class_=\"tag\")]\n",
"            quotes.append({\"quote\": text, \"author\": author, \"tags\": tags})\n",
"    return quotes\n",
"\n",
"raw_quotes = scrape_motivational_quotes(max_pages=5)\n",
"# Explicit columns keep the frame usable (it still has a tags column) when nothing was scraped\n",
"df_quotes = pd.DataFrame(raw_quotes, columns=[\"quote\", \"author\", \"tags\"])\n",
"\n",
"# Filter only motivational/relevant quotes\n",
"keywords = [\"life\", \"study\", \"success\", \"learning\", \"work\", \"knowledge\", \"education\", \"mind\", \"time\"]\n",
"# astype(bool) keeps the mask boolean for an empty frame, so the row filter below still works\n",
"df_quotes[\"is_motivational\"] = df_quotes[\"tags\"].apply(\n",
"    lambda tags: any(k in tags for k in keywords)\n",
").astype(bool)\n",
"df_motivational = df_quotes[df_quotes[\"is_motivational\"]].reset_index(drop=True)\n",
"\n",
"print(f\"✅ Total quotes scraped: {len(df_quotes)}\")\n",
"print(f\"✅ Motivational quotes kept: {len(df_motivational)}\")\n",
"df_motivational.head()"
], "metadata": { "colab": { "base_uri": 
"https://localhost:8080/", "height": 242 }, "id": "rrLCaBYF8UZw", "outputId": "9bc6c999-b01f-4c26-b04c-a82c1e3c47e4" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✅ Total quotes scraped: 50\n", "✅ Motivational quotes kept: 11\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " quote author \\\n", "0 “There are only two ways to live your life. On... Albert Einstein \n", "1 “Try not to become a man of success. Rather be... Albert Einstein \n", "2 “It is better to be hated for what you are tha... André Gide \n", "3 “This life is what you make it. No matter what... Marilyn Monroe \n", "4 “I may not have gone where I intended to go, b... Douglas Adams \n", "\n", " tags is_motivational \n", "0 [inspirational, life, live, miracle, miracles] True \n", "1 [adulthood, success, value] True \n", "2 [life, love] True \n", "3 [friends, heartbreak, inspirational, life, lov... True \n", "4 [life, navigation] True " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quoteauthortagsis_motivational
0“There are only two ways to live your life. On...Albert Einstein[inspirational, life, live, miracle, miracles]True
1“Try not to become a man of success. Rather be...Albert Einstein[adulthood, success, value]True
2“It is better to be hated for what you are tha...André Gide[life, love]True
3“This life is what you make it. No matter what...Marilyn Monroe[friends, heartbreak, inspirational, life, lov...True
4“I may not have gone where I intended to go, b...Douglas Adams[life, navigation]True
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_motivational", "summary": "{\n \"name\": \"df_motivational\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"quote\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 11,\n \"samples\": [\n \"\\u201cGood friends, good books, and a sleepy conscience: this is the ideal life.\\u201d\",\n \"\\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\\u201d\",\n \"\\u201cAny fool can know. The point is to understand.\\u201d\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"author\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"Albert Einstein\",\n \"Andr\\u00e9 Gide\",\n \"Allen Saunders\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tags\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"is_motivational\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 1,\n \"samples\": [\n true\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "code", "source": [
"# Save scraped quotes\n",
"df_quotes.to_csv(\"scraped_quotes.csv\", index=False)\n",
"df_motivational.to_csv(\"scraped_motivational_quotes.csv\", index=False)\n",
"print(\"✅ Scraped quotes saved!\")"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4eb2pnPZ8bPq", "outputId": "b8ea9db3-5359-4d30-e35b-917a4969e643" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✅ Scraped quotes saved!\n" ] } ] }, { "cell_type": "code", "source": [
"df = pd.read_csv(\"student_dataset.csv\")\n",
"\n",
"print(f\"Dataset shape: {df.shape}\")\n",
"df.head()"
], 
"metadata": { "id": "TQYv4EDbJmgi", "colab": { "base_uri": "https://localhost:8080/", "height": 244 }, "outputId": "214a531e-9bfe-47f7-a613-88c937df9e69" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Dataset shape: (6607, 11)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " Hours_Studied Attendance Parental_Involvement Access_to_Resources \\\n", "0 23.5 84.2 Low Low \n", "1 19.7 63.6 Medium High \n", "2 24.4 98.4 Medium Medium \n", "3 29.6 88.6 Medium High \n", "4 19.1 91.5 High Medium \n", "\n", " Extracurricular_Activities Sleep_Hours Previous_Scores Motivation_Level \\\n", "0 Yes 7.4 50.3 Medium \n", "1 Yes 6.7 83.1 Low \n", "2 Yes 8.0 63.5 Medium \n", "3 Yes 6.3 57.5 Medium \n", "4 Yes 6.2 66.0 Medium \n", "\n", " Internet_Access Tutoring_Sessions Final_Exam_Score \n", "0 Yes 1 54.3 \n", "1 Yes 2 59.4 \n", "2 Yes 0 63.8 \n", "3 Yes 2 61.0 \n", "4 Yes 2 60.7 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Hours_StudiedAttendanceParental_InvolvementAccess_to_ResourcesExtracurricular_ActivitiesSleep_HoursPrevious_ScoresMotivation_LevelInternet_AccessTutoring_SessionsFinal_Exam_Score
023.584.2LowLowYes7.450.3MediumYes154.3
119.763.6MediumHighYes6.783.1LowYes259.4
224.498.4MediumMediumYes8.063.5MediumYes063.8
329.688.6MediumHighYes6.357.5MediumYes261.0
419.191.5HighMediumYes6.266.0MediumYes260.7
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 6607,\n \"fields\": [\n {\n \"column\": \"Hours_Studied\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.988729009131041,\n \"min\": 1.1,\n \"max\": 44.0,\n \"num_unique_values\": 352,\n \"samples\": [\n 35.9,\n 23.0,\n 28.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Attendance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11.54771705393973,\n \"min\": 60.0,\n \"max\": 100.0,\n \"num_unique_values\": 401,\n \"samples\": [\n 74.6,\n 87.9,\n 60.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Parental_Involvement\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Low\",\n \"Medium\",\n \"High\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Access_to_Resources\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Low\",\n \"High\",\n \"Medium\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Extracurricular_Activities\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"No\",\n \"Yes\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sleep_Hours\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.290241270992569,\n \"min\": 4.0,\n \"max\": 10.0,\n \"num_unique_values\": 61,\n \"samples\": [\n 7.4,\n 4.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Previous_Scores\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.451962385719332,\n \"min\": 50.0,\n \"max\": 100.0,\n \"num_unique_values\": 501,\n \"samples\": [\n 65.9,\n 73.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"Motivation_Level\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Medium\",\n \"Low\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Internet_Access\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"No\",\n \"Yes\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Tutoring_Sessions\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 8,\n \"num_unique_values\": 9,\n \"samples\": [\n 8,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Final_Exam_Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.739400303272008,\n \"min\": 29.7,\n \"max\": 95.2,\n \"num_unique_values\": 477,\n \"samples\": [\n 79.1,\n 58.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [
"# df.info() prints its report itself and returns None, so it must not be wrapped in print()\n",
"df.info()\n",
"print(\"\\nMissing values:\\n\", df.isnull().sum())\n",
"print(\"\\nSummary statistics:\\n\", df.describe())"
], "metadata": { "id": "LpIp_ewBJmwo", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "43a86d15-11b5-4de8-afb2-85a4e5b7cad5" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 6607 entries, 0 to 6606\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Hours_Studied 6607 non-null float64\n", " 1 Attendance 6607 non-null float64\n", " 2 Parental_Involvement 6607 non-null object \n", " 3 Access_to_Resources 6607 non-null object \n", " 4 Extracurricular_Activities 6607 non-null object \n", " 5 Sleep_Hours 6607 non-null float64\n", " 6 Previous_Scores 6607 non-null float64\n", " 7 Motivation_Level 6607 non-null object \n", " 8 Internet_Access 6607 non-null object \n", " 9 
Tutoring_Sessions 6607 non-null int64 \n", " 10 Final_Exam_Score 6607 non-null float64\n", "dtypes: float64(5), int64(1), object(5)\n", "memory usage: 567.9+ KB\n", "None\n", "\n", "Missing values:\n", " Hours_Studied 0\n", "Attendance 0\n", "Parental_Involvement 0\n", "Access_to_Resources 0\n", "Extracurricular_Activities 0\n", "Sleep_Hours 0\n", "Previous_Scores 0\n", "Motivation_Level 0\n", "Internet_Access 0\n", "Tutoring_Sessions 0\n", "Final_Exam_Score 0\n", "dtype: int64\n", "\n", "Summary statistics:\n", " Hours_Studied Attendance Sleep_Hours Previous_Scores \\\n", "count 6607.000000 6607.000000 6607.000000 6607.000000 \n", "mean 20.472726 79.979068 7.185273 74.716059 \n", "std 5.988729 11.547717 1.290241 14.451962 \n", "min 1.100000 60.000000 4.000000 50.000000 \n", "25% 16.500000 70.000000 6.300000 62.100000 \n", "50% 20.500000 79.800000 7.200000 74.600000 \n", "75% 24.500000 90.200000 8.100000 87.400000 \n", "max 44.000000 100.000000 10.000000 100.000000 \n", "\n", " Tutoring_Sessions Final_Exam_Score \n", "count 6607.000000 6607.000000 \n", "mean 1.488119 60.400333 \n", "std 1.216175 8.739400 \n", "min 0.000000 29.700000 \n", "25% 1.000000 54.300000 \n", "50% 1.000000 60.400000 \n", "75% 2.000000 66.600000 \n", "max 8.000000 95.200000 \n" ] } ] }, { "cell_type": "code", "source": [
"df.columns = df.columns.str.lower().str.replace(\" \", \"_\")\n",
"\n",
"ordinal_map = {\"Low\": 0, \"Medium\": 1, \"High\": 2}\n",
"binary_map = {\"Yes\": 1, \"No\": 0}\n",
"\n",
"# Which label-to-number mapping applies to which categorical column\n",
"column_encodings = {\n",
"    \"motivation_level\": ordinal_map,\n",
"    \"parental_involvement\": ordinal_map,\n",
"    \"access_to_resources\": ordinal_map,\n",
"    \"internet_access\": binary_map,\n",
"    \"extracurricular_activities\": binary_map,\n",
"}\n",
"\n",
"for col, mapping in column_encodings.items():\n",
"    # Re-running this cell would otherwise map the already-encoded integers to NaN\n",
"    if pd.api.types.is_numeric_dtype(df[col]):\n",
"        continue\n",
"    encoded = df[col].map(mapping)\n",
"    # .map() silently turns labels missing from the mapping into NaN; fail loudly instead\n",
"    unmapped = encoded.isna() & df[col].notna()\n",
"    if unmapped.any():\n",
"        raise ValueError(f\"Unexpected values in '{col}': {df.loc[unmapped, col].unique().tolist()}\")\n",
"    df[col] = encoded\n",
"\n",
"print(\"Duplicates:\", df.duplicated().sum())\n",
"print(\"Missing values:\\n\", df.isnull().sum())\n",
"\n",
"df.head()"
], "metadata": { "id": "5_FCq_J4Jm9P", "colab": { "base_uri": "https://localhost:8080/", "height": 476 }, "outputId": "85f421cd-2bab-4599-cb31-631498c30e3e" }, "execution_count": 12, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Duplicates: 0\n", "Missing values:\n", " hours_studied 0\n", "attendance 0\n", "parental_involvement 0\n", "access_to_resources 0\n", "extracurricular_activities 0\n", "sleep_hours 0\n", "previous_scores 0\n", "motivation_level 0\n", "internet_access 0\n", "tutoring_sessions 0\n", "final_exam_score 0\n", "dtype: int64\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " hours_studied attendance parental_involvement access_to_resources \\\n", "0 23.5 84.2 0 0 \n", "1 19.7 63.6 1 2 \n", "2 24.4 98.4 1 1 \n", "3 29.6 88.6 1 2 \n", "4 19.1 91.5 2 1 \n", "\n", " extracurricular_activities sleep_hours previous_scores motivation_level \\\n", "0 1 7.4 50.3 1 \n", "1 1 6.7 83.1 0 \n", "2 1 8.0 63.5 1 \n", "3 1 6.3 57.5 1 \n", "4 1 6.2 66.0 1 \n", "\n", " internet_access tutoring_sessions final_exam_score \n", "0 1 1 54.3 \n", "1 1 2 59.4 \n", "2 1 0 63.8 \n", "3 1 2 61.0 \n", "4 1 2 60.7 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hours_studiedattendanceparental_involvementaccess_to_resourcesextracurricular_activitiessleep_hoursprevious_scoresmotivation_levelinternet_accesstutoring_sessionsfinal_exam_score
023.584.20017.450.311154.3
119.763.61216.783.101259.4
224.498.41118.063.511063.8
329.688.61216.357.511261.0
419.191.52116.266.011260.7
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 6607,\n \"fields\": [\n {\n \"column\": \"hours_studied\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.988729009131041,\n \"min\": 1.1,\n \"max\": 44.0,\n \"num_unique_values\": 352,\n \"samples\": [\n 35.9,\n 23.0,\n 28.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"attendance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11.54771705393973,\n \"min\": 60.0,\n \"max\": 100.0,\n \"num_unique_values\": 401,\n \"samples\": [\n 74.6,\n 87.9,\n 60.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"parental_involvement\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 2,\n \"num_unique_values\": 3,\n \"samples\": [\n 0,\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"access_to_resources\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 2,\n \"num_unique_values\": 3,\n \"samples\": [\n 0,\n 2,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"extracurricular_activities\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sleep_hours\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.290241270992569,\n \"min\": 4.0,\n \"max\": 10.0,\n \"num_unique_values\": 61,\n \"samples\": [\n 7.4,\n 4.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"previous_scores\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.451962385719332,\n \"min\": 50.0,\n \"max\": 100.0,\n \"num_unique_values\": 501,\n \"samples\": [\n 65.9,\n 73.8\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"motivation_level\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 2,\n \"num_unique_values\": 3,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"internet_access\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tutoring_sessions\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 8,\n \"num_unique_values\": 9,\n \"samples\": [\n 8,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"final_exam_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.739400303272008,\n \"min\": 29.7,\n \"max\": 95.2,\n \"num_unique_values\": 477,\n \"samples\": [\n 79.1,\n 58.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [
"# Stress grows with lost sleep (relative to 10 h) and with lower motivation (relative to the top level, 2)\n",
"sleep_deficit = 10 - df[\"sleep_hours\"]\n",
"motivation_gap = 2 - df[\"motivation_level\"]\n",
"df[\"stress_score\"] = sleep_deficit + motivation_gap\n",
"\n",
"df[\"estimated_study_hours\"] = 10 - (df[\"previous_scores\"] / 10)\n",
"\n",
"# NOTE(review): burnout is flagged for high stress combined with FEW study hours (< 4) - confirm this is intended\n",
"is_burnout = (df[\"stress_score\"] > 7) & (df[\"hours_studied\"] < 4)\n",
"df[\"burnout_risk\"] = is_burnout.astype(int)\n",
"\n",
"# Both ratios are scaled by the largest value observed in the dataset\n",
"max_hours_studied = df[\"hours_studied\"].max()\n",
"df[\"procrastination_risk\"] = 1 - (df[\"hours_studied\"] / max_hours_studied)\n",
"\n",
"max_sleep_hours = df[\"sleep_hours\"].max()\n",
"df[\"focus_level\"] = df[\"motivation_level\"] * (df[\"sleep_hours\"] / max_sleep_hours)\n",
"\n",
"def _label_risk(score):\n",
"    \"\"\"Return the risk label for one final exam score: high below 60, otherwise low.\"\"\"\n",
"    if score < 60:\n",
"        return \"high\"\n",
"    return \"low\"\n",
"\n",
"df[\"risk_level\"] = df[\"final_exam_score\"].apply(_label_risk)\n",
"\n",
"print(\"✅ Synthetic variables added\")"
], "metadata": { "id": "0SJisroHJnO3", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "58d5b523-89a3-4442-9549-6eeaeb5db328" }, "execution_count": 13, "outputs": [ { "output_type": 
"stream", "name": "stdout", "text": [ "✅ Synthetic variables added\n" ] } ] }, { "cell_type": "code", "source": [
"# Candidate quote texts come from the scraped, keyword-filtered quotes\n",
"motivational_pool = list(df_motivational[\"quote\"])\n",
"\n",
"# Two hard-coded quotes stand in when the scraped pool is empty\n",
"if len(motivational_pool) == 0:\n",
"    motivational_pool = [\n",
"        \"\\u201cThe secret of getting ahead is getting started.\\u201d — Mark Twain\",\n",
"        \"\\u201cIt always seems impossible until it\\u2019s done.\\u201d — Nelson Mandela\"\n",
"    ]\n",
"\n",
"# Draw one quote per student row, in row order\n",
"assigned_quotes = []\n",
"for _ in range(len(df)):\n",
"    assigned_quotes.append(random.choice(motivational_pool))\n",
"df[\"motivational_quote\"] = assigned_quotes\n",
"\n",
"print(\"✅ Motivational quotes assigned to students!\")\n",
"df[[\"hours_studied\", \"stress_score\", \"risk_level\", \"motivational_quote\"]].head()"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 224 }, "id": "TKJ4JPF78q5v", "outputId": "1a4cfa57-c8e6-4fc8-f709-377814e6bc56" }, "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✅ Motivational quotes assigned to students!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " hours_studied stress_score risk_level \\\n", "0 23.5 3.6 high \n", "1 19.7 5.3 high \n", "2 24.4 3.0 low \n", "3 29.6 4.7 low \n", "4 19.1 4.8 low \n", "\n", " motivational_quote \n", "0 “The more that you read, the more things you w... \n", "1 “Try not to become a man of success. Rather be... \n", "2 “Life is like riding a bicycle. To keep your b... \n", "3 “Today you are You, that is truer than true. T... \n", "4 “It is better to be hated for what you are tha... " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hours_studiedstress_scorerisk_levelmotivational_quote
023.53.6high“The more that you read, the more things you w...
119.75.3high“Try not to become a man of success. Rather be...
224.43.0low“Life is like riding a bicycle. To keep your b...
329.64.7low“Today you are You, that is truer than true. T...
419.14.8low“It is better to be hated for what you are tha...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"df[[\\\"hours_studied\\\", \\\"stress_score\\\", \\\"risk_level\\\", \\\"motivational_quote\\\"]]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"hours_studied\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4.228829625321882,\n \"min\": 19.1,\n \"max\": 29.6,\n \"num_unique_values\": 5,\n \"samples\": [\n 19.7,\n 19.1,\n 24.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"stress_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.9471008394041259,\n \"min\": 3.0,\n \"max\": 5.3,\n \"num_unique_values\": 5,\n \"samples\": [\n 5.3,\n 4.8,\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"risk_level\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"low\",\n \"high\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"motivational_quote\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\u201cTry not to become a man of success. Rather become a man of value.\\u201d\",\n \"\\u201cIt is better to be hated for what you are than to be loved for what you are not.\\u201d\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [
"df.to_csv(\"student_dataset_enriched.csv\", index=False)\n",
"print(\"Saved!\")"
], "metadata": { "id": "G2YEZjDWJuxv", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "20298b07-27da-4212-973b-706d42d0ba6c" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved!\n" ] } ] }, { "cell_type": "code", "source": [
"# --- Comment templates ---\n",
"high_stress_comments = [\n",
"    \"I feel completely overwhelmed this week. Two exams and three deadlines.\",\n",
"    \"I have two deadlines tomorrow and I haven't started either of them.\",\n",
"    \"I can't focus at all. I keep getting distracted and wasting time.\",\n",
"    \"I'm exhausted. I've been studying non-stop and I'm barely keeping up.\",\n",
"    \"I procrastinated all week. Now I have to cram everything in one night.\"\n",
"]\n",
"\n",
"medium_stress_comments = [\n",
"    \"It's been a busy week but I'm managing to keep up.\",\n",
"    \"A bit stressed about the upcoming exam but still in control.\",\n",
"    \"Some subjects are harder than others but I'm getting through it.\",\n",
"    \"I could be more organised but things are mostly on track.\",\n",
"    \"Feeling the pressure a little but nothing I can't handle.\"\n",
"]\n",
"\n",
"low_stress_comments = [\n",
"    \"Today was productive. I finished all my assignments ahead of schedule.\",\n",
"    \"I'm not too worried about my exams, I think I'm well prepared.\",\n",
"    \"Everything is fine this week. Light workload and manageable deadlines.\",\n",
"    \"I feel on top of things. Good energy and focus today.\",\n",
"    \"Feeling motivated and ready.\"\n",
"]\n",
"\n",
"# --- Function to assign comments ---\n",
"def assign_comment(row):\n",
"    \"\"\"Pick a random template comment that matches the row's stress_score.\n",
"\n",
"    Scores above 7 draw from the high-stress templates, scores above 4 from the\n",
"    medium-stress templates, and everything else from the low-stress templates.\n",
"    \"\"\"\n",
"    if row[\"stress_score\"] > 7:\n",
"        return random.choice(high_stress_comments)\n",
"    elif row[\"stress_score\"] > 4:\n",
"        return random.choice(medium_stress_comments)\n",
"    else:\n",
"        return random.choice(low_stress_comments)\n",
"\n",
"# --- Create sample of students ---\n",
"# Cap the sample size so datasets with fewer than 200 rows do not make sample() raise\n",
"df_sample = df.sample(n=min(200, len(df)), random_state=2025).reset_index(drop=True)\n",
"\n",
"# Add student ID\n",
"df_sample[\"student_id\"] = df_sample.index + 1\n",
"\n",
"# Generate comments\n",
"df_sample[\"comment\"] = df_sample.apply(assign_comment, axis=1)\n",
"\n",
"# Final dataset\n",
"df_reviews = df_sample[[\"student_id\", \"comment\", \"stress_score\", \"risk_level\"]].copy()\n",
"\n",
"print(\"✅ Comments generated!\")\n",
"df_reviews.head()"
], "metadata": { "id": "CUPO07rgJu9H", "colab": { "base_uri": "https://localhost:8080/", "height": 224 }, "outputId": "e002f896-f068-4386-d60e-24179902f3e1" }, "execution_count": 16, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✅ Comments generated!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " student_id comment \\\n", "0 1 It's been a busy week but I'm managing to keep... \n", "1 2 I'm not too worried about my exams, I think I'... \n", "2 3 Today was productive. I finished all my assign... \n", "3 4 I could be more organised but things are mostl... \n", "4 5 Everything is fine this week. Light workload a... \n", "\n", " stress_score risk_level \n", "0 4.6 low \n", "1 3.7 low \n", "2 2.3 high \n", "3 5.0 high \n", "4 3.3 low " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idcommentstress_scorerisk_level
01It's been a busy week but I'm managing to keep...4.6low
12I'm not too worried about my exams, I think I'...3.7low
23Today was productive. I finished all my assign...2.3high
34I could be more organised but things are mostl...5.0high
45Everything is fine this week. Light workload a...3.3low
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_reviews", "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 200,\n \"fields\": [\n {\n \"column\": \"student_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 57,\n \"min\": 1,\n \"max\": 200,\n \"num_unique_values\": 200,\n \"samples\": [\n 96,\n 16,\n 31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"comment\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"I feel completely overwhelmed this week. Two exams and three deadlines.\",\n \"Feeling the pressure a little but nothing I can't handle.\",\n \"It's been a busy week but I'm managing to keep up.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"stress_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.5300921245296766,\n \"min\": 0.0,\n \"max\": 8.0,\n \"num_unique_values\": 65,\n \"samples\": [\n 8.0,\n 7.0,\n 4.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"risk_level\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"high\",\n \"low\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "df_reviews.to_csv(\"synthetic_student_reviews.csv\", index=False)\n", "print(\"Saved!\")" ], "metadata": { "id": "zLUI4EZTJvIp", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "65162dd3-9414-48fe-9b00-9eb9c14a4c35" }, "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved!\n" ] } ] } ] }