{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "Hs5iKOIKnRfW" }, "outputs": [], "source": [ "# Notebook 1: Real Data Extraction and Synthetic Enrichment\n", "\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "source": [ "# Real Data Extraction and Synthetic Enrichment\n", "\n", "## Objective\n", "This notebook loads real-world data, cleans and prepares it, creates synthetic enrichment, combines both, and saves the final dataset for further analysis.\n" ], "metadata": { "id": "47POjvLpnpiw" } }, { "cell_type": "code", "source": [ "df = pd.read_csv(\"datareal.csv\", sep=\";\", header=0)\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "id": "vc20fW5QnqPz", "outputId": "6e2c1789-1b81-406b-a482-0ded1df40b48" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME AVG_JOB_SATISFACTION \\\n", "0 Finland 37,2 27389.0 8,1 \n", "1 Iceland 39,4 43769.0 8,1 \n", "2 Norway 34 42203.0 8,1 \n", "3 Austria 36,7 27804.0 8 \n", "4 Switzerland 35,3 49607.0 7,9 \n", "\n", " Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 \n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTIONUnnamed: 4Unnamed: 5Unnamed: 6Unnamed: 7
0Finland37,227389.08,1NaNNaNNaNNaN
1Iceland39,443769.08,1NaNNaNNaNNaN
2Norway3442203.08,1NaNNaNNaNNaN
3Austria36,727804.08NaNNaNNaNNaN
4Switzerland35,349607.07,9NaNNaNNaNNaN
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 34,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 27,\n \"samples\": [\n \"38,6\",\n \"39\",\n \"37,3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"7,2\",\n \"7\",\n \"8,1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Unnamed: 4\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Unnamed: 5\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Unnamed: 6\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Unnamed: 7\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n 
\"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "df = df.loc[:, ~df.columns.str.contains(\"^Unnamed\")]\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "id": "l6ew3fwSofw4", "outputId": "fdda8a25-6528-470d-a992-f4cb3d1b2bac" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME AVG_JOB_SATISFACTION\n", "0 Finland 37,2 27389.0 8,1\n", "1 Iceland 39,4 43769.0 8,1\n", "2 Norway 34 42203.0 8,1\n", "3 Austria 36,7 27804.0 8\n", "4 Switzerland 35,3 49607.0 7,9" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTION
0Finland37,227389.08,1
1Iceland39,443769.08,1
2Norway3442203.08,1
3Austria36,727804.08
4Switzerland35,349607.07,9
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 34,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 27,\n \"samples\": [\n \"38,6\",\n \"39\",\n \"37,3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"7,2\",\n \"7\",\n \"8,1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "code", "source": [ "# convert comma to dot and to float\n", "df[\"AVG_WEEKLY_WORKING_HOURS\"] = (\n", " df[\"AVG_WEEKLY_WORKING_HOURS\"]\n", " .astype(str)\n", " .str.replace(\",\", \".\", regex=False)\n", " .astype(float)\n", ")\n", "\n", "df[\"AVG_JOB_SATISFACTION\"] = (\n", " df[\"AVG_JOB_SATISFACTION\"]\n", " .astype(str)\n", " .str.replace(\",\", \".\", regex=False)\n", " .astype(float)\n", ")\n", "\n", "# income sollte schon numeric sein, aber sicher ist sicher\n", "df[\"MEAN_NET_INCOME\"] = pd.to_numeric(df[\"MEAN_NET_INCOME\"], errors=\"coerce\")\n", "\n", "df.dtypes" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 203 }, "id": "4Cpsqz_TqA4I", "outputId": "17a2119d-3cb5-4f6e-e094-c83fa3d405e5" }, 
"execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "COUNTRY object\n", "AVG_WEEKLY_WORKING_HOURS float64\n", "MEAN_NET_INCOME float64\n", "AVG_JOB_SATISFACTION float64\n", "dtype: object" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
COUNTRYobject
AVG_WEEKLY_WORKING_HOURSfloat64
MEAN_NET_INCOMEfloat64
AVG_JOB_SATISFACTIONfloat64
\n", "

" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "print(\"Shape:\", df.shape)\n", "print(\"\\nColumns:\")\n", "print(df.columns)\n", "print(\"\\nData types:\")\n", "print(df.dtypes)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YQcfRJ_HpepE", "outputId": "608efc48-5c7c-41aa-d61b-df4367155220" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Shape: (34, 4)\n", "\n", "Columns:\n", "Index(['COUNTRY', 'AVG_WEEKLY_WORKING_HOURS', 'MEAN_NET_INCOME',\n", " 'AVG_JOB_SATISFACTION'],\n", " dtype='object')\n", "\n", "Data types:\n", "COUNTRY object\n", "AVG_WEEKLY_WORKING_HOURS object\n", "MEAN_NET_INCOME float64\n", "AVG_JOB_SATISFACTION object\n", "dtype: object\n" ] } ] }, { "cell_type": "code", "source": [ "df[\"AVG_WEEKLY_WORKING_HOURS\"] = (\n", " df[\"AVG_WEEKLY_WORKING_HOURS\"]\n", " .astype(str)\n", " .str.replace(\",\", \".\", regex=False)\n", " .astype(float)\n", ")\n", "\n", "df[\"AVG_JOB_SATISFACTION\"] = (\n", " df[\"AVG_JOB_SATISFACTION\"]\n", " .astype(str)\n", " .str.replace(\",\", \".\", regex=False)\n", " .astype(float)\n", ")\n", "\n", "df[\"MEAN_NET_INCOME\"] = pd.to_numeric(df[\"MEAN_NET_INCOME\"], errors=\"coerce\")\n", "\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "id": "MAfzlEwEpiND", "outputId": "8720cfd2-0213-4b21-d376-b3996805ba62" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n", "0 Finland 37.2 27389.0 \n", "1 Iceland 39.4 43769.0 \n", "2 Norway 34.0 42203.0 \n", "3 Austria 36.7 27804.0 \n", "4 Switzerland 35.3 49607.0 \n", "\n", " AVG_JOB_SATISFACTION \n", "0 8.1 \n", "1 8.1 \n", "2 8.1 \n", "3 8.0 \n", "4 7.9 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTION
0Finland37.227389.08.1
1Iceland39.443769.08.1
2Norway34.042203.08.1
3Austria36.727804.08.0
4Switzerland35.349607.07.9
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 34,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.826219902872729,\n \"min\": 30.7,\n \"max\": 45.9,\n \"num_unique_values\": 27,\n \"samples\": [\n 38.6,\n 39.0,\n 37.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5123105710524828,\n \"min\": 6.2,\n \"max\": 8.1,\n \"num_unique_values\": 14,\n \"samples\": [\n 7.2,\n 7.0,\n 8.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "print(df.isna().sum())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xBsNuIjepk2u", "outputId": "0a9aca9a-a11d-43ef-bb91-30b9c8144130" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "COUNTRY 1\n", "AVG_WEEKLY_WORKING_HOURS 1\n", "MEAN_NET_INCOME 1\n", "AVG_JOB_SATISFACTION 1\n", "dtype: int64\n" ] } ] }, { "cell_type": "code", "source": [ "df = df.drop_duplicates(subset=[\"COUNTRY\"])\n", "\n", "df = df.dropna(subset=[\n", " \"COUNTRY\",\n", " \"AVG_WEEKLY_WORKING_HOURS\",\n", " \"MEAN_NET_INCOME\",\n", " \"AVG_JOB_SATISFACTION\"\n", "])\n", "\n", 
"df = df.reset_index(drop=True)\n", "\n", "print(df.shape)\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 217 }, "id": "_k1uEgSipnii", "outputId": "c757fdae-3bc4-4c86-ad3a-c5f1fd53b717" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(33, 4)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n", "0 Finland 37.2 27389.0 \n", "1 Iceland 39.4 43769.0 \n", "2 Norway 34.0 42203.0 \n", "3 Austria 36.7 27804.0 \n", "4 Switzerland 35.3 49607.0 \n", "\n", " AVG_JOB_SATISFACTION \n", "0 8.1 \n", "1 8.1 \n", "2 8.1 \n", "3 8.0 \n", "4 7.9 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTION
0Finland37.227389.08.1
1Iceland39.443769.08.1
2Norway34.042203.08.1
3Austria36.727804.08.0
4Switzerland35.349607.07.9
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 33,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.826219902872729,\n \"min\": 30.7,\n \"max\": 45.9,\n \"num_unique_values\": 27,\n \"samples\": [\n 38.6,\n 39.0,\n 37.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5123105710524828,\n \"min\": 6.2,\n \"max\": 8.1,\n \"num_unique_values\": 14,\n \"samples\": [\n 7.2,\n 7.0,\n 8.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "np.random.seed(42)\n", "\n", "df_enriched = df.copy()\n", "\n", "# Synthetic Feature 1: Savings rate (5%–30%)\n", "df_enriched[\"ESTIMATED_SAVINGS_RATE\"] = np.random.uniform(0.05, 0.30, size=len(df_enriched))\n", "\n", "# Synthetic Feature 2: Monthly savings\n", "df_enriched[\"ESTIMATED_MONTHLY_SAVINGS\"] = (\n", " df_enriched[\"MEAN_NET_INCOME\"] * df_enriched[\"ESTIMATED_SAVINGS_RATE\"]\n", ").round(2)\n", "\n", "# Synthetic Feature 3: Work-life balance score (based on working hours)\n", "df_enriched[\"WORK_LIFE_BALANCE_SCORE\"] = (\n", " 10\n", " - (df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"] - 
df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].min()) /\n", " (df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].max() - df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].min()) * 5\n", " + np.random.normal(0, 0.3, size=len(df_enriched))\n", ").round(2)\n", "\n", "# Synthetic Feature 4: Income category\n", "df_enriched[\"INCOME_CATEGORY\"] = pd.cut(\n", " df_enriched[\"MEAN_NET_INCOME\"],\n", " bins=[0, 25000, 40000, 100000],\n", " labels=[\"Low\", \"Medium\", \"High\"]\n", ")\n", "\n", "df_enriched.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 221 }, "id": "AhcYwO7yqMfA", "outputId": "8086c3a9-0096-4d87-f755-97a06519256f" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n", "0 Finland 37.2 27389.0 \n", "1 Iceland 39.4 43769.0 \n", "2 Norway 34.0 42203.0 \n", "3 Austria 36.7 27804.0 \n", "4 Switzerland 35.3 49607.0 \n", "\n", " AVG_JOB_SATISFACTION ESTIMATED_SAVINGS_RATE ESTIMATED_MONTHLY_SAVINGS \\\n", "0 8.1 0.143635 3934.02 \n", "1 8.1 0.287679 12591.40 \n", "2 8.1 0.232998 9833.24 \n", "3 8.0 0.199665 5551.48 \n", "4 7.9 0.089005 4415.25 \n", "\n", " WORK_LIFE_BALANCE_SCORE INCOME_CATEGORY \n", "0 7.68 Medium \n", "1 7.42 High \n", "2 9.00 High \n", "3 7.84 Medium \n", "4 8.18 High " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTIONESTIMATED_SAVINGS_RATEESTIMATED_MONTHLY_SAVINGSWORK_LIFE_BALANCE_SCOREINCOME_CATEGORY
0Finland37.227389.08.10.1436353934.027.68Medium
1Iceland39.443769.08.10.28767912591.407.42High
2Norway34.042203.08.10.2329989833.249.00High
3Austria36.727804.08.00.1996655551.487.84Medium
4Switzerland35.349607.07.90.0890054415.258.18High
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_enriched", "summary": "{\n \"name\": \"df_enriched\",\n \"rows\": 33,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.826219902872729,\n \"min\": 30.7,\n \"max\": 45.9,\n \"num_unique_values\": 27,\n \"samples\": [\n 38.6,\n 39.0,\n 37.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5123105710524828,\n \"min\": 6.2,\n \"max\": 8.1,\n \"num_unique_values\": 14,\n \"samples\": [\n 7.2,\n 7.0,\n 8.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ESTIMATED_SAVINGS_RATE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.07051759984769274,\n \"min\": 0.055146123573950614,\n \"max\": 0.2924774630404986,\n \"num_unique_values\": 33,\n \"samples\": [\n 0.09263103092182289,\n 0.09585112746335846,\n 0.09991844553958994\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ESTIMATED_MONTHLY_SAVINGS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3018.657615281089,\n \"min\": 281.6,\n \"max\": 12591.4,\n \"num_unique_values\": 33,\n \"samples\": [\n 281.6,\n 1685.25,\n 2586.09\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"WORK_LIFE_BALANCE_SCORE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.9677050326948963,\n \"min\": 4.95,\n \"max\": 10.33,\n \"num_unique_values\": 31,\n \"samples\": [\n 6.52,\n 7.4,\n 7.12\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"INCOME_CATEGORY\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Medium\",\n \"High\",\n \"Low\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "np.random.seed(42)\n", "\n", "df_enriched = df.copy()\n", "\n", "# 1. Work Pressure (mehr Stunden = mehr Druck)\n", "df_enriched[\"WORK_PRESSURE\"] = (\n", " (df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"] - df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].min()) /\n", " (df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].max() - df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].min())\n", ").round(2)\n", "\n", "\n", "# 2. Cost of Living (angenommen: höheres Einkommen → höhere Lebenshaltungskosten)\n", "df_enriched[\"COST_OF_LIVING_INDEX\"] = (\n", " df_enriched[\"MEAN_NET_INCOME\"] / df_enriched[\"MEAN_NET_INCOME\"].mean()\n", ").round(2)\n", "\n", "\n", "# 3. Financial Comfort (Einkommen vs. Kosten)\n", "df_enriched[\"FINANCIAL_COMFORT\"] = (\n", " df_enriched[\"MEAN_NET_INCOME\"] / (df_enriched[\"COST_OF_LIVING_INDEX\"] * 1000)\n", ").round(2)\n", "\n", "\n", "# 4. Work-Life Balance (weniger Stunden = besser)\n", "df_enriched[\"WORK_LIFE_BALANCE\"] = (\n", " 1 - df_enriched[\"WORK_PRESSURE\"] + np.random.normal(0, 0.05, size=len(df_enriched))\n", ").clip(0,1).round(2)\n", "\n", "\n", "# 5. Stress Level (kombiniert mehrere Faktoren)\n", "df_enriched[\"STRESS_LEVEL\"] = (\n", " df_enriched[\"WORK_PRESSURE\"] * 0.6 +\n", " (1 - df_enriched[\"WORK_LIFE_BALANCE\"]) * 0.4 +\n", " np.random.normal(0, 0.05, size=len(df_enriched))\n", ").clip(0,1).round(2)\n", "\n", "\n", "# 6. 
Satisfaction Driver Score (synthetic erklärende Variable)\n", "df_enriched[\"SATISFACTION_DRIVER_SCORE\"] = (\n", " df_enriched[\"WORK_LIFE_BALANCE\"] * 0.5 +\n", " df_enriched[\"FINANCIAL_COMFORT\"] * 0.3 -\n", " df_enriched[\"STRESS_LEVEL\"] * 0.2\n", ").round(2)\n", "\n", "\n", "df_enriched.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 221 }, "id": "fWUaiDEpqVh2", "outputId": "5f10c579-c883-424c-c0d1-a9caca968c5a" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n", "0 Finland 37.2 27389.0 \n", "1 Iceland 39.4 43769.0 \n", "2 Norway 34.0 42203.0 \n", "3 Austria 36.7 27804.0 \n", "4 Switzerland 35.3 49607.0 \n", "\n", " AVG_JOB_SATISFACTION WORK_PRESSURE COST_OF_LIVING_INDEX \\\n", "0 8.1 0.43 1.42 \n", "1 8.1 0.57 2.26 \n", "2 8.1 0.22 2.18 \n", "3 8.0 0.39 1.44 \n", "4 7.9 0.30 2.56 \n", "\n", " FINANCIAL_COMFORT WORK_LIFE_BALANCE STRESS_LEVEL \\\n", "0 19.29 0.59 0.37 \n", "1 19.37 0.42 0.62 \n", "2 19.36 0.81 0.15 \n", "3 19.31 0.69 0.37 \n", "4 19.38 0.69 0.21 \n", "\n", " SATISFACTION_DRIVER_SCORE \n", "0 6.01 \n", "1 5.90 \n", "2 6.18 \n", "3 6.06 \n", "4 6.12 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTIONWORK_PRESSURECOST_OF_LIVING_INDEXFINANCIAL_COMFORTWORK_LIFE_BALANCESTRESS_LEVELSATISFACTION_DRIVER_SCORE
0Finland37.227389.08.10.431.4219.290.590.376.01
1Iceland39.443769.08.10.572.2619.370.420.625.90
2Norway34.042203.08.10.222.1819.360.810.156.18
3Austria36.727804.08.00.391.4419.310.690.376.06
4Switzerland35.349607.07.90.302.5619.380.690.216.12
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_enriched", "summary": "{\n \"name\": \"df_enriched\",\n \"rows\": 33,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.826219902872729,\n \"min\": 30.7,\n \"max\": 45.9,\n \"num_unique_values\": 27,\n \"samples\": [\n 38.6,\n 39.0,\n 37.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5123105710524828,\n \"min\": 6.2,\n \"max\": 8.1,\n \"num_unique_values\": 14,\n \"samples\": [\n 7.2,\n 7.0,\n 8.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_PRESSURE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1857264606398835,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 25,\n \"samples\": [\n 0.52,\n 0.59,\n 0.43\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"COST_OF_LIVING_INDEX\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6638123080382213,\n \"min\": 0.16,\n \"max\": 2.56,\n \"num_unique_values\": 32,\n \"samples\": [\n 0.47,\n 0.91,\n 0.57\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"FINANCIAL_COMFORT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.12098631903624234,\n \"min\": 19.0,\n 
\"max\": 19.66,\n \"num_unique_values\": 21,\n \"samples\": [\n 19.29,\n 19.57,\n 19.41\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_LIFE_BALANCE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.19779657441038972,\n \"min\": 0.0,\n \"max\": 0.98,\n \"num_unique_values\": 25,\n \"samples\": [\n 0.6,\n 0.58,\n 0.59\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"STRESS_LEVEL\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.19463689970628054,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 25,\n \"samples\": [\n 0.41,\n 0.43,\n 0.37\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SATISFACTION_DRIVER_SCORE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1423923038012174,\n \"min\": 5.68,\n \"max\": 6.29,\n \"num_unique_values\": 24,\n \"samples\": [\n 6.03,\n 6.05,\n 6.01\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "synthetic_rows = []\n", "\n", "for _, row in df.iterrows():\n", " for i in range(3): # 3 neue Versionen pro Land\n", "\n", " new_row = row.copy()\n", "\n", " # kleine Variationen hinzufügen\n", " new_row[\"AVG_WEEKLY_WORKING_HOURS\"] = round(\n", " max(20, row[\"AVG_WEEKLY_WORKING_HOURS\"] + np.random.normal(0, 1.5)), 1\n", " )\n", "\n", " new_row[\"MEAN_NET_INCOME\"] = round(\n", " max(5000, row[\"MEAN_NET_INCOME\"] + np.random.normal(0, 3000)), 2\n", " )\n", "\n", " new_row[\"AVG_JOB_SATISFACTION\"] = round(\n", " min(10, max(1, row[\"AVG_JOB_SATISFACTION\"] + np.random.normal(0, 0.4))), 1\n", " )\n", "\n", " # neuen Namen geben\n", " new_row[\"COUNTRY\"] = f\"{row['COUNTRY']}_synthetic_{i+1}\"\n", "\n", " synthetic_rows.append(new_row)\n", "\n", "synthetic_df = pd.DataFrame(synthetic_rows)\n", "\n", "synthetic_df.head()" ], "metadata": { "colab": { "base_uri": 
"https://localhost:8080/", "height": 201 }, "id": "JWNeZH1Xq6Xi", "outputId": "911d9ad7-306f-4753-a8df-fb6e21cee5fc" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n", "0 Finland_synthetic_1 37.1 30399.60 \n", "0 Finland_synthetic_2 36.2 28473.19 \n", "0 Finland_synthetic_3 37.1 32082.93 \n", "1 Iceland_synthetic_1 40.6 44030.14 \n", "1 Iceland_synthetic_2 39.5 37806.29 \n", "\n", " AVG_JOB_SATISFACTION \n", "0 8.2 \n", "0 8.7 \n", "0 7.1 \n", "1 8.0 \n", "1 8.0 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTION
0Finland_synthetic_137.130399.608.2
0Finland_synthetic_236.228473.198.7
0Finland_synthetic_337.132082.937.1
1Iceland_synthetic_140.644030.148.0
1Iceland_synthetic_239.537806.298.0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "synthetic_df", "summary": "{\n \"name\": \"synthetic_df\",\n \"rows\": 99,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 99,\n \"samples\": [\n \"Italy_synthetic_3\",\n \"Czechia_synthetic_2\",\n \"Serbia_synthetic_3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1257427894793697,\n \"min\": 29.5,\n \"max\": 46.7,\n \"num_unique_values\": 61,\n \"samples\": [\n 37.1,\n 32.8,\n 37.9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13060.359152584817,\n \"min\": 5000.0,\n \"max\": 55265.56,\n \"num_unique_values\": 91,\n \"samples\": [\n 9879.51,\n 35179.78,\n 40211.72\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6525973667018435,\n \"min\": 5.6,\n \"max\": 8.8,\n \"num_unique_values\": 30,\n \"samples\": [\n 6.1,\n 7.5,\n 6.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "np.random.seed(42)\n", "\n", "synthetic_enriched = synthetic_df.copy()\n", "n_rows = len(synthetic_enriched)\n", "\n", "# Reference statistics are taken from df rather than from the synthetic rows.\n", "hours_min = df[\"AVG_WEEKLY_WORKING_HOURS\"].min()\n", "hours_max = df[\"AVG_WEEKLY_WORKING_HOURS\"].max()\n", "income_mean = df[\"MEAN_NET_INCOME\"].mean()\n", "\n", "# WORK PRESSURE: weekly hours min-max scaled against the range found in df.\n", "# Synthetic hours can fall outside that range (the unclipped index ranged from\n", "# -0.08 to 1.05 in the saved output), so clip to [0, 1] like the other 0-1\n", "# indices below.\n", "synthetic_enriched[\"WORK_PRESSURE\"] = (\n", "    (synthetic_enriched[\"AVG_WEEKLY_WORKING_HOURS\"] - hours_min) / (hours_max - hours_min)\n", ").clip(0, 1).round(2)\n", "\n", "# COST OF LIVING: income relative to the mean income in df.\n", "synthetic_enriched[\"COST_OF_LIVING_INDEX\"] = (\n", "    synthetic_enriched[\"MEAN_NET_INCOME\"] / income_mean\n", ").round(2)\n", "\n", "# FINANCIAL COMFORT\n", "# NOTE(review): income cancels against COST_OF_LIVING_INDEX, so this is just\n", "# income_mean / 1000 plus rounding noise (saved output: 19.07-19.66). Left\n", "# unchanged because df_enriched appears to carry the same near-constant\n", "# values - TODO confirm the intended formula and fix both frames together.\n", 
"synthetic_enriched[\"FINANCIAL_COMFORT\"] = (\n", "    synthetic_enriched[\"MEAN_NET_INCOME\"] / (synthetic_enriched[\"COST_OF_LIVING_INDEX\"] * 1000)\n", ").round(2)\n", "\n", "# WORK-LIFE BALANCE: inverse of work pressure plus Gaussian noise, kept in [0, 1].\n", "# The noise is drawn before the STRESS noise; keep this order so the seeded\n", "# random stream yields the same values on every run.\n", "synthetic_enriched[\"WORK_LIFE_BALANCE\"] = (\n", "    1 - synthetic_enriched[\"WORK_PRESSURE\"] + np.random.normal(0, 0.05, size=n_rows)\n", ").clip(0, 1).round(2)\n", "\n", "# STRESS: 60% work pressure, 40% lack of work-life balance, plus noise, kept in [0, 1].\n", "synthetic_enriched[\"STRESS_LEVEL\"] = (\n", "    synthetic_enriched[\"WORK_PRESSURE\"] * 0.6 +\n", "    (1 - synthetic_enriched[\"WORK_LIFE_BALANCE\"]) * 0.4 +\n", "    np.random.normal(0, 0.05, size=n_rows)\n", ").clip(0, 1).round(2)\n", "\n", "# DRIVER SCORE: weighted mix of balance (+0.5), financial comfort (+0.3) and stress (-0.2).\n", "synthetic_enriched[\"SATISFACTION_DRIVER_SCORE\"] = (\n", "    synthetic_enriched[\"WORK_LIFE_BALANCE\"] * 0.5 +\n", "    synthetic_enriched[\"FINANCIAL_COMFORT\"] * 0.3 -\n", "    synthetic_enriched[\"STRESS_LEVEL\"] * 0.2\n", ").round(2)\n", "\n", "synthetic_enriched.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 221 }, "id": "PNoV0eg4rGGT", "outputId": "30c96354-c9f1-4cf8-d5d9-142304401ba9" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n", "0 Finland_synthetic_1 37.1 30399.60 \n", "0 Finland_synthetic_2 36.2 28473.19 \n", "0 Finland_synthetic_3 37.1 32082.93 \n", "1 Iceland_synthetic_1 40.6 44030.14 \n", "1 Iceland_synthetic_2 39.5 37806.29 \n", "\n", " AVG_JOB_SATISFACTION WORK_PRESSURE COST_OF_LIVING_INDEX \\\n", "0 8.2 0.42 1.57 \n", "0 8.7 0.36 1.47 \n", "0 7.1 0.42 1.66 \n", "1 8.0 0.65 2.27 \n", "1 8.0 0.58 1.95 \n", "\n", " FINANCIAL_COMFORT WORK_LIFE_BALANCE STRESS_LEVEL \\\n", "0 19.36 0.60 0.40 \n", "0 19.37 0.63 0.29 \n", "0 19.33 0.61 0.39 \n", "1 19.40 0.43 0.60 \n", "1 19.39 0.41 0.54 \n", "\n", " SATISFACTION_DRIVER_SCORE \n", "0 6.03 \n", "0 6.07 \n", "0 6.03 \n", "1 5.91 \n", "1 5.91 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTIONWORK_PRESSURECOST_OF_LIVING_INDEXFINANCIAL_COMFORTWORK_LIFE_BALANCESTRESS_LEVELSATISFACTION_DRIVER_SCORE
0Finland_synthetic_137.130399.608.20.421.5719.360.600.406.03
0Finland_synthetic_236.228473.198.70.361.4719.370.630.296.07
0Finland_synthetic_337.132082.937.10.421.6619.330.610.396.03
1Iceland_synthetic_140.644030.148.00.652.2719.400.430.605.91
1Iceland_synthetic_239.537806.298.00.581.9519.390.410.545.91
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "synthetic_enriched", "summary": "{\n \"name\": \"synthetic_enriched\",\n \"rows\": 99,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 99,\n \"samples\": [\n \"Italy_synthetic_3\",\n \"Czechia_synthetic_2\",\n \"Serbia_synthetic_3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1257427894793697,\n \"min\": 29.5,\n \"max\": 46.7,\n \"num_unique_values\": 61,\n \"samples\": [\n 37.1,\n 32.8,\n 37.9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13060.359152584817,\n \"min\": 5000.0,\n \"max\": 55265.56,\n \"num_unique_values\": 91,\n \"samples\": [\n 9879.51,\n 35179.78,\n 40211.72\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6525973667018435,\n \"min\": 5.6,\n \"max\": 8.8,\n \"num_unique_values\": 30,\n \"samples\": [\n 6.1,\n 7.5,\n 6.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_PRESSURE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.20582383446743815,\n \"min\": -0.08,\n \"max\": 1.05,\n \"num_unique_values\": 52,\n \"samples\": [\n 0.55,\n 0.47,\n 0.83\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"COST_OF_LIVING_INDEX\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6746260941696909,\n \"min\": 0.26,\n \"max\": 2.86,\n \"num_unique_values\": 74,\n \"samples\": [\n 1.95,\n 0.38,\n 1.34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"FINANCIAL_COMFORT\",\n \"properties\": {\n \"dtype\": 
\"number\",\n \"std\": 0.10253899007535147,\n \"min\": 19.07,\n \"max\": 19.66,\n \"num_unique_values\": 35,\n \"samples\": [\n 19.22,\n 19.29,\n 19.28\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_LIFE_BALANCE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2070930427239679,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 55,\n \"samples\": [\n 0.52,\n 0.38,\n 0.42\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"STRESS_LEVEL\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.20114212129276393,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 54,\n \"samples\": [\n 0.04,\n 0.83,\n 0.76\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SATISFACTION_DRIVER_SCORE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1506471985714822,\n \"min\": 5.57,\n \"max\": 6.31,\n \"num_unique_values\": 47,\n \"samples\": [\n 5.79,\n 5.84,\n 5.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [ "# pd.concat silently fills NaN for columns present in only one frame,\n", "# so fail fast if the real and synthetic schemas have drifted apart.\n", "mismatched = set(df_enriched.columns) ^ set(synthetic_enriched.columns)\n", "assert not mismatched, f\"Column mismatch between real and synthetic data: {sorted(mismatched)}\"\n", "\n", "# ignore_index=True gives the combined frame a fresh 0..n-1 index.\n", "final_df = pd.concat([df_enriched, synthetic_enriched], ignore_index=True)\n", "\n", "final_df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "R3DA8YhwrTVt", "outputId": "c0c74732-bc3e-4e18-d241-b6a93f0f46cf" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(132, 10)" ] }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [ "final_df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 221 }, "id": "AaBdHgscrkqx", "outputId": "63fdbf91-9388-406c-d097-bffc892187ff" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n", "0 Finland 37.2 27389.0 \n", "1 Iceland 39.4 43769.0 \n", 
"2 Norway 34.0 42203.0 \n", "3 Austria 36.7 27804.0 \n", "4 Switzerland 35.3 49607.0 \n", "\n", " AVG_JOB_SATISFACTION WORK_PRESSURE COST_OF_LIVING_INDEX \\\n", "0 8.1 0.43 1.42 \n", "1 8.1 0.57 2.26 \n", "2 8.1 0.22 2.18 \n", "3 8.0 0.39 1.44 \n", "4 7.9 0.30 2.56 \n", "\n", " FINANCIAL_COMFORT WORK_LIFE_BALANCE STRESS_LEVEL \\\n", "0 19.29 0.59 0.37 \n", "1 19.37 0.42 0.62 \n", "2 19.36 0.81 0.15 \n", "3 19.31 0.69 0.37 \n", "4 19.38 0.69 0.21 \n", "\n", " SATISFACTION_DRIVER_SCORE \n", "0 6.01 \n", "1 5.90 \n", "2 6.18 \n", "3 6.06 \n", "4 6.12 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
COUNTRYAVG_WEEKLY_WORKING_HOURSMEAN_NET_INCOMEAVG_JOB_SATISFACTIONWORK_PRESSURECOST_OF_LIVING_INDEXFINANCIAL_COMFORTWORK_LIFE_BALANCESTRESS_LEVELSATISFACTION_DRIVER_SCORE
0Finland37.227389.08.10.431.4219.290.590.376.01
1Iceland39.443769.08.10.572.2619.370.420.625.90
2Norway34.042203.08.10.222.1819.360.810.156.18
3Austria36.727804.08.00.391.4419.310.690.376.06
4Switzerland35.349607.07.90.302.5619.380.690.216.12
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "final_df", "summary": "{\n \"name\": \"final_df\",\n \"rows\": 132,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 132,\n \"samples\": [\n \"Denmark_synthetic_3\",\n \"Spain_synthetic_3\",\n \"Poland\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.0439629980154277,\n \"min\": 29.5,\n \"max\": 46.7,\n \"num_unique_values\": 70,\n \"samples\": [\n 38.9,\n 37.2,\n 41.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12961.860338117047,\n \"min\": 3040.0,\n \"max\": 55265.56,\n \"num_unique_values\": 124,\n \"samples\": [\n 40075.0,\n 26627.68,\n 44030.14\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6194006639584302,\n \"min\": 5.6,\n \"max\": 8.8,\n \"num_unique_values\": 30,\n \"samples\": [\n 5.6,\n 8.7,\n 6.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_PRESSURE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2003536515873169,\n \"min\": -0.08,\n \"max\": 1.05,\n \"num_unique_values\": 57,\n \"samples\": [\n 0.43,\n 0.38,\n 0.37\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"COST_OF_LIVING_INDEX\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.66942505912527,\n \"min\": 0.16,\n \"max\": 2.86,\n \"num_unique_values\": 96,\n \"samples\": [\n 0.8,\n 2.17,\n 0.84\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"FINANCIAL_COMFORT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 
0.1069680783668568,\n \"min\": 19.0,\n \"max\": 19.66,\n \"num_unique_values\": 41,\n \"samples\": [\n 19.17,\n 19.43,\n 19.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_LIFE_BALANCE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.20410584264902806,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 61,\n \"samples\": [\n 0.59,\n 0.67,\n 0.53\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"STRESS_LEVEL\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1990030088918233,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 62,\n \"samples\": [\n 0.47,\n 0.3,\n 0.37\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SATISFACTION_DRIVER_SCORE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14815050318443265,\n \"min\": 5.57,\n \"max\": 6.31,\n \"num_unique_values\": 52,\n \"samples\": [\n 5.98,\n 5.73,\n 5.67\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "# Persist the combined real + synthetic dataset for further analysis;\n", "# index=False keeps the positional index out of the CSV.\n", "final_df.to_csv(\"final_dataset.csv\", index=False)" ], "metadata": { "id": "f4Yymv0yrVyb" }, "execution_count": null, "outputs": [] } ] }