{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Hs5iKOIKnRfW"
},
"outputs": [],
"source": [
"# Notebook 1: Real Data Extraction and Synthetic Enrichment\n",
"\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"source": [
"# Real Data Extraction and Synthetic Enrichment\n",
"\n",
"## Objective\n",
"This notebook loads real-world data, cleans and prepares it, creates synthetic enrichment, combines both, and saves the final dataset for further analysis.\n"
],
"metadata": {
"id": "47POjvLpnpiw"
}
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv(\"datareal.csv\", sep=\";\", header=0)\n",
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"id": "vc20fW5QnqPz",
"outputId": "6e2c1789-1b81-406b-a482-0ded1df40b48"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME AVG_JOB_SATISFACTION \\\n",
"0 Finland 37,2 27389.0 8,1 \n",
"1 Iceland 39,4 43769.0 8,1 \n",
"2 Norway 34 42203.0 8,1 \n",
"3 Austria 36,7 27804.0 8 \n",
"4 Switzerland 35,3 49607.0 7,9 \n",
"\n",
" Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 \n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN "
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
" Unnamed: 4 | \n",
" Unnamed: 5 | \n",
" Unnamed: 6 | \n",
" Unnamed: 7 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland | \n",
" 37,2 | \n",
" 27389.0 | \n",
" 8,1 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland | \n",
" 39,4 | \n",
" 43769.0 | \n",
" 8,1 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 2 | \n",
" Norway | \n",
" 34 | \n",
" 42203.0 | \n",
" 8,1 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 3 | \n",
" Austria | \n",
" 36,7 | \n",
" 27804.0 | \n",
" 8 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 4 | \n",
" Switzerland | \n",
" 35,3 | \n",
" 49607.0 | \n",
" 7,9 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 34,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 27,\n \"samples\": [\n \"38,6\",\n \"39\",\n \"37,3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"7,2\",\n \"7\",\n \"8,1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Unnamed: 4\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Unnamed: 5\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Unnamed: 6\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Unnamed: 7\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"source": [
"df = df.loc[:, ~df.columns.str.contains(\"^Unnamed\")]\n",
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"id": "l6ew3fwSofw4",
"outputId": "fdda8a25-6528-470d-a992-f4cb3d1b2bac"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME AVG_JOB_SATISFACTION\n",
"0 Finland 37,2 27389.0 8,1\n",
"1 Iceland 39,4 43769.0 8,1\n",
"2 Norway 34 42203.0 8,1\n",
"3 Austria 36,7 27804.0 8\n",
"4 Switzerland 35,3 49607.0 7,9"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland | \n",
" 37,2 | \n",
" 27389.0 | \n",
" 8,1 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland | \n",
" 39,4 | \n",
" 43769.0 | \n",
" 8,1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Norway | \n",
" 34 | \n",
" 42203.0 | \n",
" 8,1 | \n",
"
\n",
" \n",
" | 3 | \n",
" Austria | \n",
" 36,7 | \n",
" 27804.0 | \n",
" 8 | \n",
"
\n",
" \n",
" | 4 | \n",
" Switzerland | \n",
" 35,3 | \n",
" 49607.0 | \n",
" 7,9 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 34,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 27,\n \"samples\": [\n \"38,6\",\n \"39\",\n \"37,3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 14,\n \"samples\": [\n \"7,2\",\n \"7\",\n \"8,1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"source": [
"# convert comma to dot and to float\n",
"df[\"AVG_WEEKLY_WORKING_HOURS\"] = (\n",
" df[\"AVG_WEEKLY_WORKING_HOURS\"]\n",
" .astype(str)\n",
" .str.replace(\",\", \".\", regex=False)\n",
" .astype(float)\n",
")\n",
"\n",
"df[\"AVG_JOB_SATISFACTION\"] = (\n",
" df[\"AVG_JOB_SATISFACTION\"]\n",
" .astype(str)\n",
" .str.replace(\",\", \".\", regex=False)\n",
" .astype(float)\n",
")\n",
"\n",
"# income sollte schon numeric sein, aber sicher ist sicher\n",
"df[\"MEAN_NET_INCOME\"] = pd.to_numeric(df[\"MEAN_NET_INCOME\"], errors=\"coerce\")\n",
"\n",
"df.dtypes"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 203
},
"id": "4Cpsqz_TqA4I",
"outputId": "17a2119d-3cb5-4f6e-e094-c83fa3d405e5"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"COUNTRY object\n",
"AVG_WEEKLY_WORKING_HOURS float64\n",
"MEAN_NET_INCOME float64\n",
"AVG_JOB_SATISFACTION float64\n",
"dtype: object"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
"
\n",
" \n",
" \n",
" \n",
" | COUNTRY | \n",
" object | \n",
"
\n",
" \n",
" | AVG_WEEKLY_WORKING_HOURS | \n",
" float64 | \n",
"
\n",
" \n",
" | MEAN_NET_INCOME | \n",
" float64 | \n",
"
\n",
" \n",
" | AVG_JOB_SATISFACTION | \n",
" float64 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"print(\"Shape:\", df.shape)\n",
"print(\"\\nColumns:\")\n",
"print(df.columns)\n",
"print(\"\\nData types:\")\n",
"print(df.dtypes)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YQcfRJ_HpepE",
"outputId": "608efc48-5c7c-41aa-d61b-df4367155220"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Shape: (34, 4)\n",
"\n",
"Columns:\n",
"Index(['COUNTRY', 'AVG_WEEKLY_WORKING_HOURS', 'MEAN_NET_INCOME',\n",
" 'AVG_JOB_SATISFACTION'],\n",
" dtype='object')\n",
"\n",
"Data types:\n",
"COUNTRY object\n",
"AVG_WEEKLY_WORKING_HOURS object\n",
"MEAN_NET_INCOME float64\n",
"AVG_JOB_SATISFACTION object\n",
"dtype: object\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"AVG_WEEKLY_WORKING_HOURS\"] = (\n",
" df[\"AVG_WEEKLY_WORKING_HOURS\"]\n",
" .astype(str)\n",
" .str.replace(\",\", \".\", regex=False)\n",
" .astype(float)\n",
")\n",
"\n",
"df[\"AVG_JOB_SATISFACTION\"] = (\n",
" df[\"AVG_JOB_SATISFACTION\"]\n",
" .astype(str)\n",
" .str.replace(\",\", \".\", regex=False)\n",
" .astype(float)\n",
")\n",
"\n",
"df[\"MEAN_NET_INCOME\"] = pd.to_numeric(df[\"MEAN_NET_INCOME\"], errors=\"coerce\")\n",
"\n",
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"id": "MAfzlEwEpiND",
"outputId": "8720cfd2-0213-4b21-d376-b3996805ba62"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n",
"0 Finland 37.2 27389.0 \n",
"1 Iceland 39.4 43769.0 \n",
"2 Norway 34.0 42203.0 \n",
"3 Austria 36.7 27804.0 \n",
"4 Switzerland 35.3 49607.0 \n",
"\n",
" AVG_JOB_SATISFACTION \n",
"0 8.1 \n",
"1 8.1 \n",
"2 8.1 \n",
"3 8.0 \n",
"4 7.9 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland | \n",
" 37.2 | \n",
" 27389.0 | \n",
" 8.1 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland | \n",
" 39.4 | \n",
" 43769.0 | \n",
" 8.1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Norway | \n",
" 34.0 | \n",
" 42203.0 | \n",
" 8.1 | \n",
"
\n",
" \n",
" | 3 | \n",
" Austria | \n",
" 36.7 | \n",
" 27804.0 | \n",
" 8.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" Switzerland | \n",
" 35.3 | \n",
" 49607.0 | \n",
" 7.9 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 34,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.826219902872729,\n \"min\": 30.7,\n \"max\": 45.9,\n \"num_unique_values\": 27,\n \"samples\": [\n 38.6,\n 39.0,\n 37.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5123105710524828,\n \"min\": 6.2,\n \"max\": 8.1,\n \"num_unique_values\": 14,\n \"samples\": [\n 7.2,\n 7.0,\n 8.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"source": [
"print(df.isna().sum())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xBsNuIjepk2u",
"outputId": "0a9aca9a-a11d-43ef-bb91-30b9c8144130"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"COUNTRY 1\n",
"AVG_WEEKLY_WORKING_HOURS 1\n",
"MEAN_NET_INCOME 1\n",
"AVG_JOB_SATISFACTION 1\n",
"dtype: int64\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df = df.drop_duplicates(subset=[\"COUNTRY\"])\n",
"\n",
"df = df.dropna(subset=[\n",
" \"COUNTRY\",\n",
" \"AVG_WEEKLY_WORKING_HOURS\",\n",
" \"MEAN_NET_INCOME\",\n",
" \"AVG_JOB_SATISFACTION\"\n",
"])\n",
"\n",
"df = df.reset_index(drop=True)\n",
"\n",
"print(df.shape)\n",
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 217
},
"id": "_k1uEgSipnii",
"outputId": "c757fdae-3bc4-4c86-ad3a-c5f1fd53b717"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(33, 4)\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n",
"0 Finland 37.2 27389.0 \n",
"1 Iceland 39.4 43769.0 \n",
"2 Norway 34.0 42203.0 \n",
"3 Austria 36.7 27804.0 \n",
"4 Switzerland 35.3 49607.0 \n",
"\n",
" AVG_JOB_SATISFACTION \n",
"0 8.1 \n",
"1 8.1 \n",
"2 8.1 \n",
"3 8.0 \n",
"4 7.9 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland | \n",
" 37.2 | \n",
" 27389.0 | \n",
" 8.1 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland | \n",
" 39.4 | \n",
" 43769.0 | \n",
" 8.1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Norway | \n",
" 34.0 | \n",
" 42203.0 | \n",
" 8.1 | \n",
"
\n",
" \n",
" | 3 | \n",
" Austria | \n",
" 36.7 | \n",
" 27804.0 | \n",
" 8.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" Switzerland | \n",
" 35.3 | \n",
" 49607.0 | \n",
" 7.9 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 33,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.826219902872729,\n \"min\": 30.7,\n \"max\": 45.9,\n \"num_unique_values\": 27,\n \"samples\": [\n 38.6,\n 39.0,\n 37.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5123105710524828,\n \"min\": 6.2,\n \"max\": 8.1,\n \"num_unique_values\": 14,\n \"samples\": [\n 7.2,\n 7.0,\n 8.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"source": [
"np.random.seed(42)\n",
"\n",
"df_enriched = df.copy()\n",
"\n",
"# Synthetic Feature 1: Savings rate (5%–30%)\n",
"df_enriched[\"ESTIMATED_SAVINGS_RATE\"] = np.random.uniform(0.05, 0.30, size=len(df_enriched))\n",
"\n",
"# Synthetic Feature 2: Monthly savings\n",
"df_enriched[\"ESTIMATED_MONTHLY_SAVINGS\"] = (\n",
" df_enriched[\"MEAN_NET_INCOME\"] * df_enriched[\"ESTIMATED_SAVINGS_RATE\"]\n",
").round(2)\n",
"\n",
"# Synthetic Feature 3: Work-life balance score (based on working hours)\n",
"df_enriched[\"WORK_LIFE_BALANCE_SCORE\"] = (\n",
" 10\n",
" - (df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"] - df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].min()) /\n",
" (df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].max() - df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].min()) * 5\n",
" + np.random.normal(0, 0.3, size=len(df_enriched))\n",
").round(2)\n",
"\n",
"# Synthetic Feature 4: Income category\n",
"df_enriched[\"INCOME_CATEGORY\"] = pd.cut(\n",
" df_enriched[\"MEAN_NET_INCOME\"],\n",
" bins=[0, 25000, 40000, 100000],\n",
" labels=[\"Low\", \"Medium\", \"High\"]\n",
")\n",
"\n",
"df_enriched.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "AhcYwO7yqMfA",
"outputId": "8086c3a9-0096-4d87-f755-97a06519256f"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n",
"0 Finland 37.2 27389.0 \n",
"1 Iceland 39.4 43769.0 \n",
"2 Norway 34.0 42203.0 \n",
"3 Austria 36.7 27804.0 \n",
"4 Switzerland 35.3 49607.0 \n",
"\n",
" AVG_JOB_SATISFACTION ESTIMATED_SAVINGS_RATE ESTIMATED_MONTHLY_SAVINGS \\\n",
"0 8.1 0.143635 3934.02 \n",
"1 8.1 0.287679 12591.40 \n",
"2 8.1 0.232998 9833.24 \n",
"3 8.0 0.199665 5551.48 \n",
"4 7.9 0.089005 4415.25 \n",
"\n",
" WORK_LIFE_BALANCE_SCORE INCOME_CATEGORY \n",
"0 7.68 Medium \n",
"1 7.42 High \n",
"2 9.00 High \n",
"3 7.84 Medium \n",
"4 8.18 High "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
" ESTIMATED_SAVINGS_RATE | \n",
" ESTIMATED_MONTHLY_SAVINGS | \n",
" WORK_LIFE_BALANCE_SCORE | \n",
" INCOME_CATEGORY | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland | \n",
" 37.2 | \n",
" 27389.0 | \n",
" 8.1 | \n",
" 0.143635 | \n",
" 3934.02 | \n",
" 7.68 | \n",
" Medium | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland | \n",
" 39.4 | \n",
" 43769.0 | \n",
" 8.1 | \n",
" 0.287679 | \n",
" 12591.40 | \n",
" 7.42 | \n",
" High | \n",
"
\n",
" \n",
" | 2 | \n",
" Norway | \n",
" 34.0 | \n",
" 42203.0 | \n",
" 8.1 | \n",
" 0.232998 | \n",
" 9833.24 | \n",
" 9.00 | \n",
" High | \n",
"
\n",
" \n",
" | 3 | \n",
" Austria | \n",
" 36.7 | \n",
" 27804.0 | \n",
" 8.0 | \n",
" 0.199665 | \n",
" 5551.48 | \n",
" 7.84 | \n",
" Medium | \n",
"
\n",
" \n",
" | 4 | \n",
" Switzerland | \n",
" 35.3 | \n",
" 49607.0 | \n",
" 7.9 | \n",
" 0.089005 | \n",
" 4415.25 | \n",
" 8.18 | \n",
" High | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_enriched",
"summary": "{\n \"name\": \"df_enriched\",\n \"rows\": 33,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.826219902872729,\n \"min\": 30.7,\n \"max\": 45.9,\n \"num_unique_values\": 27,\n \"samples\": [\n 38.6,\n 39.0,\n 37.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5123105710524828,\n \"min\": 6.2,\n \"max\": 8.1,\n \"num_unique_values\": 14,\n \"samples\": [\n 7.2,\n 7.0,\n 8.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ESTIMATED_SAVINGS_RATE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.07051759984769274,\n \"min\": 0.055146123573950614,\n \"max\": 0.2924774630404986,\n \"num_unique_values\": 33,\n \"samples\": [\n 0.09263103092182289,\n 0.09585112746335846,\n 0.09991844553958994\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ESTIMATED_MONTHLY_SAVINGS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3018.657615281089,\n \"min\": 281.6,\n \"max\": 12591.4,\n \"num_unique_values\": 33,\n \"samples\": [\n 281.6,\n 1685.25,\n 2586.09\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_LIFE_BALANCE_SCORE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.9677050326948963,\n \"min\": 4.95,\n \"max\": 10.33,\n \"num_unique_values\": 31,\n \"samples\": [\n 6.52,\n 7.4,\n 7.12\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"INCOME_CATEGORY\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Medium\",\n \"High\",\n \"Low\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 14
}
]
},
{
"cell_type": "code",
"source": [
"np.random.seed(42)\n",
"\n",
"df_enriched = df.copy()\n",
"\n",
"# 1. Work Pressure (mehr Stunden = mehr Druck)\n",
"df_enriched[\"WORK_PRESSURE\"] = (\n",
" (df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"] - df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].min()) /\n",
" (df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].max() - df_enriched[\"AVG_WEEKLY_WORKING_HOURS\"].min())\n",
").round(2)\n",
"\n",
"\n",
"# 2. Cost of Living (angenommen: höheres Einkommen → höhere Lebenshaltungskosten)\n",
"df_enriched[\"COST_OF_LIVING_INDEX\"] = (\n",
" df_enriched[\"MEAN_NET_INCOME\"] / df_enriched[\"MEAN_NET_INCOME\"].mean()\n",
").round(2)\n",
"\n",
"\n",
"# 3. Financial Comfort (Einkommen vs. Kosten)\n",
"df_enriched[\"FINANCIAL_COMFORT\"] = (\n",
" df_enriched[\"MEAN_NET_INCOME\"] / (df_enriched[\"COST_OF_LIVING_INDEX\"] * 1000)\n",
").round(2)\n",
"\n",
"\n",
"# 4. Work-Life Balance (weniger Stunden = besser)\n",
"df_enriched[\"WORK_LIFE_BALANCE\"] = (\n",
" 1 - df_enriched[\"WORK_PRESSURE\"] + np.random.normal(0, 0.05, size=len(df_enriched))\n",
").clip(0,1).round(2)\n",
"\n",
"\n",
"# 5. Stress Level (kombiniert mehrere Faktoren)\n",
"df_enriched[\"STRESS_LEVEL\"] = (\n",
" df_enriched[\"WORK_PRESSURE\"] * 0.6 +\n",
" (1 - df_enriched[\"WORK_LIFE_BALANCE\"]) * 0.4 +\n",
" np.random.normal(0, 0.05, size=len(df_enriched))\n",
").clip(0,1).round(2)\n",
"\n",
"\n",
"# 6. Satisfaction Driver Score (synthetic erklärende Variable)\n",
"df_enriched[\"SATISFACTION_DRIVER_SCORE\"] = (\n",
" df_enriched[\"WORK_LIFE_BALANCE\"] * 0.5 +\n",
" df_enriched[\"FINANCIAL_COMFORT\"] * 0.3 -\n",
" df_enriched[\"STRESS_LEVEL\"] * 0.2\n",
").round(2)\n",
"\n",
"\n",
"df_enriched.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "fWUaiDEpqVh2",
"outputId": "5f10c579-c883-424c-c0d1-a9caca968c5a"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n",
"0 Finland 37.2 27389.0 \n",
"1 Iceland 39.4 43769.0 \n",
"2 Norway 34.0 42203.0 \n",
"3 Austria 36.7 27804.0 \n",
"4 Switzerland 35.3 49607.0 \n",
"\n",
" AVG_JOB_SATISFACTION WORK_PRESSURE COST_OF_LIVING_INDEX \\\n",
"0 8.1 0.43 1.42 \n",
"1 8.1 0.57 2.26 \n",
"2 8.1 0.22 2.18 \n",
"3 8.0 0.39 1.44 \n",
"4 7.9 0.30 2.56 \n",
"\n",
" FINANCIAL_COMFORT WORK_LIFE_BALANCE STRESS_LEVEL \\\n",
"0 19.29 0.59 0.37 \n",
"1 19.37 0.42 0.62 \n",
"2 19.36 0.81 0.15 \n",
"3 19.31 0.69 0.37 \n",
"4 19.38 0.69 0.21 \n",
"\n",
" SATISFACTION_DRIVER_SCORE \n",
"0 6.01 \n",
"1 5.90 \n",
"2 6.18 \n",
"3 6.06 \n",
"4 6.12 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
" WORK_PRESSURE | \n",
" COST_OF_LIVING_INDEX | \n",
" FINANCIAL_COMFORT | \n",
" WORK_LIFE_BALANCE | \n",
" STRESS_LEVEL | \n",
" SATISFACTION_DRIVER_SCORE | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland | \n",
" 37.2 | \n",
" 27389.0 | \n",
" 8.1 | \n",
" 0.43 | \n",
" 1.42 | \n",
" 19.29 | \n",
" 0.59 | \n",
" 0.37 | \n",
" 6.01 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland | \n",
" 39.4 | \n",
" 43769.0 | \n",
" 8.1 | \n",
" 0.57 | \n",
" 2.26 | \n",
" 19.37 | \n",
" 0.42 | \n",
" 0.62 | \n",
" 5.90 | \n",
"
\n",
" \n",
" | 2 | \n",
" Norway | \n",
" 34.0 | \n",
" 42203.0 | \n",
" 8.1 | \n",
" 0.22 | \n",
" 2.18 | \n",
" 19.36 | \n",
" 0.81 | \n",
" 0.15 | \n",
" 6.18 | \n",
"
\n",
" \n",
" | 3 | \n",
" Austria | \n",
" 36.7 | \n",
" 27804.0 | \n",
" 8.0 | \n",
" 0.39 | \n",
" 1.44 | \n",
" 19.31 | \n",
" 0.69 | \n",
" 0.37 | \n",
" 6.06 | \n",
"
\n",
" \n",
" | 4 | \n",
" Switzerland | \n",
" 35.3 | \n",
" 49607.0 | \n",
" 7.9 | \n",
" 0.30 | \n",
" 2.56 | \n",
" 19.38 | \n",
" 0.69 | \n",
" 0.21 | \n",
" 6.12 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_enriched",
"summary": "{\n \"name\": \"df_enriched\",\n \"rows\": 33,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 33,\n \"samples\": [\n \"Serbia\",\n \"Cyprus\",\n \"Germany\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.826219902872729,\n \"min\": 30.7,\n \"max\": 45.9,\n \"num_unique_values\": 27,\n \"samples\": [\n 38.6,\n 39.0,\n 37.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12859.991823068705,\n \"min\": 3040.0,\n \"max\": 49607.0,\n \"num_unique_values\": 33,\n \"samples\": [\n 3040.0,\n 17582.0,\n 25882.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5123105710524828,\n \"min\": 6.2,\n \"max\": 8.1,\n \"num_unique_values\": 14,\n \"samples\": [\n 7.2,\n 7.0,\n 8.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_PRESSURE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1857264606398835,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 25,\n \"samples\": [\n 0.52,\n 0.59,\n 0.43\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"COST_OF_LIVING_INDEX\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6638123080382213,\n \"min\": 0.16,\n \"max\": 2.56,\n \"num_unique_values\": 32,\n \"samples\": [\n 0.47,\n 0.91,\n 0.57\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"FINANCIAL_COMFORT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.12098631903624234,\n \"min\": 19.0,\n \"max\": 19.66,\n \"num_unique_values\": 21,\n \"samples\": [\n 19.29,\n 19.57,\n 19.41\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_LIFE_BALANCE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.19779657441038972,\n \"min\": 0.0,\n \"max\": 0.98,\n \"num_unique_values\": 25,\n \"samples\": [\n 0.6,\n 0.58,\n 0.59\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"STRESS_LEVEL\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.19463689970628054,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 25,\n \"samples\": [\n 0.41,\n 0.43,\n 0.37\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SATISFACTION_DRIVER_SCORE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1423923038012174,\n \"min\": 5.68,\n \"max\": 6.29,\n \"num_unique_values\": 24,\n \"samples\": [\n 6.03,\n 6.05,\n 6.01\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "code",
"source": [
"synthetic_rows = []\n",
"\n",
"for _, row in df.iterrows():\n",
" for i in range(3): # 3 neue Versionen pro Land\n",
"\n",
" new_row = row.copy()\n",
"\n",
" # kleine Variationen hinzufügen\n",
" new_row[\"AVG_WEEKLY_WORKING_HOURS\"] = round(\n",
" max(20, row[\"AVG_WEEKLY_WORKING_HOURS\"] + np.random.normal(0, 1.5)), 1\n",
" )\n",
"\n",
" new_row[\"MEAN_NET_INCOME\"] = round(\n",
" max(5000, row[\"MEAN_NET_INCOME\"] + np.random.normal(0, 3000)), 2\n",
" )\n",
"\n",
" new_row[\"AVG_JOB_SATISFACTION\"] = round(\n",
" min(10, max(1, row[\"AVG_JOB_SATISFACTION\"] + np.random.normal(0, 0.4))), 1\n",
" )\n",
"\n",
" # neuen Namen geben\n",
" new_row[\"COUNTRY\"] = f\"{row['COUNTRY']}_synthetic_{i+1}\"\n",
"\n",
" synthetic_rows.append(new_row)\n",
"\n",
"synthetic_df = pd.DataFrame(synthetic_rows)\n",
"\n",
"synthetic_df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"id": "JWNeZH1Xq6Xi",
"outputId": "911d9ad7-306f-4753-a8df-fb6e21cee5fc"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n",
"0 Finland_synthetic_1 37.1 30399.60 \n",
"0 Finland_synthetic_2 36.2 28473.19 \n",
"0 Finland_synthetic_3 37.1 32082.93 \n",
"1 Iceland_synthetic_1 40.6 44030.14 \n",
"1 Iceland_synthetic_2 39.5 37806.29 \n",
"\n",
" AVG_JOB_SATISFACTION \n",
"0 8.2 \n",
"0 8.7 \n",
"0 7.1 \n",
"1 8.0 \n",
"1 8.0 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland_synthetic_1 | \n",
" 37.1 | \n",
" 30399.60 | \n",
" 8.2 | \n",
"
\n",
" \n",
" | 0 | \n",
" Finland_synthetic_2 | \n",
" 36.2 | \n",
" 28473.19 | \n",
" 8.7 | \n",
"
\n",
" \n",
" | 0 | \n",
" Finland_synthetic_3 | \n",
" 37.1 | \n",
" 32082.93 | \n",
" 7.1 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland_synthetic_1 | \n",
" 40.6 | \n",
" 44030.14 | \n",
" 8.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland_synthetic_2 | \n",
" 39.5 | \n",
" 37806.29 | \n",
" 8.0 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "synthetic_df",
"summary": "{\n \"name\": \"synthetic_df\",\n \"rows\": 99,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 99,\n \"samples\": [\n \"Italy_synthetic_3\",\n \"Czechia_synthetic_2\",\n \"Serbia_synthetic_3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1257427894793697,\n \"min\": 29.5,\n \"max\": 46.7,\n \"num_unique_values\": 61,\n \"samples\": [\n 37.1,\n 32.8,\n 37.9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13060.359152584817,\n \"min\": 5000.0,\n \"max\": 55265.56,\n \"num_unique_values\": 91,\n \"samples\": [\n 9879.51,\n 35179.78,\n 40211.72\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6525973667018435,\n \"min\": 5.6,\n \"max\": 8.8,\n \"num_unique_values\": 30,\n \"samples\": [\n 6.1,\n 7.5,\n 6.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 16
}
]
},
{
"cell_type": "code",
"source": [
"np.random.seed(42)\n",
"\n",
"synthetic_enriched = synthetic_df.copy()\n",
"\n",
"# WORK PRESSURE\n",
"synthetic_enriched[\"WORK_PRESSURE\"] = (\n",
" (synthetic_enriched[\"AVG_WEEKLY_WORKING_HOURS\"] - df[\"AVG_WEEKLY_WORKING_HOURS\"].min()) /\n",
" (df[\"AVG_WEEKLY_WORKING_HOURS\"].max() - df[\"AVG_WEEKLY_WORKING_HOURS\"].min())\n",
").round(2)\n",
"\n",
"# COST OF LIVING\n",
"synthetic_enriched[\"COST_OF_LIVING_INDEX\"] = (\n",
" synthetic_enriched[\"MEAN_NET_INCOME\"] / df[\"MEAN_NET_INCOME\"].mean()\n",
").round(2)\n",
"\n",
"# FINANCIAL COMFORT\n",
"synthetic_enriched[\"FINANCIAL_COMFORT\"] = (\n",
" synthetic_enriched[\"MEAN_NET_INCOME\"] / (synthetic_enriched[\"COST_OF_LIVING_INDEX\"] * 1000)\n",
").round(2)\n",
"\n",
"# WORK-LIFE BALANCE\n",
"synthetic_enriched[\"WORK_LIFE_BALANCE\"] = (\n",
" 1 - synthetic_enriched[\"WORK_PRESSURE\"] + np.random.normal(0, 0.05, size=len(synthetic_enriched))\n",
").clip(0,1).round(2)\n",
"\n",
"# STRESS\n",
"synthetic_enriched[\"STRESS_LEVEL\"] = (\n",
" synthetic_enriched[\"WORK_PRESSURE\"] * 0.6 +\n",
" (1 - synthetic_enriched[\"WORK_LIFE_BALANCE\"]) * 0.4 +\n",
" np.random.normal(0, 0.05, size=len(synthetic_enriched))\n",
").clip(0,1).round(2)\n",
"\n",
"# DRIVER SCORE\n",
"synthetic_enriched[\"SATISFACTION_DRIVER_SCORE\"] = (\n",
" synthetic_enriched[\"WORK_LIFE_BALANCE\"] * 0.5 +\n",
" synthetic_enriched[\"FINANCIAL_COMFORT\"] * 0.3 -\n",
" synthetic_enriched[\"STRESS_LEVEL\"] * 0.2\n",
").round(2)\n",
"\n",
"synthetic_enriched.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "PNoV0eg4rGGT",
"outputId": "30c96354-c9f1-4cf8-d5d9-142304401ba9"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n",
"0 Finland_synthetic_1 37.1 30399.60 \n",
"0 Finland_synthetic_2 36.2 28473.19 \n",
"0 Finland_synthetic_3 37.1 32082.93 \n",
"1 Iceland_synthetic_1 40.6 44030.14 \n",
"1 Iceland_synthetic_2 39.5 37806.29 \n",
"\n",
" AVG_JOB_SATISFACTION WORK_PRESSURE COST_OF_LIVING_INDEX \\\n",
"0 8.2 0.42 1.57 \n",
"0 8.7 0.36 1.47 \n",
"0 7.1 0.42 1.66 \n",
"1 8.0 0.65 2.27 \n",
"1 8.0 0.58 1.95 \n",
"\n",
" FINANCIAL_COMFORT WORK_LIFE_BALANCE STRESS_LEVEL \\\n",
"0 19.36 0.60 0.40 \n",
"0 19.37 0.63 0.29 \n",
"0 19.33 0.61 0.39 \n",
"1 19.40 0.43 0.60 \n",
"1 19.39 0.41 0.54 \n",
"\n",
" SATISFACTION_DRIVER_SCORE \n",
"0 6.03 \n",
"0 6.07 \n",
"0 6.03 \n",
"1 5.91 \n",
"1 5.91 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
" WORK_PRESSURE | \n",
" COST_OF_LIVING_INDEX | \n",
" FINANCIAL_COMFORT | \n",
" WORK_LIFE_BALANCE | \n",
" STRESS_LEVEL | \n",
" SATISFACTION_DRIVER_SCORE | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland_synthetic_1 | \n",
" 37.1 | \n",
" 30399.60 | \n",
" 8.2 | \n",
" 0.42 | \n",
" 1.57 | \n",
" 19.36 | \n",
" 0.60 | \n",
" 0.40 | \n",
" 6.03 | \n",
"
\n",
" \n",
" | 0 | \n",
" Finland_synthetic_2 | \n",
" 36.2 | \n",
" 28473.19 | \n",
" 8.7 | \n",
" 0.36 | \n",
" 1.47 | \n",
" 19.37 | \n",
" 0.63 | \n",
" 0.29 | \n",
" 6.07 | \n",
"
\n",
" \n",
" | 0 | \n",
" Finland_synthetic_3 | \n",
" 37.1 | \n",
" 32082.93 | \n",
" 7.1 | \n",
" 0.42 | \n",
" 1.66 | \n",
" 19.33 | \n",
" 0.61 | \n",
" 0.39 | \n",
" 6.03 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland_synthetic_1 | \n",
" 40.6 | \n",
" 44030.14 | \n",
" 8.0 | \n",
" 0.65 | \n",
" 2.27 | \n",
" 19.40 | \n",
" 0.43 | \n",
" 0.60 | \n",
" 5.91 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland_synthetic_2 | \n",
" 39.5 | \n",
" 37806.29 | \n",
" 8.0 | \n",
" 0.58 | \n",
" 1.95 | \n",
" 19.39 | \n",
" 0.41 | \n",
" 0.54 | \n",
" 5.91 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "synthetic_enriched",
"summary": "{\n \"name\": \"synthetic_enriched\",\n \"rows\": 99,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 99,\n \"samples\": [\n \"Italy_synthetic_3\",\n \"Czechia_synthetic_2\",\n \"Serbia_synthetic_3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.1257427894793697,\n \"min\": 29.5,\n \"max\": 46.7,\n \"num_unique_values\": 61,\n \"samples\": [\n 37.1,\n 32.8,\n 37.9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13060.359152584817,\n \"min\": 5000.0,\n \"max\": 55265.56,\n \"num_unique_values\": 91,\n \"samples\": [\n 9879.51,\n 35179.78,\n 40211.72\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6525973667018435,\n \"min\": 5.6,\n \"max\": 8.8,\n \"num_unique_values\": 30,\n \"samples\": [\n 6.1,\n 7.5,\n 6.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_PRESSURE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.20582383446743815,\n \"min\": -0.08,\n \"max\": 1.05,\n \"num_unique_values\": 52,\n \"samples\": [\n 0.55,\n 0.47,\n 0.83\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"COST_OF_LIVING_INDEX\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6746260941696909,\n \"min\": 0.26,\n \"max\": 2.86,\n \"num_unique_values\": 74,\n \"samples\": [\n 1.95,\n 0.38,\n 1.34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"FINANCIAL_COMFORT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.10253899007535147,\n \"min\": 19.07,\n \"max\": 19.66,\n \"num_unique_values\": 35,\n \"samples\": [\n 19.22,\n 19.29,\n 19.28\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_LIFE_BALANCE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2070930427239679,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 55,\n \"samples\": [\n 0.52,\n 0.38,\n 0.42\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"STRESS_LEVEL\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.20114212129276393,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 54,\n \"samples\": [\n 0.04,\n 0.83,\n 0.76\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SATISFACTION_DRIVER_SCORE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1506471985714822,\n \"min\": 5.57,\n \"max\": 6.31,\n \"num_unique_values\": 47,\n \"samples\": [\n 5.79,\n 5.84,\n 5.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"source": [
"final_df = pd.concat([df_enriched, synthetic_enriched], ignore_index=True)\n",
"\n",
"final_df.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "R3DA8YhwrTVt",
"outputId": "c0c74732-bc3e-4e18-d241-b6a93f0f46cf"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(132, 10)"
]
},
"metadata": {},
"execution_count": 18
}
]
},
{
"cell_type": "code",
"source": [
"final_df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 221
},
"id": "AaBdHgscrkqx",
"outputId": "63fdbf91-9388-406c-d097-bffc892187ff"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" COUNTRY AVG_WEEKLY_WORKING_HOURS MEAN_NET_INCOME \\\n",
"0 Finland 37.2 27389.0 \n",
"1 Iceland 39.4 43769.0 \n",
"2 Norway 34.0 42203.0 \n",
"3 Austria 36.7 27804.0 \n",
"4 Switzerland 35.3 49607.0 \n",
"\n",
" AVG_JOB_SATISFACTION WORK_PRESSURE COST_OF_LIVING_INDEX \\\n",
"0 8.1 0.43 1.42 \n",
"1 8.1 0.57 2.26 \n",
"2 8.1 0.22 2.18 \n",
"3 8.0 0.39 1.44 \n",
"4 7.9 0.30 2.56 \n",
"\n",
" FINANCIAL_COMFORT WORK_LIFE_BALANCE STRESS_LEVEL \\\n",
"0 19.29 0.59 0.37 \n",
"1 19.37 0.42 0.62 \n",
"2 19.36 0.81 0.15 \n",
"3 19.31 0.69 0.37 \n",
"4 19.38 0.69 0.21 \n",
"\n",
" SATISFACTION_DRIVER_SCORE \n",
"0 6.01 \n",
"1 5.90 \n",
"2 6.18 \n",
"3 6.06 \n",
"4 6.12 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" COUNTRY | \n",
" AVG_WEEKLY_WORKING_HOURS | \n",
" MEAN_NET_INCOME | \n",
" AVG_JOB_SATISFACTION | \n",
" WORK_PRESSURE | \n",
" COST_OF_LIVING_INDEX | \n",
" FINANCIAL_COMFORT | \n",
" WORK_LIFE_BALANCE | \n",
" STRESS_LEVEL | \n",
" SATISFACTION_DRIVER_SCORE | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Finland | \n",
" 37.2 | \n",
" 27389.0 | \n",
" 8.1 | \n",
" 0.43 | \n",
" 1.42 | \n",
" 19.29 | \n",
" 0.59 | \n",
" 0.37 | \n",
" 6.01 | \n",
"
\n",
" \n",
" | 1 | \n",
" Iceland | \n",
" 39.4 | \n",
" 43769.0 | \n",
" 8.1 | \n",
" 0.57 | \n",
" 2.26 | \n",
" 19.37 | \n",
" 0.42 | \n",
" 0.62 | \n",
" 5.90 | \n",
"
\n",
" \n",
" | 2 | \n",
" Norway | \n",
" 34.0 | \n",
" 42203.0 | \n",
" 8.1 | \n",
" 0.22 | \n",
" 2.18 | \n",
" 19.36 | \n",
" 0.81 | \n",
" 0.15 | \n",
" 6.18 | \n",
"
\n",
" \n",
" | 3 | \n",
" Austria | \n",
" 36.7 | \n",
" 27804.0 | \n",
" 8.0 | \n",
" 0.39 | \n",
" 1.44 | \n",
" 19.31 | \n",
" 0.69 | \n",
" 0.37 | \n",
" 6.06 | \n",
"
\n",
" \n",
" | 4 | \n",
" Switzerland | \n",
" 35.3 | \n",
" 49607.0 | \n",
" 7.9 | \n",
" 0.30 | \n",
" 2.56 | \n",
" 19.38 | \n",
" 0.69 | \n",
" 0.21 | \n",
" 6.12 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "final_df",
"summary": "{\n \"name\": \"final_df\",\n \"rows\": 132,\n \"fields\": [\n {\n \"column\": \"COUNTRY\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 132,\n \"samples\": [\n \"Denmark_synthetic_3\",\n \"Spain_synthetic_3\",\n \"Poland\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_WEEKLY_WORKING_HOURS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.0439629980154277,\n \"min\": 29.5,\n \"max\": 46.7,\n \"num_unique_values\": 70,\n \"samples\": [\n 38.9,\n 37.2,\n 41.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEAN_NET_INCOME\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12961.860338117047,\n \"min\": 3040.0,\n \"max\": 55265.56,\n \"num_unique_values\": 124,\n \"samples\": [\n 40075.0,\n 26627.68,\n 44030.14\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AVG_JOB_SATISFACTION\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6194006639584302,\n \"min\": 5.6,\n \"max\": 8.8,\n \"num_unique_values\": 30,\n \"samples\": [\n 5.6,\n 8.7,\n 6.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_PRESSURE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2003536515873169,\n \"min\": -0.08,\n \"max\": 1.05,\n \"num_unique_values\": 57,\n \"samples\": [\n 0.43,\n 0.38,\n 0.37\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"COST_OF_LIVING_INDEX\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.66942505912527,\n \"min\": 0.16,\n \"max\": 2.86,\n \"num_unique_values\": 96,\n \"samples\": [\n 0.8,\n 2.17,\n 0.84\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"FINANCIAL_COMFORT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1069680783668568,\n \"min\": 19.0,\n \"max\": 19.66,\n \"num_unique_values\": 41,\n \"samples\": [\n 19.17,\n 19.43,\n 19.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORK_LIFE_BALANCE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.20410584264902806,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 61,\n \"samples\": [\n 0.59,\n 0.67,\n 0.53\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"STRESS_LEVEL\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1990030088918233,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 62,\n \"samples\": [\n 0.47,\n 0.3,\n 0.37\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SATISFACTION_DRIVER_SCORE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14815050318443265,\n \"min\": 5.57,\n \"max\": 6.31,\n \"num_unique_values\": 52,\n \"samples\": [\n 5.98,\n 5.73,\n 5.67\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"source": [
"final_df.to_csv(\"final_dataset.csv\", index=False)"
],
"metadata": {
"id": "f4Yymv0yrVyb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "ieiGGvL_rvir"
},
"execution_count": null,
"outputs": []
}
]
}