{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyM6lcWDIRzwQ5fcw7a7TiiZ",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cMIjQEwLJPKQ",
"outputId": "c3c64dff-48e9-4dab-e007-fc9cf25753e9"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'The-Machine-Learning-Workshop'...\n",
"remote: Enumerating objects: 805, done.\u001b[K\n",
"remote: Counting objects: 100% (23/23), done.\u001b[K\n",
"remote: Compressing objects: 100% (15/15), done.\u001b[K\n",
"remote: Total 805 (delta 15), reused 8 (delta 8), pack-reused 782 (from 1)\u001b[K\n",
"Receiving objects: 100% (805/805), 10.36 MiB | 9.64 MiB/s, done.\n",
"Resolving deltas: 100% (293/293), done.\n"
]
}
],
"source": [
"!git clone https://github.com/MsSaidat25/The-Machine-Learning-Workshop.git"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"os.chdir('/content/The-Machine-Learning-Workshop')\n",
"!ls # see all folders/files"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fkumve32Jj-w",
"outputId": "f5f0c473-b4c3-4e25-b4e9-23db465582c1"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Chapter01 Chapter03 Chapter05 Graphics README.md\n",
"Chapter02 Chapter04 Chapter06 LICENSE requirements.txt\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8a5d3702",
"outputId": "312c313b-b695-48a4-e9f6-cad0ad63f4f9"
},
"source": [
"import os\n",
"os.chdir('/content/The-Machine-Learning-Workshop/Chapter01')\n",
"!ls"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Activity1.01 Exercise1.01 Exercise1.03\n",
"Activity1.02 Exercise1.02 Exercise1.04\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "99014702",
"outputId": "a7833725-7b7b-4655-8a28-5f0898678ede"
},
"source": [
"import json\n",
"\n",
"notebook_path = '/content/The-Machine-Learning-Workshop/Chapter01/Activity1.01/Activity1_01.ipynb'\n",
"\n",
"with open(notebook_path, 'r') as f:\n",
" notebook_content = json.load(f)\n",
"\n",
"cells_to_generate = []\n",
"for cell in notebook_content['cells']:\n",
" if cell['cell_type'] == 'code':\n",
" cells_to_generate.append({'cell_type': 'python', 'code': ''.join(cell['source'])})\n",
" elif cell['cell_type'] == 'markdown':\n",
" cells_to_generate.append({'cell_type': 'markdown', 'code': ''.join(cell['source'])})\n",
"\n",
"# This list will be used by the next command to generate the actual cells.\n",
"# For now, I will just print the first few cells to confirm the parsing.\n",
"print(f\"Found {len(cells_to_generate)} cells in the notebook. Preview of the first cell:\\n{cells_to_generate[0]['code'] if cells_to_generate else 'No cells found.'}\")"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Found 5 cells in the notebook. Preview of the first cell:\n",
"import seaborn as sns\n",
"titanic = sns.load_dataset('titanic')\n",
"titanic.head(10)\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "12666191",
"outputId": "cb746e2f-c18d-4518-88c9-d6e3a401b318"
},
"source": [
"import os\n",
"os.chdir('/content/The-Machine-Learning-Workshop/Chapter01/Activity1.01')\n",
"!ls"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Activity1_01.ipynb titanic.csv unit_test_activity1_01.ipynb\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 383
},
"id": "1b2938e2",
"outputId": "50844bb9-e0d9-48bf-a6ae-79e9aa0a0c8c"
},
"source": [
"import seaborn as sns\n",
"titanic = sns.load_dataset('titanic')\n",
"titanic.head(10)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" survived pclass sex age sibsp parch fare embarked class \\\n",
"0 0 3 male 22.0 1 0 7.2500 S Third \n",
"1 1 1 female 38.0 1 0 71.2833 C First \n",
"2 1 3 female 26.0 0 0 7.9250 S Third \n",
"3 1 1 female 35.0 1 0 53.1000 S First \n",
"4 0 3 male 35.0 0 0 8.0500 S Third \n",
"5 0 3 male NaN 0 0 8.4583 Q Third \n",
"6 0 1 male 54.0 0 0 51.8625 S First \n",
"7 0 3 male 2.0 3 1 21.0750 S Third \n",
"8 1 3 female 27.0 0 2 11.1333 S Third \n",
"9 1 2 female 14.0 1 0 30.0708 C Second \n",
"\n",
" who adult_male deck embark_town alive alone \n",
"0 man True NaN Southampton no False \n",
"1 woman False C Cherbourg yes False \n",
"2 woman False NaN Southampton yes True \n",
"3 woman False C Southampton yes False \n",
"4 man True NaN Southampton no True \n",
"5 man True NaN Queenstown no True \n",
"6 man True E Southampton no True \n",
"7 child False NaN Southampton no False \n",
"8 woman False NaN Southampton yes False \n",
"9 child False NaN Cherbourg yes False "
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" survived | \n",
" pclass | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked | \n",
" class | \n",
" who | \n",
" adult_male | \n",
" deck | \n",
" embark_town | \n",
" alive | \n",
" alone | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 3 | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" 7.2500 | \n",
" S | \n",
" Third | \n",
" man | \n",
" True | \n",
" NaN | \n",
" Southampton | \n",
" no | \n",
" False | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 1 | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" 71.2833 | \n",
" C | \n",
" First | \n",
" woman | \n",
" False | \n",
" C | \n",
" Cherbourg | \n",
" yes | \n",
" False | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 3 | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.9250 | \n",
" S | \n",
" Third | \n",
" woman | \n",
" False | \n",
" NaN | \n",
" Southampton | \n",
" yes | \n",
" True | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 1 | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 53.1000 | \n",
" S | \n",
" First | \n",
" woman | \n",
" False | \n",
" C | \n",
" Southampton | \n",
" yes | \n",
" False | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 3 | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 8.0500 | \n",
" S | \n",
" Third | \n",
" man | \n",
" True | \n",
" NaN | \n",
" Southampton | \n",
" no | \n",
" True | \n",
"
\n",
" \n",
" | 5 | \n",
" 0 | \n",
" 3 | \n",
" male | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 8.4583 | \n",
" Q | \n",
" Third | \n",
" man | \n",
" True | \n",
" NaN | \n",
" Queenstown | \n",
" no | \n",
" True | \n",
"
\n",
" \n",
" | 6 | \n",
" 0 | \n",
" 1 | \n",
" male | \n",
" 54.0 | \n",
" 0 | \n",
" 0 | \n",
" 51.8625 | \n",
" S | \n",
" First | \n",
" man | \n",
" True | \n",
" E | \n",
" Southampton | \n",
" no | \n",
" True | \n",
"
\n",
" \n",
" | 7 | \n",
" 0 | \n",
" 3 | \n",
" male | \n",
" 2.0 | \n",
" 3 | \n",
" 1 | \n",
" 21.0750 | \n",
" S | \n",
" Third | \n",
" child | \n",
" False | \n",
" NaN | \n",
" Southampton | \n",
" no | \n",
" False | \n",
"
\n",
" \n",
" | 8 | \n",
" 1 | \n",
" 3 | \n",
" female | \n",
" 27.0 | \n",
" 0 | \n",
" 2 | \n",
" 11.1333 | \n",
" S | \n",
" Third | \n",
" woman | \n",
" False | \n",
" NaN | \n",
" Southampton | \n",
" yes | \n",
" False | \n",
"
\n",
" \n",
" | 9 | \n",
" 1 | \n",
" 2 | \n",
" female | \n",
" 14.0 | \n",
" 1 | \n",
" 0 | \n",
" 30.0708 | \n",
" C | \n",
" Second | \n",
" child | \n",
" False | \n",
" NaN | \n",
" Cherbourg | \n",
" yes | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "titanic",
"summary": "{\n \"name\": \"titanic\",\n \"rows\": 891,\n \"fields\": [\n {\n \"column\": \"survived\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pclass\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 3,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"female\",\n \"male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.526497332334044,\n \"min\": 0.42,\n \"max\": 80.0,\n \"num_unique_values\": 88,\n \"samples\": [\n 0.75,\n 22.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sibsp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 8,\n \"num_unique_values\": 7,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"parch\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 6,\n \"num_unique_values\": 7,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 49.693428597180905,\n \"min\": 0.0,\n \"max\": 512.3292,\n \"num_unique_values\": 248,\n \"samples\": [\n 11.2417,\n 51.8625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"embarked\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"S\",\n \"C\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"class\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Third\",\n \"First\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"who\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"man\",\n \"woman\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"adult_male\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n false,\n true\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"deck\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"C\",\n \"E\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"embark_town\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Southampton\",\n \"Cherbourg\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"alive\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"yes\",\n \"no\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"alone\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n true,\n false\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "bd9b06d2",
"outputId": "b51231af-c764-4e73-bcc1-56a31c16da8b"
},
"source": [
"X = titanic[['sex', 'age', 'fare', 'class', 'embark_town', 'alone']]\n",
"display(X.head())"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
" sex age fare class embark_town alone\n",
"0 male 22.0 7.2500 Third Southampton False\n",
"1 female 38.0 71.2833 First Cherbourg False\n",
"2 female 26.0 7.9250 Third Southampton True\n",
"3 female 35.0 53.1000 First Southampton False\n",
"4 male 35.0 8.0500 Third Southampton True"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" age | \n",
" fare | \n",
" class | \n",
" embark_town | \n",
" alone | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" male | \n",
" 22.0 | \n",
" 7.2500 | \n",
" Third | \n",
" Southampton | \n",
" False | \n",
"
\n",
" \n",
" | 1 | \n",
" female | \n",
" 38.0 | \n",
" 71.2833 | \n",
" First | \n",
" Cherbourg | \n",
" False | \n",
"
\n",
" \n",
" | 2 | \n",
" female | \n",
" 26.0 | \n",
" 7.9250 | \n",
" Third | \n",
" Southampton | \n",
" True | \n",
"
\n",
" \n",
" | 3 | \n",
" female | \n",
" 35.0 | \n",
" 53.1000 | \n",
" First | \n",
" Southampton | \n",
" False | \n",
"
\n",
" \n",
" | 4 | \n",
" male | \n",
" 35.0 | \n",
" 8.0500 | \n",
" Third | \n",
" Southampton | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"display(X\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"female\",\n \"male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 6.833739825307955,\n \"min\": 22.0,\n \"max\": 38.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 38.0,\n 35.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 30.5100288352535,\n \"min\": 7.25,\n \"max\": 71.2833,\n \"num_unique_values\": 5,\n \"samples\": [\n 71.2833,\n 8.05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"class\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"First\",\n \"Third\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"embark_town\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Cherbourg\",\n \"Southampton\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"alone\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n true,\n false\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "15cb730c",
"outputId": "8f852295-a151-4942-b1d0-c163351a517f"
},
"source": [
"X.shape"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(891, 6)"
]
},
"metadata": {},
"execution_count": 34
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "42d82e7e",
"outputId": "81ccfc58-d19f-4c0d-b329-de45f0706430"
},
"source": [
"Y.shape"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(891,)"
]
},
"metadata": {},
"execution_count": 35
}
]
},
{
"cell_type": "markdown",
"source": [
"Dealing with messy data"
],
"metadata": {
"id": "BN_Y-5xcReHs"
}
},
{
"cell_type": "code",
"source": [
"import seaborn as sns\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"tips = sns.load_dataset('titanic')"
],
"metadata": {
"id": "6V7NzckGO9cR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "c79dedfd"
},
"source": [
"# Task\n",
"Check for and handle missing values and outliers in the features matrix `X`. Then, summarize the findings and the methods used to address them."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "a33d522d"
},
"source": [
"## Check Missing Values\n",
"\n",
"### Subtask:\n",
"Identify and count the number of missing values in each column of the features matrix `X`. This will help us understand the extent of missing data.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ea922db0"
},
"source": [
"**Reasoning**:\n",
"To identify and count missing values in each column of the features matrix `X`, I will use the `.isnull()` method followed by `.sum()` to get the total count for each column. This directly addresses the subtask's instructions.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "d43f1b5d",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b6879fd0-16bf-4fc1-c951-b2cedc66202b"
},
"source": [
"missing_values = X.isnull().sum()\n",
"print(\"Missing values per column in X:\")\n",
"print(missing_values)"
],
"execution_count": 37,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Missing values per column in X:\n",
"sex 0\n",
"age 177\n",
"fare 0\n",
"class 0\n",
"embark_town 2\n",
"alone 0\n",
"dtype: int64\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9dfd8c70"
},
"source": [
"# Task\n",
"Impute missing values in the 'age' column of DataFrame `X` using the median of the 'age' column."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "26083663"
},
"source": [
"## Handle Missing Values in 'age'\n",
"\n",
"### Subtask:\n",
"Impute missing values in the 'age' column of DataFrame `X` using the median of the 'age' column.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6d688010"
},
"source": [
"**Reasoning**:\n",
"To impute the missing values in the 'age' column, I will first calculate its median as specified in the instructions.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "c8c06424",
"outputId": "0b48740b-a9d0-4593-a370-c99d172892fc"
},
"source": [
"median_age = X['age'].median()\n",
"print(f\"Median age: {median_age}\")"
],
"execution_count": 40,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Median age: 28.0\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "40c208e7"
},
"source": [
"**Reasoning**:\n",
"Now that the median age has been calculated, I will use it to fill the missing values in the 'age' column of DataFrame `X`, and then verify the imputation by checking for remaining missing values.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "77b9c2d7",
"outputId": "c63b0165-7ffb-43ca-879f-979f7724b4cc"
},
"source": [
"features = [\"age\", \"fare\"]\n",
"for feature in features:\n",
" min_ = X[feature].mean() - (3 * X[feature].std())\n",
" max_ = X[feature].mean() + (3 * X[feature].std())\n",
" X = X[X[feature] <= max_]\n",
" X = X[X[feature] >= min_]\n",
" print(feature, \":\", X.shape)"
],
"execution_count": 46,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"age : (884, 6)\n",
"fare : (864, 6)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"features = [\"sex\", \"class\", \"embark_town\", \"alone\"]\n",
"for feature in features:\n",
" count_ = X[feature].value_counts()\n",
" print(feature)\n",
" print(count_, \"\\n\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_PFKkCUKW1JE",
"outputId": "4a4c1e72-57a0-4a02-a591-d5a9d8781c33"
},
"execution_count": 47,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"sex\n",
"sex\n",
"male 562\n",
"female 302\n",
"Name: count, dtype: int64 \n",
"\n",
"class\n",
"class\n",
"Third 489\n",
"First 192\n",
"Second 183\n",
"Name: count, dtype: int64 \n",
"\n",
"embark_town\n",
"embark_town\n",
"Southampton 632\n",
"Cherbourg 154\n",
"Queenstown 76\n",
"Name: count, dtype: int64 \n",
"\n",
"alone\n",
"alone\n",
"True 524\n",
"False 340\n",
"Name: count, dtype: int64 \n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"enc = LabelEncoder()\n",
"X[\"sex\"] = enc.fit_transform(X['sex'].astype('str'))\n",
"X[\"class\"] = enc.fit_transform(X['class'].astype('str'))\n",
"X[\"embark_town\"] = enc.fit_transform(X['embark_town'].\\\n",
" astype('str'))\n",
"X[\"alone\"] = enc.fit_transform(X['alone'].astype('str'))\n",
"X.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "AwsqNomXW45N",
"outputId": "990b54f7-d636-423f-8522-73af7a5b9cca"
},
"execution_count": 49,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" sex age fare class embark_town alone\n",
"0 1 22.0 7.2500 2 2 0\n",
"1 0 38.0 71.2833 0 0 0\n",
"2 0 26.0 7.9250 2 2 1\n",
"3 0 35.0 53.1000 0 2 0\n",
"4 1 35.0 8.0500 2 2 1"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" age | \n",
" fare | \n",
" class | \n",
" embark_town | \n",
" alone | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 22.0 | \n",
" 7.2500 | \n",
" 2 | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 38.0 | \n",
" 71.2833 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 26.0 | \n",
" 7.9250 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" 35.0 | \n",
" 53.1000 | \n",
" 0 | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 35.0 | \n",
" 8.0500 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "X",
"summary": "{\n \"name\": \"X\",\n \"rows\": 864,\n \"fields\": [\n {\n \"column\": \"sex\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12.498758947613258,\n \"min\": 0.42,\n \"max\": 66.0,\n \"num_unique_values\": 83,\n \"samples\": [\n 5.0,\n 22.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 29.400192357023762,\n \"min\": 0.0,\n \"max\": 164.8667,\n \"num_unique_values\": 239,\n \"samples\": [\n 7.8958,\n 51.8625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"class\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 2,\n \"num_unique_values\": 3,\n \"samples\": [\n 2,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"embark_town\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 3,\n \"num_unique_values\": 4,\n \"samples\": [\n 0,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"alone\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 49
}
]
}
]
}