{
"cells": [
{
"cell_type": "markdown",
"id": "4458df13-d0f7-462e-bc80-42169bb1a62b",
"metadata": {},
"source": [
"This is a starter notebook for an updated module 5 of ML Zoomcamp\n",
"\n",
"The code is based on the modules 3 and 4. We use the same dataset: [telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a16177e8-cbd2-4088-9bb0-07a0cfb3eee6",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import sklearn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "498798c7-1848-47f0-9789-5881ae3658bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"pandas==2.3.1\n",
"numpy==2.3.1\n",
"sklearn==1.7.0\n"
]
}
],
"source": [
"print(f'pandas=={pd.__version__}')\n",
"print(f'numpy=={np.__version__}')\n",
"print(f'sklearn=={sklearn.__version__}')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e9e9464c-d8ed-45ea-9e8c-70e6d73842f7",
"metadata": {},
"outputs": [],
"source": [
"# Import the necessary libraries\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.feature_extraction import DictVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "54ff5e16-47a9-43ab-975b-37605ee75d19",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" lead_source \n",
" industry \n",
" number_of_courses_viewed \n",
" annual_income \n",
" employment_status \n",
" location \n",
" interaction_count \n",
" lead_score \n",
" converted \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" paid_ads \n",
" NaN \n",
" 1 \n",
" 79450.0 \n",
" unemployed \n",
" south_america \n",
" 4 \n",
" 0.94 \n",
" 1 \n",
" \n",
" \n",
" 1 \n",
" social_media \n",
" retail \n",
" 1 \n",
" 46992.0 \n",
" employed \n",
" south_america \n",
" 1 \n",
" 0.80 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" events \n",
" healthcare \n",
" 5 \n",
" 78796.0 \n",
" unemployed \n",
" australia \n",
" 3 \n",
" 0.69 \n",
" 1 \n",
" \n",
" \n",
" 3 \n",
" paid_ads \n",
" retail \n",
" 2 \n",
" 83843.0 \n",
" NaN \n",
" australia \n",
" 1 \n",
" 0.87 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" referral \n",
" education \n",
" 3 \n",
" 85012.0 \n",
" self_employed \n",
" europe \n",
" 3 \n",
" 0.62 \n",
" 1 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" lead_source industry number_of_courses_viewed annual_income \\\n",
"0 paid_ads NaN 1 79450.0 \n",
"1 social_media retail 1 46992.0 \n",
"2 events healthcare 5 78796.0 \n",
"3 paid_ads retail 2 83843.0 \n",
"4 referral education 3 85012.0 \n",
"\n",
" employment_status location interaction_count lead_score converted \n",
"0 unemployed south_america 4 0.94 1 \n",
"1 employed south_america 1 0.80 0 \n",
"2 unemployed australia 3 0.69 1 \n",
"3 NaN australia 1 0.87 0 \n",
"4 self_employed europe 3 0.62 1 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the data\n",
"data_url = \"https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv\"\n",
"df = pd.read_csv(data_url)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "963e0b2c-5d60-4d8a-a216-00cb869d516d",
"metadata": {},
"outputs": [],
"source": [
"# the target variable\n",
"y_train = df.converted"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "692ae989-fb9a-4219-9a01-18424176748d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Pipeline(steps=[('dictvectorizer', DictVectorizer()),\n",
" ('logisticregression', LogisticRegression(solver='liblinear'))]) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. \n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" steps \n",
" [('dictvectorizer', ...), ('logisticregression', ...)] \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" transform_input \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" memory \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" verbose \n",
" False \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" dtype \n",
" <class 'numpy.float64'> \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" separator \n",
" '=' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" sparse \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" sort \n",
" True \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" penalty \n",
" 'l2' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" dual \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" tol \n",
" 0.0001 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" C \n",
" 1.0 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" fit_intercept \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" intercept_scaling \n",
" 1 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" class_weight \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" random_state \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" solver \n",
" 'liblinear' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" max_iter \n",
" 100 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" multi_class \n",
" 'deprecated' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" verbose \n",
" 0 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" warm_start \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" n_jobs \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" l1_ratio \n",
" None \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Pipeline(steps=[('dictvectorizer', DictVectorizer()),\n",
" ('logisticregression', LogisticRegression(solver='liblinear'))])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preprocessing using DictVectorizer and Training the model \n",
"categorical = ['lead_source']\n",
"numeric = ['number_of_courses_viewed', 'annual_income']\n",
"\n",
"df[categorical] = df[categorical].fillna('NA')\n",
"df[numeric] = df[numeric].fillna(0)\n",
"\n",
"train_dict = df[categorical + numeric].to_dict(orient='records')\n",
"\n",
"pipeline = make_pipeline(\n",
" DictVectorizer(),\n",
" LogisticRegression(solver='liblinear')\n",
")\n",
"\n",
"pipeline.fit(train_dict, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "80f2002c-433b-4e77-9df7-965839859d4a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'lead_source': 'paid_ads',\n",
" 'number_of_courses_viewed': 1,\n",
" 'annual_income': 79450.0}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dict[0]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7bbf2adb-11c4-4853-8f1b-fd22b5cf09b2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"number_of_courses_viewed\n",
"1 417\n",
"2 388\n",
"3 269\n",
"0 181\n",
"4 109\n",
"5 67\n",
"6 22\n",
"7 6\n",
"8 2\n",
"9 1\n",
"Name: count, dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.number_of_courses_viewed.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "5a613b8d-47bb-4e5a-8b80-117b49221d6c",
"metadata": {},
"outputs": [],
"source": [
"# sample customer data\n",
"customer = {\n",
" 'lead_source': 'organic_search',\n",
" 'number_of_courses_viewed': 3,\n",
" 'annual_income': 50450.0}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "b91d20df-46a2-4580-9de0-f17d5bdc7f65",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(0.6644010536277872)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# probability of this customer to get converted\n",
"pipeline.predict_proba(customer)[0, 1] "
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "96a4d3ac-d5e4-4890-a085-00298c231e28",
"metadata": {},
"outputs": [],
"source": [
"# save the model\n",
"import pickle\n",
"\n",
"with open('model.bin', 'wb') as f:\n",
" pickle.dump(pipeline, f)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "7f99bdbb-1304-49e1-9f6f-fdc1fdcdba54",
"metadata": {},
"outputs": [],
"source": [
"# load the model\n",
"\n",
"with open('model.bin', 'rb') as f_in:\n",
" model = pickle.load(f_in)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "0ac0af36-e4e8-475f-896d-645a63877aff",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Pipeline(steps=[('dictvectorizer', DictVectorizer()),\n",
" ('logisticregression', LogisticRegression(solver='liblinear'))]) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. \n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" steps \n",
" [('dictvectorizer', ...), ('logisticregression', ...)] \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" transform_input \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" memory \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" verbose \n",
" False \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" dtype \n",
" <class 'numpy.float64'> \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" separator \n",
" '=' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" sparse \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" sort \n",
" True \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" penalty \n",
" 'l2' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" dual \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" tol \n",
" 0.0001 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" C \n",
" 1.0 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" fit_intercept \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" intercept_scaling \n",
" 1 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" class_weight \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" random_state \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" solver \n",
" 'liblinear' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" max_iter \n",
" 100 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" multi_class \n",
" 'deprecated' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" verbose \n",
" 0 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" warm_start \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" n_jobs \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" l1_ratio \n",
" None \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Pipeline(steps=[('dictvectorizer', DictVectorizer()),\n",
" ('logisticregression', LogisticRegression(solver='liblinear'))])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4452cb3-f563-430c-ae69-09e2a5e24475",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "8720b7f9-438b-436e-8151-b6b4e64850bd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}