{ "cells": [ { "cell_type": "markdown", "id": "4458df13-d0f7-462e-bc80-42169bb1a62b", "metadata": {}, "source": [ "This is a starter notebook for an updated module 5 of ML Zoomcamp\n", "\n", "The code is based on modules 3 and 4, which use the [telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn) dataset. Note that this notebook itself loads a different dataset: course lead scoring (`course_lead_scoring.csv`), with `converted` as the target." ] }, { "cell_type": "code", "execution_count": 1, "id": "a16177e8-cbd2-4088-9bb0-07a0cfb3eee6", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import sklearn" ] }, { "cell_type": "code", "execution_count": 2, "id": "498798c7-1848-47f0-9789-5881ae3658bd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pandas==2.3.1\n", "numpy==2.3.1\n", "sklearn==1.7.0\n" ] } ], "source": [ "print(f'pandas=={pd.__version__}')\n", "print(f'numpy=={np.__version__}')\n", "print(f'sklearn=={sklearn.__version__}')" ] }, { "cell_type": "code", "execution_count": 4, "id": "e9e9464c-d8ed-45ea-9e8c-70e6d73842f7", "metadata": {}, "outputs": [], "source": [ "# Import the necessary libraries\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.feature_extraction import DictVectorizer" ] }, { "cell_type": "code", "execution_count": 8, "id": "54ff5e16-47a9-43ab-975b-37605ee75d19", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lead_sourceindustrynumber_of_courses_viewedannual_incomeemployment_statuslocationinteraction_countlead_scoreconverted
0paid_adsNaN179450.0unemployedsouth_america40.941
1social_mediaretail146992.0employedsouth_america10.800
2eventshealthcare578796.0unemployedaustralia30.691
3paid_adsretail283843.0NaNaustralia10.870
4referraleducation385012.0self_employedeurope30.621
\n", "
" ], "text/plain": [ " lead_source industry number_of_courses_viewed annual_income \\\n", "0 paid_ads NaN 1 79450.0 \n", "1 social_media retail 1 46992.0 \n", "2 events healthcare 5 78796.0 \n", "3 paid_ads retail 2 83843.0 \n", "4 referral education 3 85012.0 \n", "\n", " employment_status location interaction_count lead_score converted \n", "0 unemployed south_america 4 0.94 1 \n", "1 employed south_america 1 0.80 0 \n", "2 unemployed australia 3 0.69 1 \n", "3 NaN australia 1 0.87 0 \n", "4 self_employed europe 3 0.62 1 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the data\n", "data_url = \"https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv\"\n", "df = pd.read_csv(data_url)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 11, "id": "963e0b2c-5d60-4d8a-a216-00cb869d516d", "metadata": {}, "outputs": [], "source": [ "# the target variable\n", "y_train = df.converted" ] }, { "cell_type": "code", "execution_count": 14, "id": "692ae989-fb9a-4219-9a01-18424176748d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('dictvectorizer', DictVectorizer()),\n",
       "                ('logisticregression', LogisticRegression(solver='liblinear'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('dictvectorizer', DictVectorizer()),\n", " ('logisticregression', LogisticRegression(solver='liblinear'))])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preprocessing using DictVectorizer and Training the model \n", "categorical = ['lead_source']\n", "numeric = ['number_of_courses_viewed', 'annual_income']\n", "\n", "df[categorical] = df[categorical].fillna('NA')\n", "df[numeric] = df[numeric].fillna(0)\n", "\n", "train_dict = df[categorical + numeric].to_dict(orient='records')\n", "\n", "pipeline = make_pipeline(\n", " DictVectorizer(),\n", " LogisticRegression(solver='liblinear')\n", ")\n", "\n", "pipeline.fit(train_dict, y_train)" ] }, { "cell_type": "code", "execution_count": 15, "id": "80f2002c-433b-4e77-9df7-965839859d4a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'lead_source': 'paid_ads',\n", " 'number_of_courses_viewed': 1,\n", " 'annual_income': 79450.0}" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_dict[0]" ] }, { "cell_type": "code", "execution_count": 21, "id": "7bbf2adb-11c4-4853-8f1b-fd22b5cf09b2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "number_of_courses_viewed\n", "1 417\n", "2 388\n", "3 269\n", "0 181\n", "4 109\n", "5 67\n", "6 22\n", "7 6\n", "8 2\n", "9 1\n", "Name: count, dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.number_of_courses_viewed.value_counts()" ] }, { "cell_type": "code", "execution_count": 26, "id": "5a613b8d-47bb-4e5a-8b80-117b49221d6c", "metadata": {}, "outputs": [], "source": [ "# sample customer data\n", "customer = {\n", " 'lead_source': 'organic_search',\n", " 'number_of_courses_viewed': 3,\n", " 'annual_income': 50450.0}" ] }, { "cell_type": "code", "execution_count": 28, "id": "b91d20df-46a2-4580-9de0-f17d5bdc7f65", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"np.float64(0.6644010536277872)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# probability of this customer to get converted\n", "pipeline.predict_proba(customer)[0, 1] " ] }, { "cell_type": "code", "execution_count": 29, "id": "96a4d3ac-d5e4-4890-a085-00298c231e28", "metadata": {}, "outputs": [], "source": [ "# save the model\n", "import pickle\n", "\n", "with open('model.bin', 'wb') as f:\n", " pickle.dump(pipeline, f)" ] }, { "cell_type": "code", "execution_count": 31, "id": "7f99bdbb-1304-49e1-9f6f-fdc1fdcdba54", "metadata": {}, "outputs": [], "source": [ "# load the model\n", "\n", "with open('model.bin', 'rb') as f_in:\n", " model = pickle.load(f_in)" ] }, { "cell_type": "code", "execution_count": 32, "id": "0ac0af36-e4e8-475f-896d-645a63877aff", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('dictvectorizer', DictVectorizer()),\n",
       "                ('logisticregression', LogisticRegression(solver='liblinear'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('dictvectorizer', DictVectorizer()),\n", " ('logisticregression', LogisticRegression(solver='liblinear'))])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model" ] }, { "cell_type": "code", "execution_count": null, "id": "e4452cb3-f563-430c-ae69-09e2a5e24475", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "8720b7f9-438b-436e-8151-b6b4e64850bd", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.1" } }, "nbformat": 4, "nbformat_minor": 5 }