#!/usr/bin/env python
# coding: utf-8

# This is a starter notebook for an updated module 5 of ML Zoomcamp
#
# The code is based on the modules 3 and 4.
#
# NOTE: this header originally said "We use the same dataset:
# [telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)",
# but the code below actually loads the course lead scoring CSV and
# predicts its `converted` column.

# Import the necessary libraries
import pickle

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

print(f'pandas=={pd.__version__}')
print(f'numpy=={np.__version__}')
print(f'sklearn=={sklearn.__version__}')


# Default location of the training data (CSV fetched over HTTPS).
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"


# Load the data
def load_data(data_url: str = DATA_URL) -> pd.DataFrame:
    """Read the training CSV into a DataFrame.

    Args:
        data_url: Path or URL handed to ``pd.read_csv``. Defaults to the
            course lead scoring CSV.

    Returns:
        The parsed DataFrame, exactly as returned by ``pd.read_csv``.
    """
    return pd.read_csv(data_url)


def train_model(df: pd.DataFrame):
    """Fit a DictVectorizer + LogisticRegression pipeline on ``df``.

    Missing values are replaced with ``'NA'`` in the categorical column and
    with ``0`` in the numeric columns before the rows are converted to
    dictionaries and passed to the pipeline.

    Args:
        df: DataFrame containing the columns ``lead_source``,
            ``number_of_courses_viewed``, ``annual_income`` and the target
            column ``converted``. It is not modified.

    Returns:
        The fitted scikit-learn pipeline.
    """
    # Preprocessing using DictVectorizer and training the Logistic Regression model
    categorical = ['lead_source']
    numeric = ['number_of_courses_viewed', 'annual_income']

    # Fill missing values on a copy of the feature columns so the caller's
    # DataFrame is left untouched.
    features = df[categorical + numeric].copy()
    features[categorical] = features[categorical].fillna('NA')
    features[numeric] = features[numeric].fillna(0)

    train_dict = features.to_dict(orient='records')

    pipeline = make_pipeline(
        DictVectorizer(),
        LogisticRegression(solver='liblinear'),
    )

    # the target variable
    y_train = df.converted

    pipeline.fit(train_dict, y_train)
    return pipeline


def save_model(filename, model) -> None:
    """Pickle ``model`` to ``filename`` and print a confirmation message.

    Args:
        filename: Destination path; opened in binary write mode, so an
            existing file at that path is overwritten.
        model: Object to serialize with ``pickle.dump``.
    """
    with open(filename, 'wb') as f_out:
        pickle.dump(model, f_out)

    print(f"Model saved to {filename}")


# Script body: download the data, train, and persist the model.
# These statements run at module level (on import as well as when the file
# is executed directly).
df = load_data()
pipeline = train_model(df)
save_model('model.bin', pipeline)