# -*- coding: utf-8 -*-
"""model.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1hA9Xz_VjzMVj66qS_j3A5dKcGkAfScKM

Walk-through of a two-point linear life-expectancy model: first on a
fictitious data set, then on the WHO longevity data for one country/sex.
"""

# Notebook-only shell command, kept for reference. As a bare statement it is
# a SyntaxError in a plain .py file, so run it from a shell/notebook instead:
#   pip install pycaret

import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

# create fictitious data set
simple_life_dataset = pd.DataFrame({'Age': [0, 60], 'Life Expectancy': [90, 30]})
print(simple_life_dataset.head())

# fit a straight line through the two fictitious points
slope, intercept, r_value, p_value, std_err = stats.linregress(
    simple_life_dataset['Age'],
    simple_life_dataset['Life Expectancy'],
)
print('intercept: ', intercept)
print('slope: ', slope)

# value of the fitted line at age 20 (a notebook would display this
# implicitly; a script has to print it)
print(np.ceil(slope * 20 + intercept))

fig, axes = plt.subplots(figsize=(5, 5))
x = [0, 20, 60]
y = [90, 70, 30]
axes.plot(x, y, color='blue', linestyle='--', marker='o')
fig.suptitle('Life Expectancy')
axes.set_xlabel('age')
axes.set_xlim([-5, 100])
axes.set_ylabel('life_expectancy')
axes.set_ylim([0, 100])
plt.grid()
plt.show()

# load WHO longevity data
# http://apps.who.int/gho/data/node.main.688
who_list = pd.read_csv('/content/drive/MyDrive/WHOSIS_000001,WHOSIS_000015.csv')

# save a local copy of the data set for our Flask prototype later on
who_list.to_csv('WHOSIS_000001,WHOSIS_000015.csv')

# Keep only the useful features. The explicit .copy() makes who_list an
# independent frame so the column assignment below cannot trigger
# SettingWithCopyWarning.
who_list = who_list[[
    'GHO (DISPLAY)',
    'YEAR (CODE)',
    'COUNTRY (DISPLAY)',
    'SEX (DISPLAY)',
    'Numeric',
]].copy()

# Fix the display case of the country text. The vectorised .str accessor
# leaves missing values as NaN instead of raising AttributeError on them.
who_list['COUNTRY (DISPLAY)'] = who_list['COUNTRY (DISPLAY)'].str.title()

# print a few rows
print(who_list[who_list['COUNTRY (DISPLAY)'] == 'France'].head(10))

country = 'United States Of America'
sex = 'Male'

# pull latest entries for birth and 60 years for a country and gender
sub_set = who_list[who_list['COUNTRY (DISPLAY)'].str.startswith(country, na=False)]
sub_set = sub_set[sub_set['SEX (DISPLAY)'] == sex]

# sort by year in descending order to work with the latest read
sub_set = sub_set.sort_values('YEAR (CODE)', ascending=False)
# Split the selected country/sex rows by indicator; sub_set is already sorted
# newest-year-first, so row 0 of each frame is the latest reading.
sub_set_birth = sub_set[sub_set['GHO (DISPLAY)'] == 'Life expectancy at birth (years)']
sub_set_60 = sub_set[sub_set['GHO (DISPLAY)'] == 'Life expectancy at age 60 (years)']

print('sub_set_birth:')
print(sub_set_birth.head(5))
print('sub_set_60:')
print(sub_set_60.head(5))

# create data set with both points as shown in first example
lf_at_birth = sub_set_birth['Numeric'].values[0]
lf_at_60 = sub_set_60['Numeric'].values[0]

# let's organize our data and plot
age = [0, 60]
life_expectancy = [lf_at_birth, lf_at_60]
fig, axes = plt.subplots(figsize=(5, 5))
x = age
y = life_expectancy
axes.plot(x, y, color='blue', linestyle='--', marker='o')
fig.suptitle('Life Expectancy')
axes.set_xlabel('age')
axes.set_xlim([-5, 100])
axes.set_ylabel('life expectancy')
axes.set_ylim([0, 100])
plt.grid()
plt.show()

# model
slope, intercept, r_value, p_value, std_err = stats.linregress(age, life_expectancy)
print('intercept: ', intercept)
print('slope: ', slope)

# predict life expectancy for a 49-year-old male in the USA
# (printed explicitly: a bare expression is discarded outside a notebook)
print(np.ceil(slope * 49 + intercept))


def get_life_expectancy(age, country, sex):
    """Estimate life expectancy at ``age`` from the two latest WHO readings.

    A straight line is fitted through the most recent "at birth" value
    (placed at age 0) and the most recent "at age 60" value (placed at
    age 60) found in the module-level ``who_list`` frame, then evaluated
    at ``age`` and rounded up.

    Args:
        age: Age at which to evaluate the fitted line.
        country: Country name as it appears in ``COUNTRY (DISPLAY)``. An
            exact match is preferred; if none exists, rows whose country
            starts with this text are used instead.
        sex: Value to match against ``SEX (DISPLAY)``.

    Returns:
        ``np.ceil`` of the fitted line at ``age``, or ``None`` when either
        of the two indicators is missing for the requested country/sex.
    """
    countries = who_list['COUNTRY (DISPLAY)']

    # Prefer an exact country match: a bare prefix test would let 'Niger'
    # also select 'Nigeria' rows and silently return the wrong country's
    # figures. Prefix matching is kept as a fallback so partial names that
    # worked before (e.g. 'United States') still resolve.
    country_mask = countries == country
    if not country_mask.any():
        country_mask = countries.str.startswith(country, na=False)

    sub_set = who_list[country_mask]
    sub_set = sub_set[sub_set['SEX (DISPLAY)'] == sex]
    # newest year first, so index 0 below is the latest reading
    sub_set = sub_set.sort_values('YEAR (CODE)', ascending=False)

    sub_set_birth = sub_set[sub_set['GHO (DISPLAY)'] == 'Life expectancy at birth (years)']
    sub_set_60 = sub_set[sub_set['GHO (DISPLAY)'] == 'Life expectancy at age 60 (years)']

    # not all combinations exist, so check that we have data for both
    if sub_set_birth.empty or sub_set_60.empty:
        return None

    # create data set with both points as shown in first example
    lf_at_birth = sub_set_birth['Numeric'].values[0]
    lf_at_60 = sub_set_60['Numeric'].values[0]

    # model: only slope and intercept of the regression result are needed
    slope, intercept, *_ = stats.linregress([0, 60], [lf_at_birth, lf_at_60])

    # predict for the age variable
    return np.ceil(slope * age + intercept)
# Show a sample of the distinct country names. Set ordering is arbitrary, so
# the ten names shown can differ between runs.
print(list(set(who_list['COUNTRY (DISPLAY)']))[0:10])

# test the function out using a 22-year-old Japanese female
# (results are printed: bare expressions are discarded outside a notebook)
print(get_life_expectancy(22, 'Japan', 'Female'))
print(get_life_expectancy(22, 'Pakistan', 'Female'))
print(get_life_expectancy(21, 'India', 'Male'))

# report how many values are missing in each column
missing_values_count = who_list.isnull().sum()
print(missing_values_count)

print(get_life_expectancy(80, 'Pakistan', 'Female'))

# NOTE(review): pickle serialises a function by reference (its module and
# qualified name), not its code or the who_list data it reads. model.pkl can
# therefore only be loaded by a process that already defines
# get_life_expectancy under the same module path and has who_list in scope.
# Confirm the Flask prototype satisfies this.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(get_life_expectancy, model_file)

# Round-trip check. Context managers guarantee the handles are flushed and
# closed instead of relying on garbage collection.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)