File size: 4,566 Bytes
7458c95 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# -*- coding: utf-8 -*-
"""model.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1hA9Xz_VjzMVj66qS_j3A5dKcGkAfScKM
"""
pip install pycaret
from scipy import stats
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
# create fictitious data set
simple_life_dataset = pd.DataFrame({'Age':[0, 60], 'Life Expectancy':[90, 30]})
simple_life_dataset.head()
import numpy as np
from scipy import stats
slope, intercept, r_value, p_value, std_err = stats.linregress(simple_life_dataset['Age'],simple_life_dataset['Life Expectancy'])
print('intercept: ', intercept)
print('slope: ', slope)
np.ceil(slope * 20 + intercept)
fig, axes = plt.subplots(figsize=(5,5))
x = [0,20,60]
y = [90, 70, 30]
axes.plot(x,y, color='blue', linestyle='--', marker='o')
fig.suptitle('Life Expectancy')
axes.set_xlabel('age')
axes.set_xlim([-5,100])
axes.set_ylabel('life_expectancy')
axes.set_ylim([0,100])
plt.grid()
plt.show()
# load WHO longevity data
# http://apps.who.int/gho/data/node.main.688
who_list = pd.read_csv('/content/drive/MyDrive/WHOSIS_000001,WHOSIS_000015.csv')
# save a local copy of the data set for our Flask prototype later on
who_list.to_csv('WHOSIS_000001,WHOSIS_000015.csv')
# Keep only useful features fix case display of country text
who_list = who_list[['GHO (DISPLAY)', 'YEAR (CODE)' , 'COUNTRY (DISPLAY)', 'SEX (DISPLAY)', 'Numeric']]
who_list['COUNTRY (DISPLAY)'] = [ctry.title() for ctry in who_list['COUNTRY (DISPLAY)'].values]
# print a few rows
who_list[who_list['COUNTRY (DISPLAY)']=='France'].head(10)
country = 'United States Of America'
sex = 'Male'
# pull latest entries for birth and 60 years for a country and gender
sub_set = who_list[who_list['COUNTRY (DISPLAY)'].str.startswith(country, na=False)]
sub_set = sub_set[sub_set['SEX (DISPLAY)'] == sex]
# sort by year in descending order to work with the latest read
sub_set = sub_set.sort_values('YEAR (CODE)', ascending=False)
sub_set_birth = sub_set[sub_set['GHO (DISPLAY)'] == 'Life expectancy at birth (years)']
sub_set_60 = sub_set[sub_set['GHO (DISPLAY)'] == 'Life expectancy at age 60 (years)']
print('sub_set_birth:')
print(sub_set_birth.head(5))
print('sub_set_60:')
print(sub_set_60.head(5))
# create data set with both points as shown in first example
lf_at_birth = sub_set_birth['Numeric'].values[0]
lf_at_60 = sub_set_60['Numeric'].values[0]
# let's organize our data and plot
age = [0,60]
life_expectancy = [lf_at_birth, lf_at_60]
fig, axes = plt.subplots(figsize=(5,5))
x = age
y = life_expectancy
axes.plot(x,y, color='blue', linestyle='--', marker='o')
fig.suptitle('Life Expectancy')
axes.set_xlabel('age')
axes.set_xlim([-5,100])
axes.set_ylabel('life expectancy')
axes.set_ylim([0,100])
plt.grid()
plt.show()
# model
slope, intercept, r_value, p_value, std_err = stats.linregress(age, life_expectancy)
print('intercept: ', intercept)
print('slope: ', slope)
# predict life expectancy for an 49-year-old male in the USA:
np.ceil(slope * 49 + intercept)
def get_life_expectancy(age, country, sex):
# pull latest entries for birth and 60 years
sub_set = who_list[who_list['COUNTRY (DISPLAY)'].str.startswith(country, na=False)]
sub_set = sub_set[sub_set['SEX (DISPLAY)'] == sex]
sub_set = sub_set.sort_values('YEAR (CODE)', ascending=False)
sub_set_birth = sub_set[sub_set['GHO (DISPLAY)'] == 'Life expectancy at birth (years)']
sub_set_60 = sub_set[sub_set['GHO (DISPLAY)'] == 'Life expectancy at age 60 (years)']
# not all combinations exsits so check that we have data for both
if len(sub_set_birth['Numeric']) > 0 and len(sub_set_60['Numeric']) > 0:
# create data set with both points as shown in first example
lf_at_birth = sub_set_birth['Numeric'].values[0]
lf_at_60 = sub_set_60['Numeric'].values[0]
# model
slope, intercept, r_value, p_value, std_err = stats.linregress([0,60],[lf_at_birth, lf_at_60])
# predict for the age variable
return(np.ceil(slope * age + intercept))
else:
return None
list(set(who_list['COUNTRY (DISPLAY)']))[0:10]
# test the function out using a 22-year-old Japanese female:
get_life_expectancy(22, 'Japan', 'Female')
get_life_expectancy(22,'Pakistan','Female')
get_life_expectancy(21,'India','Male')
missing_values_count = who_list.isnull().sum()
print(missing_values_count)
get_life_expectancy(80,'Pakistan','Female')
pickle.dump(get_life_expectancy,open('model.pkl','wb'))
model=pickle.load(open('model.pkl','rb')) |