|
|
import streamlit as st |
|
|
import numpy as np |
|
|
import pickle |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
|
|
|
|
|
|
# Load the pickled prediction model once at import time.
# The context manager closes the file handle as soon as unpickling finishes
# instead of leaving an open descriptor behind for the garbage collector.
# NOTE: pickle can execute arbitrary code while loading; only ship a model
# file that was produced by this project's own training run.
with open("life_expectancy_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
|
|
|
|
|
|
|
|
# Browser-tab title, icon and page layout for the whole app.
# NOTE(review): the page_icon glyph looks mis-encoded (probably an emoji whose
# bytes were lost); restore the intended character from the upstream file.
_PAGE_SETTINGS = {
    "page_title": "Life Expectancy Prediction",
    "page_icon": "π",
    "layout": "centered",
}
st.set_page_config(**_PAGE_SETTINGS)
|
|
|
|
|
|
|
|
# Global stylesheet injected once per run: app background, the shared
# .title / .subtitle heading classes, full-width blue buttons, and the green
# .result-box used to display the prediction.
_APP_STYLESHEET = """
<style>
.stApp {
    background-color: #E3F2FD;
}
.title {
    text-align: center;
    font-size: 28px;
    font-weight: bold;
    color: #2C3E50;
}
.subtitle {
    text-align: center;
    font-size: 30px;
    font-weight: bold;
    color: #003366;
    margin-top: 10px;
}
.stButton > button {
    width: 100%;
    background-color: #1E88E5;
    color: white;
    font-size: 16px;
    font-weight: bold;
    border-radius: 6px;
    padding: 8px;
    transition: 0.3s;
}
.stButton > button:hover {
    background-color: #1565C0;
}
.result-box {
    text-align: center;
    font-size: 22px;
    font-weight: bold;
    color: white;
    padding: 15px;
    border-radius: 8px;
    margin-top: 20px;
    background-color: #388E3C;
}
</style>
"""
st.markdown(_APP_STYLESHEET, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
# First run of a browser session: start on the pipeline overview page.
if "current_page" not in st.session_state:
    st.session_state["current_page"] = "Model Pipeline"
|
|
|
|
|
def switch_page(page):
    """Make ``page`` the active page and redraw the app immediately.

    Args:
        page: Page name to store in ``st.session_state.current_page``; it is
            compared against the literals in the page router further down.

    The in-page navigation buttons are evaluated *after* the router has
    already started rendering the current page, so only storing the new name
    would leave the old page on screen until the user interacts again.
    ``st.rerun()`` stops the current run and starts a fresh one, so the
    requested page shows up on the first click.
    """
    st.session_state.current_page = page
    st.rerun()
|
|
|
|
|
|
|
|
# Sidebar navigation: one button per top-level page. The button label doubles
# as the page name handed to switch_page().
st.sidebar.title("Navigation")
for sidebar_page in ("Model Pipeline", "Hands-on Model"):
    if st.sidebar.button(sidebar_page):
        switch_page(sidebar_page)
|
|
|
|
|
|
|
|
@st.cache_data
def _load_dataset(csv_path="Life Expectancy Data.csv"):
    """Read the dataset CSV and strip whitespace from its column names.

    Streamlit re-executes the whole script on every widget interaction;
    caching keeps the file from being re-read and re-parsed on each rerun.

    Args:
        csv_path: Location of the CSV file to load.

    Returns:
        The parsed DataFrame with whitespace-stripped column labels.
    """
    frame = pd.read_csv(csv_path)
    # Stripped so the plain column names used by the Simple EDA page resolve.
    frame.columns = frame.columns.str.strip()
    return frame


data = _load_dataset()
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Page router: renders exactly one page per script run, chosen by
# st.session_state.current_page.
#
# NOTE(review): several user-facing strings arrived mis-encoded (UTF-8 read as
# a Greek single-byte code page). Glyphs that could be identified from the
# surviving bytes and context were restored (bullets, apostrophes, arrow,
# keycap digits, check marks and a few emoji). Glyphs whose bytes were lost
# entirely ("π" before some labels, "π₯" before "Voting Regressor") are left
# untouched -- restore them from the upstream file.
# ---------------------------------------------------------------------------

_HR_SOLID = "<hr style='border:1px solid #ddd;'>"
_HR_DASHED = "<hr style='border:1px dashed #bbb;'>"
_BACK_LABEL = "π Go Back to Model Pipeline"

# (button label, page name) for each step listed on the pipeline overview.
_PIPELINE_STEPS = (
    ("**Problem Statement**", "Problem Statement"),
    ("**Data Collection**", "Data Collection"),
    ("**Simple EDA**", "Simple EDA"),
    ("**Data Pre-processing**", "Data Pre-processing"),
    ("**Exploratory Data Analysis**", "EDA"),
    ("**Model Building**", "Model Building"),
    ("**Final Model**", "Final Model"),
)


def _html(markup):
    """Render ``markup`` through st.markdown with raw HTML enabled."""
    st.markdown(markup, unsafe_allow_html=True)


def _back_to_pipeline_button():
    """Draw the 'go back' button and return to the overview when clicked."""
    if st.button(_BACK_LABEL):
        switch_page("Model Pipeline")


if st.session_state.current_page == "Model Pipeline":
    # ----- Overview page: hero image, one button per step, author card -----
    _html("<h1 class='title'>Model Pipeline</h1>")
    _html(_HR_SOLID)

    st.image("images/Life_Expectancy.webp",
             caption="Life Expectancy Prediction Overview",
             use_container_width=True)

    for step_label, step_page in _PIPELINE_STEPS:
        if st.button(step_label):
            switch_page(step_page)

    _html(_HR_SOLID)

    _html(
        """
        <div style="text-align: center;">
            <a href="https://github.com/Yashvj22/Life_Expectancy_Model" target="_blank" style="
                background-color: #007bff;
                color: white;
                padding: 12px 25px;
                text-decoration: none;
                font-size: 16px;
                font-weight: bold;
                border-radius: 8px;
                display: inline-block;
                transition: 0.3s;">
                π See Whole Code on GitHub
            </a>
        </div>
        """
    )

    _html(_HR_SOLID)

    _html(
        """
        <h2 style="text-align:center;"> About Author</h2>
        <div style="background-color:#f5f5f5; border-radius:10px; padding:20px; margin-top:20px;">
            <p style="font-size:16px; text-align:center; font-family:Georgia; line-height:1.6; color:#000;">
                Hello! I’m <b>Yash Jadhav</b>, a passionate <span style="color:#FF6347;">Data Scientist</span>
                and <span style="color:#4682B4;">Data Analyst</span>.
                I specialize in transforming raw data into actionable insights and helping others master the art of Machine Learning.
            </p>
            <div style="text-align:center; margin-top:20px;">
                <a href="https://www.linkedin.com/in/yash-jadhav-454b0a237/" target="_blank" style="
                    background-color:#0073b1; color:white; padding:10px 20px; border-radius:5px;
                    text-decoration:none; margin-right:10px;">LinkedIn</a>
                <a href="https://github.com/Yashvj22" target="_blank" style="
                    background-color:black; color:white; padding:10px 20px; border-radius:5px;
                    text-decoration:none; margin-right:10px;">GitHub</a>
                <a href="https://medium.com/@yashvj2222" target="_blank" style="
                    background-color:grey; color:white; padding:10px 20px; border-radius:5px;
                    text-decoration:none;">Medium</a>
            </div>
        </div>
        """
    )

elif st.session_state.current_page == "Problem Statement":
    _html("<h1 class='title'>Problem Statement</h1>")

    _html(
        """
        <h5 style="text-align: center; margin-top: 20px;">
            The Goal of this project is to build a predictive model that estimates the Life Expectancy of a country
            based on multiple influencing factors such as health indicators, economic conditions, and social parameters.
        </h5>
        """
    )

    _html("<br>")

    st.image("images/problem_statement.png",
             caption="Life Expectancy Prediction Overview",
             use_container_width=True)

    _back_to_pipeline_button()

elif st.session_state.current_page == "Data Collection":
    _html("<h1 class='title'>Data Collection</h1>")

    _html(
        """
        <h5 style="text-align: center; margin-top: 20px;">
            The dataset used in this project is sourced from Kaggle, containing information on life expectancy across
            different countries along with various health, economic, and demographic factors.
        </h5>
        """
    )

    _html("<br>")

    _html(
        """
        <h5 style="text-align: center; margin-top: 10px;">
            π <a href="https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who" target="_blank" style="font-weight: bold; color: #007BFF; text-decoration: none;">
            Click here to access the dataset on Kaggle</a>
        </h5>
        """
    )

    _html("<h2 class='subtitle' style='text-align: center; margin-top: 20px;'>Dataset Overview</h2>")

    _html(
        """
        <h5 style="text-align: center; margin-top: 15px; margin-bottom: 20px;">
            The dataset consists of <b>2938 rows</b> and <b>22 columns</b>, capturing crucial indicators such as life expectancy,
            mortality rates, GDP, schooling, immunization rates, and more. Below is a summary of the dataset features:
        </h5>
        """
    )

    _html("<br>")

    # One bullet per dataset column; kept as static text rather than derived
    # from the DataFrame so the human-written descriptions stay attached.
    data_info = """
    <div style= "font-size: 16px; background-color: #F5F5F5; padding: 15px; border-radius: 10px;">
        • <b>Country:</b> Name of the country (Categorical)<br>
        • <b>Year:</b> Year of observation (Numerical)<br>
        • <b>Status:</b> Developing or Developed country (Categorical)<br>
        • <b>Life Expectancy:</b> Average age a person is expected to live (Numerical)<br>
        • <b>Adult Mortality:</b> Probability of dying between 15-60 years per 1000 population (Numerical)<br>
        • <b>Infant Deaths:</b> Number of infant deaths per 1000 live births (Numerical)<br>
        • <b>Alcohol:</b> Alcohol consumption per capita (Numerical)<br>
        • <b>Percentage Expenditure:</b> Government expenditure on health as a percentage of GDP (Numerical)<br>
        • <b>Hepatitis B:</b> Immunization coverage for Hepatitis B (Numerical)<br>
        • <b>Measles:</b> Number of reported measles cases per year (Numerical)<br>
        • <b>BMI:</b> Average Body Mass Index of the population (Numerical)<br>
        • <b>Under-five Deaths:</b> Number of deaths under the age of five per 1000 live births (Numerical)<br>
        • <b>Polio:</b> Immunization coverage for Polio (Numerical)<br>
        • <b>Total Expenditure:</b> Total health expenditure as a percentage of GDP (Numerical)<br>
        • <b>Diphtheria:</b> Immunization coverage for Diphtheria (Numerical)<br>
        • <b>HIV/AIDS:</b> Death rate due to HIV/AIDS per 100,000 people (Numerical)<br>
        • <b>GDP:</b> Gross Domestic Product per capita (Numerical)<br>
        • <b>Population:</b> Total population of the country (Numerical)<br>
        • <b>Thinness 1-19 Years:</b> Percentage of thin individuals aged 1-19 years (Numerical)<br>
        • <b>Thinness 5-9 Years:</b> Percentage of thin individuals aged 5-9 years (Numerical)<br>
        • <b>Income Composition:</b> Human development index based on income composition (Numerical)<br>
        • <b>Schooling:</b> Average number of years of schooling (Numerical)<br>
    </div>
    """

    _html(data_info)

    _html("<br>")

    _back_to_pipeline_button()

elif st.session_state.current_page == "Simple EDA":
    _html("<h1 class='title'>Simple Exploratory Data Analysis</h1>")

    _html(
        """
        <h5 style="text-align: center; margin-top: 20px;">
            Exploratory Data Analysis (EDA) helps in understanding the structure, patterns, and missing values in the dataset.
            Below is an initial preview of the data, followed by a missing values summary.
        </h5>
        """
    )

    _html("<br>")

    # --- Sample rows ---
    _html("<h3 class='subtitle' style='text-align: center;'>Sample Dataset</h3>")
    st.dataframe(data.head())

    _html("<br>")

    # --- Missing values per column ---
    _html("<h3 class='subtitle' style='text-align: center;'>Missing Values Summary</h3>")

    missing_values = data.isna().sum().reset_index()
    missing_values.columns = ["Column Name", "Missing Values"]

    # Narrow side columns act as padding so the table sits in the middle.
    col1, col2, col3 = st.columns([1, 2, 1])

    with col2:
        st.dataframe(missing_values)

    _html("<br>")

    # --- Summary statistics ---
    _html("<h3 class='subtitle' style='text-align: center;'>Data Description</h3>")

    st.dataframe(data.describe())

    _html("<br>")

    # --- One boxplot per numeric column ---
    _html("<h3 class='subtitle' style='text-align: center;'>Boxplots for Data Distribution</h3>")

    columns = ['Life expectancy', 'Adult Mortality',
               'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
               'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
               'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness 1-19 years',
               'thinness 5-9 years', 'Income composition of resources', 'Schooling']

    fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(12, 30))
    axes = axes.flatten()

    for ax, col in zip(axes, columns):
        sns.boxplot(x=data[col], ax=ax, color="skyblue")
        ax.set_title(f'Boxplot of {col}', fontsize=12)
        ax.set_xlabel("")

    # The 10x2 grid has one more axis than there are columns; hide the spare
    # so no empty frame is drawn at the bottom of the figure.
    for spare_ax in axes[len(columns):]:
        spare_ax.set_visible(False)

    fig.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed; the
    # script reruns on each interaction, so close it to avoid piling them up.
    plt.close(fig)

    _html("<br>")

    _back_to_pipeline_button()

elif st.session_state.current_page == "Data Pre-processing":
    _html("<h1 class='title'>Data Preprocessing</h1>")
    _html(_HR_SOLID)

    _html("<h2 class='subtitle' style='text-align: center;'>Handling Missing Values</h2>")

    _html("<br>")

    _html(
        """
        <h5 style="text-align: center;">
            <b>Using "Median" Imputation to Fill Highly Skewed Data</b>
        </h5>
        """
    )

    _html(
        """
        <div style="
            border: 1px solid #ddd;
            border-radius: 8px;
            padding: 15px;
            background-color: #f9f9f9;
            text-align: justify;">
            Median imputation is used for columns where data distribution is highly skewed.
            This approach ensures that extreme values do not overly influence the dataset.
            Examples include GDP, Population, and Adult Mortality.
        </div>
        """
    )

    _html(_HR_SOLID)

    _html(
        """
        <h5 style="text-align: center;">
            <b>Mean Imputation for Columns with Small Missing Values and Normally Distributed Data</b>
        </h5>
        """
    )

    _html(
        """
        <div style="
            border: 1px solid #ddd;
            border-radius: 8px;
            padding: 15px;
            background-color: #f9f9f9;
            text-align: justify;">
            Mean imputation is applied when missing values are small and the data is normally distributed.
            This helps maintain the overall dataset structure without being affected by extreme values.
            Suitable columns include BMI, Polio, and Schooling.
        </div>
        """
    )

    _html(_HR_SOLID)

    _html(
        """
        <h5 style="text-align: center;">
            <b>Applying One-Hot Encoding on "Status" Column</b>
        </h5>
        """
    )

    _html(
        """
        <div style="
            border: 1px solid #ddd;
            border-radius: 8px;
            padding: 15px;
            background-color: #f9f9f9;
            text-align: justify;">
            The "Status" column categorizes countries as either Developed or Developing.
            One-Hot Encoding is used to convert this categorical variable into a numerical format
            suitable for machine learning models. The "drop='first'" parameter is applied to prevent
            multicollinearity.
        </div>
        """
    )

    _back_to_pipeline_button()

elif st.session_state.current_page == "EDA":
    _html("<h1 class='title'>Exploratory Data Analysis (EDA)</h1>")
    _html(_HR_SOLID)

    # --- Target distribution ---
    _html("<h2 class='subtitle' style='text-align: center;'>Target Column Distribution</h2>")
    st.image("images/target_column_distribution.png", caption="Life Expectancy Distribution", use_container_width=True)
    _html(
        """
        <h5 style="text-align: center;">
            Insight: Mostly Life Expectancy is in <b>range of 50-80</b>.
        </h5>
        """
    )

    _html(_HR_SOLID)

    # --- Correlation heatmap ---
    _html("<h2 class='subtitle' style='text-align: center;'>Correlation Heatmap</h2>")
    st.image("images/Correlation_Heatmap.png", caption="Correlation Heatmap", use_container_width=True)
    _html(
        """
        <h5 style="text-align: center;">
            Insight: Our target column <b>Life Expectancy</b> is mostly linearly dependent on
            <b>Schooling, Income Composition of Resources, GDP, Diphtheria, Polio, BMI, and Percentage Expenditure</b>.
        </h5>
        """
    )

    _html(_HR_SOLID)

    # --- Feature-vs-target scatter plots ---
    _html("<h2 class='subtitle' style='text-align: center;'>How Specific Columns Affect Life Expectancy</h2>")

    st.image("images/specific_col_affecting_target.png", caption="Features vs. Life Expectancy", use_container_width=True)
    # This snippet mixes Markdown (**bold**, "- " list items) with HTML, so
    # every line is kept at the same indent and the blank lines are kept too:
    # extra indentation or merged paragraphs could change how it renders.
    _html(
        """
        <h5>
        Insights:

        1️⃣ **GDP vs. Life Expectancy**
        - Positive correlation: As GDP increases, Life Expectancy also increases.
        - Some countries with low GDP still have high Life Expectancy due to good healthcare policies.

        2️⃣ **Schooling vs. Life Expectancy**
        - Strong positive correlation: More years of schooling → longer life.
        - Educated populations follow better hygiene, diet, and medical care, increasing Life Expectancy.

        3️⃣ **Income Composition vs. Life Expectancy**
        - Higher economic stability leads to better healthcare systems and lifestyles, improving Life Expectancy.

        4️⃣ **Diphtheria & Polio vs. Life Expectancy**
        - Higher vaccination rates (80%-100%) correspond to Life Expectancy above 70 years.
        - Lower vaccination rates (<40%) lead to lower Life Expectancy (~40-60 years), indicating weak healthcare infrastructure.

        5️⃣ **BMI vs. Life Expectancy**
        - No clear linear trend due to high variance in data points.
        - BMI < 18 (malnutrition) and BMI > 30 (obesity) reduce Life Expectancy.
        - Advanced healthcare and better nutrition in some countries help maintain high Life Expectancy despite malnutrition/obesity.
        </h5>
        """
    )
    _html("<br>")

    _html(_HR_SOLID)

    # --- Developed vs. developing ---
    _html("<h2 class='subtitle' style='text-align: center;'>Life Expectancy vs Developed / Undeveloped Countries</h2>")
    st.image("images/target_col vs countries.png", caption="Life Expectancy vs Developed / Undeveloped Countries", use_container_width=True)
    _html(
        """
        <h5 style="text-align: center;">
            Insight: Life Expectancy is <b>higher in Developed Countries</b> due to Advanced Healthcare, Better Nutrition, Medical Interventions.
        </h5>"""
    )
    _html("<br>")

    _back_to_pipeline_button()

elif st.session_state.current_page == "Model Building":
    _html(
        """
        <h2 style='text-align: center; color: #333;'>Model Building</h2>
        """
    )

    _html(_HR_SOLID)

    _html(
        """
        <h2>Introduction</h2>
        <p>In this section, we explore different <b>Ensemble Learning</b> techniques to improve model performance.</p>
        <p>We implemented three ensemble models:
        <span style='font-size:16px;'>π₯ <b>Voting Regressor</b> - 🎯 <b>Bagging Regressor</b> - 🌲 <b>Random Forest Regressor</b></span></p>
        """
    )

    _html(_HR_SOLID)

    _html(
        """
        <h5 style='color: #1363DF;'>1️⃣ Voting Regressor</h5>
        <ul>
            <li><b>Concept:</b> Combines multiple models (<b>KNN & Decision Tree</b>) and takes the <b>average prediction</b>.</li>
            <li><b>Why Voting Regressor?</b> ✅ Works well when models have different strengths. ✅ Reduces variance while maintaining interpretability.</li>
        </ul>
        """
    )

    _html(_HR_DASHED)

    _html(
        """
        <h5 style='color: #FF6D28;'>2️⃣ Bagging Regressor</h5>
        <ul>
            <li><b>Concept:</b> Uses <b>bootstrap sampling</b> to train multiple models on different subsets of data.</li>
            <li><b>Why Bagging Regressor?</b> ✅ Reduces overfitting by averaging multiple models. ✅ Works best with <b>high-variance models</b> like Decision Tree.</li>
        </ul>
        """
    )

    _html(_HR_DASHED)

    _html(
        """
        <h5 style='color: #2EB086;'>3️⃣ Random Forest Regressor</h5>
        <ul>
            <li><b>Concept:</b> Uses <b>multiple Decision Trees</b>, trained on different feature subsets.</li>
            <li><b>Why Random Forest?</b> ✅ Handles <b>non-linearity</b> well. ✅ Less prone to overfitting compared to a single Decision Tree.</li>
        </ul>
        """
    )

    _html(_HR_SOLID)

    _html(
        """
        <h3>Combining High & Low Variance Models</h3>
        <p>A crucial step to improve ensemble performance is <b>choosing models with different variance levels</b>:</p>
        <ul>
            <li><b>Voting Regressor:</b> Uses a combination of <b>high-variance</b> (Decision Tree, KNN with small K) and <b>low-variance</b> (KNN with large K, Decision Tree with depth constraint) models.</li>
            <li><b>Bagging & Random Forest:</b> Use <b>only high-variance models</b> (Decision Trees with deep splits) to maximize variance reduction.</li>
        </ul>
        <p><b>This technique helps create a <span style='color: green;'>balanced ensemble</span>, preventing excessive overfitting or underfitting! ✅</b></p>
        """
    )

    _html(_HR_SOLID)

    # Blank lines separate the sections below; each heading after a blank
    # line is kept at the left margin of the snippet, since deeper indentation
    # could change how Markdown treats it.
    _html(
        """
        <h3>Hyperparameter Tuning using Optuna ⚡</h3>
        <p>We optimized hyperparameters for <b>KNN, Decision Tree, Bagging Regressor, and Random Forest</b> using <b>Optuna</b>.</p>
        <p>Below are the <b>optimized parameters</b> for each model:</p>

        <h5>🔹 K-Nearest Neighbors (KNN)</h5>
        <ul>
            <li><code>n_neighbors</code></li>
            <li><code>p</code></li>
            <li><code>weights</code></li>
            <li><code>algorithm</code></li>
        </ul>

        <h5>🔹 Decision Tree</h5>
        <ul>
            <li><code>max_depth</code></li>
            <li><code>min_samples_split</code></li>
            <li><code>min_samples_leaf</code></li>
            <li><code>max_features</code></li>
            <li><code>min_impurity_decrease</code></li>
        </ul>

        <h5>🔹 Bagging Regressor</h5>
        <ul>
            <li><code>n_estimators</code></li>
            <li><code>max_samples</code></li>
        </ul>

        <h5>🔹 Random Forest</h5>
        <ul>
            <li><code>n_estimators</code></li>
            <li><code>max_samples</code></li>
        </ul>
        """
    )

    _html(_HR_SOLID)

    _html(
        """
        <h3>Model Performance Insights π</h3>
        <p>Here’s how our ensemble models performed on training and test datasets:</p>
        """
    )

    # Table styling for the score table rendered right below.
    _html(
        """
        <style>
            table {
                width: 100%;
                border-collapse: collapse;
                text-align: center;
                font-size: 16px;
            }
            th, td {
                padding: 10px;
                border-bottom: 1px solid #ddd;
            }
            th {
                background-color: #F3F4F6;
            }
        </style>
        """
    )

    _html(
        """
        <table>
            <tr>
                <th>Ensemble</th>
                <th>Training Score</th>
                <th>Test Score</th>
                <th>Generalized Score</th>
            </tr>
            <tr>
                <td>Voting Ensemble</td>
                <td>95.80%</td>
                <td>92.13%</td>
                <td>92.89%</td>
            </tr>
            <tr>
                <td>Bagging Ensemble</td>
                <td>98.68%</td>
                <td>95.04%</td>
                <td><b>95.45%</b></td>
            </tr>
            <tr>
                <td>Random Forest</td>
                <td>97.92%</td>
                <td>94.71%</td>
                <td>94.71%</td>
            </tr>
        </table>
        """
    )

    _html("<br>")

    _back_to_pipeline_button()

elif st.session_state.current_page == "Final Model":
    # Page-local stylesheet: redefines .title / .subtitle for this page and
    # adds the .box card used for the explanatory panels.
    _html(
        """
        <style>
            .title {
                text-align: center;
                font-size: 36px;
                font-weight: bold;
                color: #1E3A8A;
            }
            .subtitle {
                text-align: center;
                font-size: 20px;
                color: #475569;
                margin-bottom: 20px;
            }
            .image-container {
                display: flex;
                justify-content: center;
            }
            .caption {
                text-align: center;
                font-size: 16px;
                font-style: italic;
                color: #6B7280;
            }
            .box {
                background-color: #F8FAFC;
                padding: 15px;
                border-radius: 10px;
                box-shadow: 2px 2px 10px rgba(0, 0, 0, 0.1);
                margin-bottom: 20px;
            }
        </style>
        """
    )

    _html("<h1 class='title'>Final Model</h1>")
    _html(_HR_SOLID)

    _html(
        "<div class='box'>"
        "<p><strong>After experimenting with multiple trials using Optuna, we selected the best-fit model "
        "by analyzing the training and test scores of different trials. "
        "The following scatter plots provide insights into this selection process.</strong></p>"
        "</div>"
    )

    _html(_HR_SOLID)

    _html("<h3 style='text-align: center;'>Training vs Test Score (All Trials)</h3>")
    _html(
        "<p class='subtitle'>This scatter plot visualizes the training and test scores of all trials. "
        "The goal was to identify a model where both scores are closely aligned, ensuring minimal overfitting or underfitting.</p>"
    )

    st.image("images/bagging_trails.png",
             caption="All Trails",
             use_container_width=True)

    _html(_HR_SOLID)

    _html("<h3 style='text-align: center;'>Training vs Test Score (First 50 Trials)</h3>")
    _html(
        "<p class='subtitle'>By filtering the first 50 trials, we focused on models that demonstrated balanced performance. "
        "The best-fit model was selected by ensuring that the training and test scores are close to each other.</p>"
    )

    st.image("images/bagging_50trails.png",
             caption="50 Trails",
             use_container_width=True)

    _html(
        "<p style='text-align: center; font-weight: bold; font-size: 16px;'>"
        "From the above trials, we selected the <b>9th trial</b> as its train score and test score have minimal difference."
        "</p>"
    )

    _html(_HR_SOLID)

    _html("<h3 style='text-align: center;'>Selected Best-Fit Model</h3>")

    _html(
        "<div class='box'>"
        "<ul>"
        "<li><b>Base Model:</b> DecisionTreeRegressor</li>"
        "<li><b>Hyperparameters:</b>"
        "<ul>"
        "<li>min_samples_leaf = 2</li>"
        "<li>min_samples_split = 3</li>"
        "</ul></li>"
        "<li><b>Ensemble Method:</b> BaggingRegressor</li>"
        "<li><b>Bagging Hyperparameters:</b>"
        "<ul>"
        "<li>n_estimators = 40</li>"
        "<li>max_samples = 0.838404</li>"
        "</ul></li>"
        "</ul>"
        "<p>This model was selected as it demonstrated a balance between generalization and performance.</p>"
        "</div>"
    )

    _back_to_pipeline_button()

elif st.session_state.current_page == "Hands-on Model":
    _html("<h1 class='title'>Hands-on Model</h1>")
    _html(_HR_SOLID)

    _html("<h4 class='subtitle' style='text-align: center;'>Provide inputs to predict Life Expectancy</h4>")

    # Twenty inputs split over two columns. Each slider is
    # (label, min, max, default).
    col1, col2 = st.columns(2)
    with col1:
        year = st.slider("Year", 2000, 2015, 2008)
        status = st.radio("Status", ["Developing", "Developed"], horizontal=True)
        # NOTE(review): the Data Pre-processing page says Status was one-hot
        # encoded with drop='first'. If the categories were sorted
        # alphabetically, the kept indicator would be "Developing", which is
        # the opposite of this mapping -- confirm against the training code.
        status = 1 if status == "Developed" else 0
        adult_mortality = st.slider("Adult Mortality Rate", 1, 723, 144)
        infant_deaths = st.slider("Infant Deaths", 0, 1800, 3)
        alcohol = st.slider("Alcohol Consumption", 0.01, 17.87, 4.55)
        percentage_expenditure = st.slider("Percentage Expenditure", 0.0, 19479.91, 738.25)
        hepatitis_b = st.slider("Hepatitis B Immunization (%)", 1, 99, 83)
        measles = st.slider("Measles Cases", 0, 212183, 2419)
        bmi = st.slider("BMI", 1.0, 87.3, 38.3)
        polio = st.slider("Polio Immunization (%)", 3, 99, 82)

    with col2:
        under_five_deaths = st.slider("Under-Five Deaths", 0, 2500, 4)
        total_expenditure = st.slider("Total Healthcare Expenditure (%)", 0.37, 17.6, 5.92)
        diphtheria = st.slider("Diphtheria Immunization (%)", 2, 99, 82)
        hiv_aids = st.slider("HIV/AIDS Prevalence Rate", 0.1, 50.6, 1.74)
        gdp = st.slider("GDP per Capita", 1.68, 119172.7, 6611.52)
        population = st.slider("Population", 34, 1293859000, 10230850)
        thinness_1_19 = st.slider("Thinness 1-19 years (%)", 0.1, 27.7, 4.83)
        thinness_5_9 = st.slider("Thinness 5-9 years (%)", 0.1, 28.6, 4.86)
        income_composition = st.slider("Income Composition of Resources", 0.0, 0.948, 0.63)
        schooling = st.slider("Schooling (Years)", 0.0, 20.7, 11.99)

    if st.button("Predict Life Expectancy"):
        # Single-row feature matrix.
        # NOTE(review): the order presumably has to match the column order the
        # pickled model was trained on -- verify against the training code.
        features = np.array([[year, status, adult_mortality, infant_deaths, alcohol, percentage_expenditure,
                              hepatitis_b, measles, bmi, under_five_deaths, polio, total_expenditure,
                              diphtheria, hiv_aids, gdp, population, thinness_1_19, thinness_5_9,
                              income_composition, schooling]])

        prediction = model.predict(features)[0]

        _html(
            f"""
            <div class="result-box">
                Predicted Life Expectancy: <b>{prediction:.2f} years</b>
            </div>
            """
        )
|
|
|