Update app.py
Browse files
app.py
CHANGED
|
@@ -288,70 +288,55 @@ elif st.session_state.current_page == "Simple EDA":
|
|
| 288 |
elif st.session_state.current_page == "Data Pre-processing":
|
| 289 |
st.markdown("<h1 class='title'>Data Preprocessing</h1>", unsafe_allow_html=True)
|
| 290 |
|
| 291 |
-
# Title for Handling Missing Values
|
| 292 |
st.markdown("<h2 class='subtitle' style='text-align: center;'>Handling Missing Values</h2>", unsafe_allow_html=True)
|
| 293 |
|
| 294 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 295 |
|
| 296 |
-
# Using Median Imputation
|
| 297 |
st.markdown("""
|
| 298 |
<h5 style="text-align: center;">
|
| 299 |
<b>Using "Median" Imputation to Fill Highly Skewed Data</b>
|
| 300 |
</h5>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
""", unsafe_allow_html=True)
|
| 302 |
|
| 303 |
-
code_median = """
|
| 304 |
-
datac['GDP'].fillna(datac['GDP'].median(), inplace=True)
|
| 305 |
-
datac['Population'].fillna(datac['Population'].median(), inplace=True)
|
| 306 |
-
datac['Hepatitis B'].fillna(datac['Hepatitis B'].median(), inplace=True)
|
| 307 |
-
datac['Total expenditure'].fillna(datac['Total expenditure'].median(), inplace=True)
|
| 308 |
-
datac['Adult Mortality'].fillna(datac['Adult Mortality'].median(), inplace=True)
|
| 309 |
-
datac['Alcohol'].fillna(datac['Alcohol'].median(), inplace=True)
|
| 310 |
-
datac['thinness 1-19 years'].fillna(datac['Alcohol'].median(), inplace=True)
|
| 311 |
-
datac['thinness 5-9 years'].fillna(datac['Alcohol'].median(), inplace=True)
|
| 312 |
-
"""
|
| 313 |
-
st.code(code_median, language="python")
|
| 314 |
-
|
| 315 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 316 |
|
| 317 |
-
# Using Mean Imputation
|
| 318 |
st.markdown("""
|
| 319 |
<h5 style="text-align: center;">
|
| 320 |
<b>Mean Imputation for Columns with Small Missing Values and Normally Distributed Data</b>
|
| 321 |
</h5>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
""", unsafe_allow_html=True)
|
| 323 |
|
| 324 |
-
code_mean = """
|
| 325 |
-
datac['Diphtheria'].fillna(datac['Diphtheria'].mean(), inplace=True)
|
| 326 |
-
datac['Polio'].fillna(datac['Polio'].mean(), inplace=True)
|
| 327 |
-
datac['BMI'].fillna(datac['BMI'].mean(), inplace=True)
|
| 328 |
-
datac['Income composition of resources'].fillna(datac['Income composition of resources'].mean(), inplace=True)
|
| 329 |
-
datac['Schooling'].fillna(datac['Schooling'].mean(), inplace=True)
|
| 330 |
-
datac['Life expectancy'].fillna(datac['Life expectancy'].mean(), inplace=True)
|
| 331 |
-
"""
|
| 332 |
-
st.code(code_mean, language="python")
|
| 333 |
-
|
| 334 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 335 |
|
| 336 |
-
# One-Hot Encoding for "Status" Column
|
| 337 |
st.markdown("""
|
| 338 |
<h5 style="text-align: center;">
|
| 339 |
<b>Applying One-Hot Encoding on "Status" Column</b>
|
| 340 |
</h5>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
""", unsafe_allow_html=True)
|
| 342 |
|
| 343 |
-
code_ohe = """
|
| 344 |
-
from sklearn.preprocessing import OneHotEncoder
|
| 345 |
-
|
| 346 |
-
oe = OneHotEncoder(drop="first", sparse_output=False)
|
| 347 |
-
datac["Status"] = oe.fit_transform(datac[["Status"]])
|
| 348 |
-
"""
|
| 349 |
-
st.code(code_ohe, language="python")
|
| 350 |
-
|
| 351 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 352 |
|
| 353 |
if st.button("🔙 Go Back to Model Pipeline"):
|
| 354 |
-
switch_page("Model Pipeline")
|
|
|
|
| 355 |
|
| 356 |
elif st.session_state.current_page == "EDA":
|
| 357 |
st.markdown("<h1 class='title'>Exploratory Data Analysis (EDA)</h1>", unsafe_allow_html=True)
|
|
@@ -664,6 +649,13 @@ elif st.session_state.current_page == "Final Model":
|
|
| 664 |
caption="50 Trails",
|
| 665 |
use_container_width=True)
|
| 666 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
st.markdown("<hr style='border:1px solid #ddd;'>", unsafe_allow_html=True)
|
| 668 |
|
| 669 |
st.markdown("<h3 style='text-align: center;'>Selected Best-Fit Model</h3>", unsafe_allow_html=True)
|
|
|
|
| 288 |
elif st.session_state.current_page == "Data Pre-processing":
|
| 289 |
st.markdown("<h1 class='title'>Data Preprocessing</h1>", unsafe_allow_html=True)
|
| 290 |
|
|
|
|
| 291 |
st.markdown("<h2 class='subtitle' style='text-align: center;'>Handling Missing Values</h2>", unsafe_allow_html=True)
|
| 292 |
|
| 293 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 294 |
|
|
|
|
| 295 |
st.markdown("""
|
| 296 |
<h5 style="text-align: center;">
|
| 297 |
<b>Using "Median" Imputation to Fill Highly Skewed Data</b>
|
| 298 |
</h5>
|
| 299 |
+
<p style="text-align: justify;">
|
| 300 |
+
Median imputation is used to handle missing values in columns where data distribution is skewed.
|
| 301 |
+
This method is more robust than mean imputation in such cases, as it keeps outliers
|
| 302 |
+
from distorting the dataset. For example, GDP, Population, and Adult Mortality tend to have extreme values,
|
| 303 |
+
making median a better choice for filling in missing data.
|
| 304 |
+
</p>
|
| 305 |
""", unsafe_allow_html=True)
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 308 |
|
|
|
|
| 309 |
st.markdown("""
|
| 310 |
<h5 style="text-align: center;">
|
| 311 |
<b>Mean Imputation for Columns with Small Missing Values and Normally Distributed Data</b>
|
| 312 |
</h5>
|
| 313 |
+
<p style="text-align: justify;">
|
| 314 |
+
Mean imputation is applied to columns where the number of missing values is relatively small and the data follows a normal
|
| 315 |
+
distribution. This method preserves each column's mean, although it slightly reduces its variance. Columns like BMI, Polio,
|
| 316 |
+
and Schooling are typically well-suited for this approach as they do not contain extreme outliers that could
|
| 317 |
+
distort the mean.
|
| 318 |
+
</p>
|
| 319 |
""", unsafe_allow_html=True)
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 322 |
|
|
|
|
| 323 |
st.markdown("""
|
| 324 |
<h5 style="text-align: center;">
|
| 325 |
<b>Applying One-Hot Encoding on "Status" Column</b>
|
| 326 |
</h5>
|
| 327 |
+
<p style="text-align: justify;">
|
| 328 |
+
The "Status" column contains categorical data, differentiating countries as either <b>Developed</b> or
|
| 329 |
+
<b>Developing</b>. Since most machine learning models require numerical input, we apply One-Hot Encoding,
|
| 330 |
+
which converts this categorical variable into a numerical format. We use the "drop='first'" parameter to avoid
|
| 331 |
+
multicollinearity by keeping a single indicator column for the two categories.
|
| 332 |
+
</p>
|
| 333 |
""", unsafe_allow_html=True)
|
| 334 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
st.markdown("<br>", unsafe_allow_html=True)
|
| 336 |
|
| 337 |
if st.button("🔙 Go Back to Model Pipeline"):
|
| 338 |
+
switch_page("Model Pipeline")
|
| 339 |
+
|
| 340 |
|
| 341 |
elif st.session_state.current_page == "EDA":
|
| 342 |
st.markdown("<h1 class='title'>Exploratory Data Analysis (EDA)</h1>", unsafe_allow_html=True)
|
|
|
|
| 649 |
caption="50 Trails",
|
| 650 |
use_container_width=True)
|
| 651 |
|
| 652 |
+
st.markdown(
|
| 653 |
+
"<p style='text-align: center; font-weight: bold; font-size: 16px;'>"
|
| 654 |
+
"From the above trials, we selected the <b>9th trial</b> as its train score and test score have minimal difference."
|
| 655 |
+
"</p>",
|
| 656 |
+
unsafe_allow_html=True
|
| 657 |
+
)
|
| 658 |
+
|
| 659 |
st.markdown("<hr style='border:1px solid #ddd;'>", unsafe_allow_html=True)
|
| 660 |
|
| 661 |
st.markdown("<h3 style='text-align: center;'>Selected Best-Fit Model</h3>", unsafe_allow_html=True)
|