Yashvj123 committed on
Commit
b7885e0
·
verified ·
1 Parent(s): 20ed088

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -35
app.py CHANGED
@@ -288,70 +288,55 @@ elif st.session_state.current_page == "Simple EDA":
288
  elif st.session_state.current_page == "Data Pre-processing":
289
  st.markdown("<h1 class='title'>Data Preprocessing</h1>", unsafe_allow_html=True)
290
 
291
- # Title for Handling Missing Values
292
  st.markdown("<h2 class='subtitle' style='text-align: center;'>Handling Missing Values</h2>", unsafe_allow_html=True)
293
 
294
  st.markdown("<br>", unsafe_allow_html=True)
295
 
296
- # Using Median Imputation
297
  st.markdown("""
298
  <h5 style="text-align: center;">
299
  <b>Using "Median" Imputation to Fill Highly Skewed Data</b>
300
  </h5>
 
 
 
 
 
 
301
  """, unsafe_allow_html=True)
302
 
303
- code_median = """
304
- datac['GDP'].fillna(datac['GDP'].median(), inplace=True)
305
- datac['Population'].fillna(datac['Population'].median(), inplace=True)
306
- datac['Hepatitis B'].fillna(datac['Hepatitis B'].median(), inplace=True)
307
- datac['Total expenditure'].fillna(datac['Total expenditure'].median(), inplace=True)
308
- datac['Adult Mortality'].fillna(datac['Adult Mortality'].median(), inplace=True)
309
- datac['Alcohol'].fillna(datac['Alcohol'].median(), inplace=True)
310
- datac['thinness 1-19 years'].fillna(datac['Alcohol'].median(), inplace=True)
311
- datac['thinness 5-9 years'].fillna(datac['Alcohol'].median(), inplace=True)
312
- """
313
- st.code(code_median, language="python")
314
-
315
  st.markdown("<br>", unsafe_allow_html=True)
316
 
317
- # Using Mean Imputation
318
  st.markdown("""
319
  <h5 style="text-align: center;">
320
  <b>Mean Imputation for Columns with Small Missing Values and Normally Distributed Data</b>
321
  </h5>
 
 
 
 
 
 
322
  """, unsafe_allow_html=True)
323
 
324
- code_mean = """
325
- datac['Diphtheria'].fillna(datac['Diphtheria'].mean(), inplace=True)
326
- datac['Polio'].fillna(datac['Polio'].mean(), inplace=True)
327
- datac['BMI'].fillna(datac['BMI'].mean(), inplace=True)
328
- datac['Income composition of resources'].fillna(datac['Income composition of resources'].mean(), inplace=True)
329
- datac['Schooling'].fillna(datac['Schooling'].mean(), inplace=True)
330
- datac['Life expectancy'].fillna(datac['Life expectancy'].mean(), inplace=True)
331
- """
332
- st.code(code_mean, language="python")
333
-
334
  st.markdown("<br>", unsafe_allow_html=True)
335
 
336
- # One-Hot Encoding for "Status" Column
337
  st.markdown("""
338
  <h5 style="text-align: center;">
339
  <b>Applying One-Hot Encoding on "Status" Column</b>
340
  </h5>
 
 
 
 
 
 
341
  """, unsafe_allow_html=True)
342
 
343
- code_ohe = """
344
- from sklearn.preprocessing import OneHotEncoder
345
-
346
- oe = OneHotEncoder(drop="first", sparse_output=False)
347
- datac["Status"] = oe.fit_transform(datac[["Status"]])
348
- """
349
- st.code(code_ohe, language="python")
350
-
351
  st.markdown("<br>", unsafe_allow_html=True)
352
 
353
  if st.button("🔙 Go Back to Model Pipeline"):
354
- switch_page("Model Pipeline")
 
355
 
356
  elif st.session_state.current_page == "EDA":
357
  st.markdown("<h1 class='title'>Exploratory Data Analysis (EDA)</h1>", unsafe_allow_html=True)
@@ -664,6 +649,13 @@ elif st.session_state.current_page == "Final Model":
664
  caption="50 Trails",
665
  use_container_width=True)
666
 
 
 
 
 
 
 
 
667
  st.markdown("<hr style='border:1px solid #ddd;'>", unsafe_allow_html=True)
668
 
669
  st.markdown("<h3 style='text-align: center;'>Selected Best-Fit Model</h3>", unsafe_allow_html=True)
 
288
  elif st.session_state.current_page == "Data Pre-processing":
289
  st.markdown("<h1 class='title'>Data Preprocessing</h1>", unsafe_allow_html=True)
290
 
 
291
  st.markdown("<h2 class='subtitle' style='text-align: center;'>Handling Missing Values</h2>", unsafe_allow_html=True)
292
 
293
  st.markdown("<br>", unsafe_allow_html=True)
294
 
 
295
  st.markdown("""
296
  <h5 style="text-align: center;">
297
  <b>Using "Median" Imputation to Fill Highly Skewed Data</b>
298
  </h5>
299
+ <p style="text-align: justify;">
300
+ Median imputation is used to handle missing values in columns where data distribution is skewed.
301
+ This method is more robust than mean imputation in such cases, as it prevents the effect of outliers
302
+ from distorting the dataset. For example, GDP, Population, and Adult Mortality tend to have extreme values,
303
+ making median a better choice for filling in missing data.
304
+ </p>
305
  """, unsafe_allow_html=True)
306
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  st.markdown("<br>", unsafe_allow_html=True)
308
 
 
309
  st.markdown("""
310
  <h5 style="text-align: center;">
311
  <b>Mean Imputation for Columns with Small Missing Values and Normally Distributed Data</b>
312
  </h5>
313
+ <p style="text-align: justify;">
314
+ Mean imputation is applied to columns where missing values are relatively small and the data follows a normal
315
+ distribution. This method ensures that the overall distribution remains unchanged. Columns like BMI, Polio,
316
+ and Schooling are typically well-suited for this approach as they do not contain extreme outliers that could
317
+ distort the mean.
318
+ </p>
319
  """, unsafe_allow_html=True)
320
 
 
 
 
 
 
 
 
 
 
 
321
  st.markdown("<br>", unsafe_allow_html=True)
322
 
 
323
  st.markdown("""
324
  <h5 style="text-align: center;">
325
  <b>Applying One-Hot Encoding on "Status" Column</b>
326
  </h5>
327
+ <p style="text-align: justify;">
328
+ The "Status" column contains categorical data, differentiating countries as either <b>Developed</b> or
329
+ <b>Developing</b>. Since machine learning models work better with numerical data, we apply One-Hot Encoding,
330
+ which converts this categorical variable into a numerical format. We use the "drop='first'" parameter to avoid
331
+ multicollinearity by keeping only one of the binary categories.
332
+ </p>
333
  """, unsafe_allow_html=True)
334
 
 
 
 
 
 
 
 
 
335
  st.markdown("<br>", unsafe_allow_html=True)
336
 
337
  if st.button("🔙 Go Back to Model Pipeline"):
338
+ switch_page("Model Pipeline")
339
+
340
 
341
  elif st.session_state.current_page == "EDA":
342
  st.markdown("<h1 class='title'>Exploratory Data Analysis (EDA)</h1>", unsafe_allow_html=True)
 
649
  caption="50 Trails",
650
  use_container_width=True)
651
 
652
+ st.markdown(
653
+ "<p style='text-align: center; font-weight: bold; font-size: 16px;'>"
654
+ "From the above trials, we selected the <b>9th trial</b> as its train score and test score have minimal difference."
655
+ "</p>",
656
+ unsafe_allow_html=True
657
+ )
658
+
659
  st.markdown("<hr style='border:1px solid #ddd;'>", unsafe_allow_html=True)
660
 
661
  st.markdown("<h3 style='text-align: center;'>Selected Best-Fit Model</h3>", unsafe_allow_html=True)