Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- adult.csv +0 -0
- eda.py +142 -0
- main.py +45 -0
- pipeline.pkl +3 -0
- prediction.py +77 -0
- requirements.txt +16 -0
adult.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eda.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import seaborn as sns
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import os
|
| 6 |
+
from phik import phik_matrix
|
| 7 |
+
|
| 8 |
+
# Path to dataset.
# Resolved relative to this module instead of a developer-specific absolute
# Windows path, so the CSV shipped next to eda.py is found on any machine
# and regardless of the current working directory.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "adult.csv")
|
| 10 |
+
|
| 11 |
+
# Load dataset
|
| 12 |
+
@st.cache_data
def load_data():
    """Read the CSV at ``data_path`` into a pandas DataFrame.

    Returns:
        The DataFrame produced by ``pd.read_csv(data_path)``, or ``None``
        (after reporting a Streamlit error) when ``data_path`` is not an
        existing file.
    """
    # Happy path first: the file is there, so just parse it.
    if os.path.isfile(data_path):
        return pd.read_csv(data_path)

    # Missing file: tell the user which path was tried and signal failure.
    st.error(f"File not found: {data_path}")
    return None
|
| 18 |
+
|
| 19 |
+
# Shared error text used by every chart that needs more than one column.
_REQUIRED_COLUMNS_ERROR = "Required columns not found in the dataset."


def _missing_columns(data, columns):
    """Return the names in *columns* that are absent from ``data.columns``."""
    return [col for col in columns if col not in data.columns]


def _show_chart(fig, insight):
    """Render *fig* in Streamlit, release it, then print the insight text."""
    # Pass the explicit figure rather than the pyplot module so rendering does
    # not depend on pyplot's global "current figure" state.
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; close each
    # one after rendering so they do not pile up across reruns.
    plt.close(fig)
    st.write(insight)


def _bar_chart(frame, *, x, y, title, insight, figsize, hue=None,
               xlabel=None, ylabel=None, rotate_ticks=False):
    """Draw a seaborn bar plot of *frame* and show it with its insight text."""
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(data=frame, x=x, y=y, hue=hue, ax=ax)
    ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if rotate_ticks:
        ax.tick_params(axis='x', labelrotation=45)
    _show_chart(fig, insight)


def _plot_age_histogram(data):
    """Histogram (with KDE) of the ``age`` column."""
    st.subheader("Histogram of Age Distribution")
    if 'age' not in data.columns:
        st.error("Column 'age' not found in the dataset.")
        return
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data['age'], bins=30, kde=True, ax=ax)
    ax.set_title('Distribusi Usia')
    ax.set_xlabel('Usia')
    ax.set_ylabel('Frekuensi')
    _show_chart(fig, "**Insight:** This histogram shows the age distribution of individuals in the dataset, indicating how age varies among the population.")


def _plot_age_by_income(data):
    """Bar plot of mean ``age`` per ``income`` category."""
    st.subheader("Average Age Based on Income Category")
    if _missing_columns(data, ['income', 'age']):
        st.error(_REQUIRED_COLUMNS_ERROR)
        return
    age_income = data.groupby('income')['age'].mean().reset_index()
    _bar_chart(
        age_income, x='income', y='age', figsize=(10, 6),
        title='Rata-rata Usia berdasarkan Kategori Pendapatan',
        xlabel='Kategori Pendapatan', ylabel='Rata-rata Usia',
        insight="**Insight:** This bar plot displays the average age of individuals based on income categories, showing how age correlates with income.",
    )


def _plot_workclass_income(data):
    """Bar plot of row counts per (``workclass``, ``income``) pair."""
    st.subheader("Count by Work Class and Income")
    if _missing_columns(data, ['workclass', 'income']):
        st.error(_REQUIRED_COLUMNS_ERROR)
        return
    workclass_income = data.groupby(['workclass', 'income']).size().reset_index(name='count')
    _bar_chart(
        workclass_income, x='workclass', y='count', hue='income',
        figsize=(12, 6), rotate_ticks=True,
        title='Jumlah Individu berdasarkan Jenis Pekerjaan dan Pendapatan',
        insight="**Insight:** This plot illustrates the distribution of individuals by their job types and income levels, highlighting job categories that attract higher income.",
    )


def _plot_capital_gain_by_education(data):
    """Bar plot of mean ``capital-gain`` per ``education`` level."""
    st.subheader("Average Capital Gain Based on Education Level")
    if _missing_columns(data, ['education', 'capital-gain']):
        st.error(_REQUIRED_COLUMNS_ERROR)
        return
    capital_gain_education = data.groupby('education')['capital-gain'].mean().reset_index()
    _bar_chart(
        capital_gain_education, x='education', y='capital-gain',
        figsize=(12, 6), rotate_ticks=True,
        title='Rata-rata Keuntungan Modal berdasarkan Tingkat Pendidikan',
        insight="**Insight:** This bar plot indicates the average capital gain across different education levels, suggesting that higher education is associated with greater financial gains.",
    )


def _plot_hours_by_income(data):
    """Bar plot of summed ``hours-per-week`` per ``income`` category."""
    st.subheader("Total Hours Worked Based on Income Category")
    if _missing_columns(data, ['income', 'hours-per-week']):
        st.error(_REQUIRED_COLUMNS_ERROR)
        return
    hours_income = data.groupby('income')['hours-per-week'].sum().reset_index()
    _bar_chart(
        hours_income, x='income', y='hours-per-week', figsize=(8, 5),
        title='Total Jam Kerja berdasarkan Kategori Pendapatan',
        xlabel='Kategori Pendapatan', ylabel='Total Jam Kerja',
        insight="**Insight:** This plot shows the total number of hours worked for each income category, indicating the relationship between working hours and income.",
    )


def _plot_marital_income(data):
    """Bar plot of row counts per (``marital-status``, ``income``) pair."""
    st.subheader("Count by Marital Status and Income")
    if _missing_columns(data, ['marital-status', 'income']):
        st.error(_REQUIRED_COLUMNS_ERROR)
        return
    relationship_income = data.groupby(['marital-status', 'income']).size().reset_index(name='count')
    _bar_chart(
        relationship_income, x='marital-status', y='count', hue='income',
        figsize=(12, 6), rotate_ticks=True,
        title='Jumlah Individu berdasarkan Status Perkawinan dan Pendapatan',
        insight="**Insight:** This plot shows the distribution of individuals by marital status and income category, providing insights into how marital status may affect income.",
    )


def _plot_phik_matrix(data):
    """Annotated heatmap of the Phik correlation matrix of *data*."""
    st.subheader("Phik Correlation Matrix")
    required_columns = ['income', 'age', 'capital-gain', 'hours-per-week', 'marital-status', 'education', 'workclass']
    missing_cols = _missing_columns(data, required_columns)
    if missing_cols:
        st.error(f"Required columns not found in the dataset: {', '.join(missing_cols)}")
        return
    # NOTE(review): the matrix is computed over every column of `data`, not
    # only `required_columns`, and no sampling happens despite the plot
    # title. Left unchanged; confirm whether a subset/sample was intended.
    phik_corr = data.phik_matrix()
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(phik_corr, annot=True, fmt=".2f", cmap='coolwarm', square=True, ax=ax)
    ax.set_title('Phik Correlation Matrix (Sampled Data)')
    _show_chart(fig, "**Insight:** The Phik correlation matrix reveals the strength and direction of relationships between variables, helping identify multicollinearity and associations within the dataset.")


def run_eda():
    """Render the EDA page: a sidebar chart menu plus the selected chart.

    Loads the dataset through ``load_data()``. If loading fails, an error is
    shown and nothing else is rendered. Otherwise column names are stripped
    of surrounding whitespace, the user picks one chart from the sidebar,
    and that chart is drawn together with a short insight text. Each chart
    reports an error instead of plotting when a column it needs is missing.
    """
    data = load_data()
    if data is None:
        st.error("Data not loaded successfully.")
        return

    # Column headers may carry stray spaces; normalise before lookups.
    data.columns = data.columns.str.strip()

    # Menu label -> renderer. Insertion order defines the order in the menu.
    charts = {
        "Age Distribution Histogram": _plot_age_histogram,
        "Average Age by Income Category": _plot_age_by_income,
        "Count by Work Class and Income": _plot_workclass_income,
        "Average Capital Gain by Education Level": _plot_capital_gain_by_education,
        "Total Hours Worked by Income Category": _plot_hours_by_income,
        "Count by Marital Status and Income": _plot_marital_income,
        "Phik Correlation Matrix": _plot_phik_matrix,
    }

    st.sidebar.title("EDA Menu")
    menu_options = st.sidebar.radio("Select a chart:", tuple(charts))
    charts[menu_options](data)
|
main.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
def _show_home():
    """Landing page: short description of the project."""
    st.subheader("Welcome to Income Inequality Analysis App!")
    st.write("""This machine learning project aims to predict individuals' income levels based on demographic and occupational characteristics. By utilizing machine learning, this classification model is expected to assist the government in formulating more targeted and efficient economic policies to reduce existing social disparities.""")


def _show_eda():
    """EDA page: delegates all rendering to the eda module."""
    st.subheader("Exploratory Data Analysis (EDA)")
    # Imported only when this page is selected.
    import eda
    eda.run_eda()


def _show_prediction():
    """Prediction page: collect inputs, then predict when the button is pressed."""
    st.subheader("Prediction")

    # Imported only when this page is selected.
    import prediction

    # The input widgets are drawn on every run so the user can edit them.
    user_input = prediction.get_user_input()

    if not st.button("Prediksi"):
        return

    try:
        # Echo the submitted values back as a table.
        st.subheader("Data Input Pengguna")
        st.table(user_input)

        prediction_result = prediction.run_modelling(user_input)

        st.subheader("Hasil Prediksi")
        # Class 0 is reported as the lower income bracket, anything else as the higher one.
        message = (
            "Berdasarkan analisa, individu dengan data tersebut memiliki penghasilan <= 50.000.000 IDR"
            if prediction_result[0] == 0
            else "Berdasarkan analisa, individu dengan data tersebut memiliki penghasilan > 50.000.000 IDR"
        )
        st.write(message)
    except Exception as e:
        # Top-level UI boundary: surface the failure to the user instead of crashing the page.
        st.error(f"An error occurred during prediction: {e}")


st.title("Income Inequality Analysis Application")

# Sidebar navigation between the pages of the app
st.sidebar.title("Navigation")
app_mode = st.sidebar.selectbox("Choose the app mode", ["Home", "EDA", "Prediction"])

_page = {
    "Home": _show_home,
    "EDA": _show_eda,
    "Prediction": _show_prediction,
}.get(app_mode)
if _page is not None:
    _page()
|
pipeline.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:053932c59540b8f49895c2cfc2d0eb49ca7d9f3cfc78162b4e1de16f129fa17c
|
| 3 |
+
size 1741327
|
prediction.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import pickle
|
| 4 |
+
|
| 5 |
+
# Load the serialized model once, at import time.
# The relative path is resolved against the current working directory.
# CAUTION: unpickling executes code embedded in the file, so pipeline.pkl
# must only ever come from a trusted source.
model_path = "pipeline.pkl"
with open(model_path, mode='rb') as model_file:
    model = pickle.load(model_file)
|
| 9 |
+
|
| 10 |
+
def run_modelling(user_input):
    """Return ``model.predict(user_input)`` for the module-level ``model``.

    Args:
        user_input: The feature data handed straight to ``model.predict``.

    Returns:
        Whatever ``model.predict`` returns for ``user_input``.
    """
    return model.predict(user_input)
|
| 13 |
+
|
| 14 |
+
# Function to get user input from the sidebar
|
| 15 |
+
# Education label -> ordinal education number. The labels are listed in the
# order they appear in the select box; the numbers follow the UCI Adult
# `education-num` encoding (Preschool=1 ... Doctorate=16).
# NOTE(review): confirm the pipeline was trained with this same encoding.
EDUCATION_NUM = {
    'Lulusan SMA': 9,
    'Sebagian Kuliah': 10,
    'Sarjana': 13,
    'Magister': 14,
    'Diploma Vokasi': 11,
    'SMA (Kelas 11)': 7,
    'Diploma Akademik': 12,
    'SMA (Kelas 10)': 6,
    'SMP (Kelas 7-8)': 4,
    'Sekolah Profesional': 15,
    'SMP (Kelas 9)': 5,
    'SMA (Kelas 12)': 8,
    'Doktor': 16,
    'SD (Kelas 5-6)': 3,
    'SD (Kelas 1-4)': 2,
    'TK (Taman Kanak-Kanak)': 1,
}


def get_user_input():
    """Draw the sidebar input widgets and return the values as one row.

    Returns:
        A single-row pandas DataFrame with the columns ``usia``,
        ``jenis_pekerjaan``, ``bobot_akhir``, ``pendidikan``,
        ``nomor_pendidikan``, ``status_perkawinan``, ``pekerjaan``,
        ``hubungan``, ``ras``, ``jenis_kelamin``, ``keuntungan_modal``,
        ``kerugian_modal``, ``jam_kerja`` and ``negara_asal``.
        ``nomor_pendidikan`` is derived from the selected education level
        instead of being a fixed constant.
    """
    sidebar = st.sidebar
    sidebar.header("Input Parameters")

    # Every widget gets an explicit, unique key.
    age = sidebar.number_input("Usia", min_value=0, max_value=120, value=25, key='age_input')

    workclass = sidebar.selectbox(
        "Jenis Pekerjaan",
        ["Swasta", "Wiraswasta (Tidak Terdaftar)", "Wiraswasta (Terdaftar)",
         "Pemerintah Pusat", "Pemerintah Lokal", "Pemerintah Daerah",
         "Tanpa Bayaran", "Belum Pernah Bekerja"],
        key='workclass_input',
    )
    education = sidebar.selectbox("Pendidikan", list(EDUCATION_NUM), key='education_input')
    # Weights, monetary amounts and hours cannot be negative, so bound them at 0.
    fnlwgt = sidebar.number_input("Bobot Akhir", min_value=0, value=0, key='fnlwgt_input')
    marital_status = sidebar.selectbox(
        "Status Perkawinan",
        ['Menikah', 'Belum Pernah Menikah',
         'Bercerai', 'Berpisah', 'Duda/Janda',
         'Menikah (Pasangan Tidak Ada)',
         'Menikah (Pasangan di Militer)'],
        key='marital_status_input',
    )
    occupation = sidebar.selectbox(
        "Pekerjaan",
        ['Profesional', 'Perbaikan Kerajinan',
         'Eksekutif/Manajerial', 'Administrasi/Klerikal',
         'Penjualan', 'Layanan Lain',
         'Operator Mesin/Inspeksi', 'Transportasi/Pengemudi',
         'Pembersih/Tenaga Kasar', 'Pertanian/Perikanan',
         'Dukungan Teknis', 'Layanan Perlindungan',
         'Pelayan Rumah Tangga', 'Angkatan Bersenjata'],
        key='occupation_input',
    )
    relationship = sidebar.selectbox(
        "Hubungan",
        ["Suami", "Tidak Dalam Keluarga", "Anak Sendiri", "Unmarried", "Istri", "Kerabat Lain"],
        key='relationship_input',
    )
    race = sidebar.selectbox(
        "Ras",
        ["Caucasian (Putih)", "Afrika (Hitam)", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Lain-lain"],
        key='race_input',
    )
    gender = sidebar.selectbox("Jenis kelamin", ['Perempuan', 'Laki-Laki'], key='gender_input')
    capital_gain = sidebar.number_input("Keuntungan Modal", min_value=0, value=0, key='capital_gain_input')
    capital_loss = sidebar.number_input("Kerugian Modal", min_value=0, value=0, key='capital_loss_input')
    hours_per_week = sidebar.number_input("Jam Kerja per Minggu", min_value=0, value=40, key='hours_per_week_input')
    native_country = sidebar.selectbox(
        "Negara Asal",
        ['United-States', 'Cambodia', 'England',
         'Puerto-Rico', 'Canada', 'Germany',
         'India', 'Japan', 'Greece', 'South',
         'China', 'Cuba', 'Iran', 'Honduras',
         'Philippines', 'Italy', 'Poland',
         'Columbia', 'Mexico', 'Portugal',
         'South Africa', 'Taiwan', 'Thailand',
         'Yugoslavia'],
        key='native_country_input',
    )

    # Assemble one row in the column layout expected by the caller.
    user_input = pd.DataFrame({
        'usia': [age],
        'jenis_pekerjaan': [workclass],
        'bobot_akhir': [fnlwgt],
        'pendidikan': [education],
        # Keep the numeric education level consistent with the chosen label.
        'nomor_pendidikan': [EDUCATION_NUM[education]],
        'status_perkawinan': [marital_status],
        'pekerjaan': [occupation],
        'hubungan': [relationship],
        'ras': [race],
        'jenis_kelamin': [gender],
        'keuntungan_modal': [capital_gain],
        'kerugian_modal': [capital_loss],
        'jam_kerja': [hours_per_week],
        'negara_asal': [native_country],
    })

    return user_input
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data manipulation and plotting
|
| 2 |
+
pandas==2.2.2
|
| 3 |
+
matplotlib==3.8.4
|
| 4 |
+
seaborn==0.13.2
|
| 5 |
+
|
| 6 |
+
# Encoding and saving models
|
| 7 |
+
category_encoders==2.6.4
|
| 8 |
+
pickle-mixin==1.0.2
|
| 9 |
+
|
| 10 |
+
# Machine Learning & Preprocessing
|
| 11 |
+
scikit-learn==1.4.2
|
| 12 |
+
imblearn
|
| 13 |
+
phik==0.12.4
|
| 14 |
+
|
| 15 |
+
# Statistical tests and distributions
|
| 16 |
+
scipy==1.13.1
|