ebhon committed on
Commit
5d2542e
·
verified ·
1 Parent(s): a2bb9a7

Update eda.py

Browse files
Files changed (1) hide show
  1. eda.py +142 -142
eda.py CHANGED
@@ -1,142 +1,142 @@
1
- import pandas as pd
2
- import matplotlib.pyplot as plt
3
- import seaborn as sns
4
- import streamlit as st
5
- import os
6
- from phik import phik_matrix
7
-
8
- # Path to dataset
9
- data_path = r"C:\Users\handw\Documents\FTDS\p1-ftds036-rmt-m2-ebhon\deployment\adult.csv"
10
-
11
- # Load dataset
12
- @st.cache_data
13
- def load_data():
14
- if not os.path.isfile(data_path):
15
- st.error(f"File not found: {data_path}")
16
- return None
17
- return pd.read_csv(data_path)
18
-
19
- def run_eda():
20
- # Load data
21
- data = load_data()
22
-
23
- # Check if data is loaded successfully
24
- if data is not None:
25
- # Trim whitespace from column names
26
- data.columns = data.columns.str.strip()
27
-
28
- # Sidebar for chart selection
29
- st.sidebar.title("EDA Menu")
30
- menu_options = st.sidebar.radio("Select a chart:",
31
- ("Age Distribution Histogram",
32
- "Average Age by Income Category",
33
- "Count by Work Class and Income",
34
- "Average Capital Gain by Education Level",
35
- "Total Hours Worked by Income Category",
36
- "Count by Marital Status and Income",
37
- "Phik Correlation Matrix"))
38
-
39
- # Histogram of Age distribution
40
- if menu_options == "Age Distribution Histogram":
41
- st.subheader("Histogram of Age Distribution")
42
- if 'age' in data.columns:
43
- plt.figure(figsize=(10, 6))
44
- sns.histplot(data['age'], bins=30, kde=True)
45
- plt.title('Distribusi Usia')
46
- plt.xlabel('Usia')
47
- plt.ylabel('Frekuensi')
48
- st.pyplot(plt)
49
- st.write("**Insight:** This histogram shows the age distribution of individuals in the dataset, indicating how age varies among the population.")
50
- else:
51
- st.error("Column 'age' not found in the dataset.")
52
-
53
- # Average Age by Income Category
54
- if menu_options == "Average Age by Income Category":
55
- st.subheader("Average Age Based on Income Category")
56
- if 'income' in data.columns and 'age' in data.columns:
57
- age_income = data.groupby('income')['age'].mean().reset_index() # Group age by income
58
- plt.figure(figsize=(10, 6))
59
- sns.barplot(data=age_income, x='income', y='age')
60
- plt.title('Rata-rata Usia berdasarkan Kategori Pendapatan')
61
- plt.xlabel('Kategori Pendapatan')
62
- plt.ylabel('Rata-rata Usia')
63
- st.pyplot(plt)
64
- st.write("**Insight:** This bar plot displays the average age of individuals based on income categories, showing how age correlates with income.")
65
- else:
66
- st.error("Required columns not found in the dataset.")
67
-
68
- # Count by Work Class and Income
69
- if menu_options == "Count by Work Class and Income":
70
- st.subheader("Count by Work Class and Income")
71
- if 'workclass' in data.columns and 'income' in data.columns:
72
- workclass_income = data.groupby(['workclass', 'income']).size().reset_index(name='count')
73
- plt.figure(figsize=(12, 6))
74
- sns.barplot(data=workclass_income, x='workclass', y='count', hue='income')
75
- plt.title('Jumlah Individu berdasarkan Jenis Pekerjaan dan Pendapatan')
76
- plt.xticks(rotation=45)
77
- st.pyplot(plt)
78
- st.write("**Insight:** This plot illustrates the distribution of individuals by their job types and income levels, highlighting job categories that attract higher income.")
79
- else:
80
- st.error("Required columns not found in the dataset.")
81
-
82
- # Average Capital Gain by Education Level
83
- if menu_options == "Average Capital Gain by Education Level":
84
- st.subheader("Average Capital Gain Based on Education Level")
85
- if 'education' in data.columns and 'capital-gain' in data.columns:
86
- capital_gain_education = data.groupby('education')['capital-gain'].mean().reset_index()
87
- plt.figure(figsize=(12, 6))
88
- sns.barplot(data=capital_gain_education, x='education', y='capital-gain')
89
- plt.title('Rata-rata Keuntungan Modal berdasarkan Tingkat Pendidikan')
90
- plt.xticks(rotation=45)
91
- st.pyplot(plt)
92
- st.write("**Insight:** This bar plot indicates the average capital gain across different education levels, suggesting that higher education is associated with greater financial gains.")
93
- else:
94
- st.error("Required columns not found in the dataset.")
95
-
96
- # Total Hours Worked by Income Category
97
- if menu_options == "Total Hours Worked by Income Category":
98
- st.subheader("Total Hours Worked Based on Income Category")
99
- if 'income' in data.columns and 'hours-per-week' in data.columns:
100
- hours_income = data.groupby('income')['hours-per-week'].sum().reset_index()
101
- plt.figure(figsize=(8, 5))
102
- sns.barplot(data=hours_income, x='income', y='hours-per-week')
103
- plt.title('Total Jam Kerja berdasarkan Kategori Pendapatan')
104
- plt.xlabel('Kategori Pendapatan')
105
- plt.ylabel('Total Jam Kerja')
106
- st.pyplot(plt)
107
- st.write("**Insight:** This plot shows the total number of hours worked for each income category, indicating the relationship between working hours and income.")
108
- else:
109
- st.error("Required columns not found in the dataset.")
110
-
111
- # Count by Marital Status and Income
112
- if menu_options == "Count by Marital Status and Income":
113
- st.subheader("Count by Marital Status and Income")
114
- if 'marital-status' in data.columns and 'income' in data.columns:
115
- relationship_income = data.groupby(['marital-status', 'income']).size().reset_index(name='count')
116
- plt.figure(figsize=(12, 6))
117
- sns.barplot(data=relationship_income, x='marital-status', y='count', hue='income')
118
- plt.title('Jumlah Individu berdasarkan Status Perkawinan dan Pendapatan')
119
- plt.xticks(rotation=45)
120
- st.pyplot(plt)
121
- st.write("**Insight:** This plot shows the distribution of individuals by marital status and income category, providing insights into how marital status may affect income.")
122
- else:
123
- st.error("Required columns not found in the dataset.")
124
-
125
- # Phik Correlation Matrix
126
- if menu_options == "Phik Correlation Matrix":
127
- st.subheader("Phik Correlation Matrix")
128
- # List the required columns
129
- required_columns = ['income', 'age', 'capital-gain', 'hours-per-week', 'marital-status', 'education', 'workclass']
130
- if all(col in data.columns for col in required_columns):
131
- # Calculate the Phik correlation matrix
132
- phik_corr = data.phik_matrix()
133
- plt.figure(figsize=(12, 8))
134
- sns.heatmap(phik_corr, annot=True, fmt=".2f", cmap='coolwarm', square=True)
135
- plt.title('Phik Correlation Matrix (Sampled Data)')
136
- st.pyplot(plt)
137
- st.write("**Insight:** The Phik correlation matrix reveals the strength and direction of relationships between variables, helping identify multicollinearity and associations within the dataset.")
138
- else:
139
- missing_cols = [col for col in required_columns if col not in data.columns]
140
- st.error(f"Required columns not found in the dataset: {', '.join(missing_cols)}")
141
- else:
142
- st.error("Data not loaded successfully.")
 
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+ import streamlit as st
5
+ import os
6
+ from phik import phik_matrix
7
+
8
+ # Path to dataset
9
+ data_path = 'adult.csv'
10
+
11
+ # Load dataset
12
+ @st.cache_data
13
+ def load_data():
14
+ if not os.path.isfile(data_path):
15
+ st.error(f"File not found: {data_path}")
16
+ return None
17
+ return pd.read_csv(data_path)
18
+
19
+ def run_eda():
20
+ # Load data
21
+ data = load_data()
22
+
23
+ # Check if data is loaded successfully
24
+ if data is not None:
25
+ # Trim whitespace from column names
26
+ data.columns = data.columns.str.strip()
27
+
28
+ # Sidebar for chart selection
29
+ st.sidebar.title("EDA Menu")
30
+ menu_options = st.sidebar.radio("Select a chart:",
31
+ ("Age Distribution Histogram",
32
+ "Average Age by Income Category",
33
+ "Count by Work Class and Income",
34
+ "Average Capital Gain by Education Level",
35
+ "Total Hours Worked by Income Category",
36
+ "Count by Marital Status and Income",
37
+ "Phik Correlation Matrix"))
38
+
39
+ # Histogram of Age distribution
40
+ if menu_options == "Age Distribution Histogram":
41
+ st.subheader("Histogram of Age Distribution")
42
+ if 'age' in data.columns:
43
+ plt.figure(figsize=(10, 6))
44
+ sns.histplot(data['age'], bins=30, kde=True)
45
+ plt.title('Distribusi Usia')
46
+ plt.xlabel('Usia')
47
+ plt.ylabel('Frekuensi')
48
+ st.pyplot(plt)
49
+ st.write("**Insight:** This histogram shows the age distribution of individuals in the dataset, indicating how age varies among the population.")
50
+ else:
51
+ st.error("Column 'age' not found in the dataset.")
52
+
53
+ # Average Age by Income Category
54
+ if menu_options == "Average Age by Income Category":
55
+ st.subheader("Average Age Based on Income Category")
56
+ if 'income' in data.columns and 'age' in data.columns:
57
+ age_income = data.groupby('income')['age'].mean().reset_index() # Group age by income
58
+ plt.figure(figsize=(10, 6))
59
+ sns.barplot(data=age_income, x='income', y='age')
60
+ plt.title('Rata-rata Usia berdasarkan Kategori Pendapatan')
61
+ plt.xlabel('Kategori Pendapatan')
62
+ plt.ylabel('Rata-rata Usia')
63
+ st.pyplot(plt)
64
+ st.write("**Insight:** This bar plot displays the average age of individuals based on income categories, showing how age correlates with income.")
65
+ else:
66
+ st.error("Required columns not found in the dataset.")
67
+
68
+ # Count by Work Class and Income
69
+ if menu_options == "Count by Work Class and Income":
70
+ st.subheader("Count by Work Class and Income")
71
+ if 'workclass' in data.columns and 'income' in data.columns:
72
+ workclass_income = data.groupby(['workclass', 'income']).size().reset_index(name='count')
73
+ plt.figure(figsize=(12, 6))
74
+ sns.barplot(data=workclass_income, x='workclass', y='count', hue='income')
75
+ plt.title('Jumlah Individu berdasarkan Jenis Pekerjaan dan Pendapatan')
76
+ plt.xticks(rotation=45)
77
+ st.pyplot(plt)
78
+ st.write("**Insight:** This plot illustrates the distribution of individuals by their job types and income levels, highlighting job categories that attract higher income.")
79
+ else:
80
+ st.error("Required columns not found in the dataset.")
81
+
82
+ # Average Capital Gain by Education Level
83
+ if menu_options == "Average Capital Gain by Education Level":
84
+ st.subheader("Average Capital Gain Based on Education Level")
85
+ if 'education' in data.columns and 'capital-gain' in data.columns:
86
+ capital_gain_education = data.groupby('education')['capital-gain'].mean().reset_index()
87
+ plt.figure(figsize=(12, 6))
88
+ sns.barplot(data=capital_gain_education, x='education', y='capital-gain')
89
+ plt.title('Rata-rata Keuntungan Modal berdasarkan Tingkat Pendidikan')
90
+ plt.xticks(rotation=45)
91
+ st.pyplot(plt)
92
+ st.write("**Insight:** This bar plot indicates the average capital gain across different education levels, suggesting that higher education is associated with greater financial gains.")
93
+ else:
94
+ st.error("Required columns not found in the dataset.")
95
+
96
+ # Total Hours Worked by Income Category
97
+ if menu_options == "Total Hours Worked by Income Category":
98
+ st.subheader("Total Hours Worked Based on Income Category")
99
+ if 'income' in data.columns and 'hours-per-week' in data.columns:
100
+ hours_income = data.groupby('income')['hours-per-week'].sum().reset_index()
101
+ plt.figure(figsize=(8, 5))
102
+ sns.barplot(data=hours_income, x='income', y='hours-per-week')
103
+ plt.title('Total Jam Kerja berdasarkan Kategori Pendapatan')
104
+ plt.xlabel('Kategori Pendapatan')
105
+ plt.ylabel('Total Jam Kerja')
106
+ st.pyplot(plt)
107
+ st.write("**Insight:** This plot shows the total number of hours worked for each income category, indicating the relationship between working hours and income.")
108
+ else:
109
+ st.error("Required columns not found in the dataset.")
110
+
111
+ # Count by Marital Status and Income
112
+ if menu_options == "Count by Marital Status and Income":
113
+ st.subheader("Count by Marital Status and Income")
114
+ if 'marital-status' in data.columns and 'income' in data.columns:
115
+ relationship_income = data.groupby(['marital-status', 'income']).size().reset_index(name='count')
116
+ plt.figure(figsize=(12, 6))
117
+ sns.barplot(data=relationship_income, x='marital-status', y='count', hue='income')
118
+ plt.title('Jumlah Individu berdasarkan Status Perkawinan dan Pendapatan')
119
+ plt.xticks(rotation=45)
120
+ st.pyplot(plt)
121
+ st.write("**Insight:** This plot shows the distribution of individuals by marital status and income category, providing insights into how marital status may affect income.")
122
+ else:
123
+ st.error("Required columns not found in the dataset.")
124
+
125
+ # Phik Correlation Matrix
126
+ if menu_options == "Phik Correlation Matrix":
127
+ st.subheader("Phik Correlation Matrix")
128
+ # List the required columns
129
+ required_columns = ['income', 'age', 'capital-gain', 'hours-per-week', 'marital-status', 'education', 'workclass']
130
+ if all(col in data.columns for col in required_columns):
131
+ # Calculate the Phik correlation matrix
132
+ phik_corr = data.phik_matrix()
133
+ plt.figure(figsize=(12, 8))
134
+ sns.heatmap(phik_corr, annot=True, fmt=".2f", cmap='coolwarm', square=True)
135
+ plt.title('Phik Correlation Matrix (Sampled Data)')
136
+ st.pyplot(plt)
137
+ st.write("**Insight:** The Phik correlation matrix reveals the strength and direction of relationships between variables, helping identify multicollinearity and associations within the dataset.")
138
+ else:
139
+ missing_cols = [col for col in required_columns if col not in data.columns]
140
+ st.error(f"Required columns not found in the dataset: {', '.join(missing_cols)}")
141
+ else:
142
+ st.error("Data not loaded successfully.")