File size: 13,674 Bytes
8dfedb7
 
 
835b9ef
 
8dfedb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f1dd18
8dfedb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
'''
Income Prediction App (Streamlit).

Course material by Dr. Eng. Farrikh Alzami, M.Kom - Universitas Dian Nuswantoro
'''
import streamlit as st

# Page configuration - MUST be first Streamlit command
# page_icon has to be a real emoji character; a mis-decoded byte sequence is
# not a valid icon and would show up as garbage in the browser tab.
st.set_page_config(
    page_title="Income Prediction App - Materi Dr.Eng. Farrikh Alzami, M.Kom",
    page_icon="💰",
    layout="wide",
    initial_sidebar_state="collapsed"
)

import pandas as pd
import numpy as np
import joblib
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import json

# Load model components
@st.cache_resource
def load_model():
    """Return the model bundle stored in the joblib file under ``./src``.

    The function is decorated with ``st.cache_resource``. When the file is
    missing, or loading fails for any other reason, an error message is
    rendered in the app and the current script run is halted with
    ``st.stop()``.
    """
    bundle_path = './src/income_prediction_components.joblib'
    try:
        bundle = joblib.load(bundle_path)
    except FileNotFoundError:
        st.error("Model file 'income_prediction_components.joblib' not found!")
        st.stop()
    except Exception as exc:
        # Anything else (corrupt file, version mismatch, ...) is reported verbatim.
        st.error(f"Error loading model: {str(exc)}")
        st.stop()
    else:
        return bundle

def predict_income(data, model_components):
    """Predict the income class for a single record.

    Args:
        data: Either a dict of raw feature values (one record) or a
            DataFrame. Only the first row of a DataFrame is scored.
        model_components: Mapping with the keys ``'model'`` (an object
            exposing ``predict`` and ``predict_proba``), ``'encoding_maps'``
            (per-column dicts mapping a raw category to its encoded value,
            including an ``'income'`` entry for the target) and
            ``'feature_names'`` (the columns passed to the model, in order).

    Returns:
        dict with ``prediction`` (int class value), ``prediction_label``
        (the raw income label), ``probability`` (float probability of the
        predicted class) and ``probabilities`` (list, one entry per class).

    Raises:
        KeyError: if a feature listed in ``feature_names`` is absent.
        ValueError: if a model feature holds a category that has no entry in
            its encoding map.
    """
    # Convert to DataFrame if needed
    if isinstance(data, dict):
        df = pd.DataFrame([data])
    else:
        df = data.copy()

    # Get components
    model = model_components['model']
    encoding_maps = model_components['encoding_maps']
    feature_names = model_components['feature_names']

    # Fail early with a readable message instead of pandas' indexing error.
    missing = [name for name in feature_names if name not in df.columns]
    if missing:
        raise KeyError(f"Missing required feature(s): {missing}")

    # Apply encodings to categorical columns
    for column in df.columns:
        if column in encoding_maps and column != 'income':
            encoded = df[column].map(encoding_maps[column])
            # Series.map turns unmapped categories into NaN; for a column the
            # model consumes that would silently distort the prediction.
            if column in feature_names:
                unknown = df.loc[encoded.isna() & df[column].notna(), column]
                if not unknown.empty:
                    values = sorted(set(map(str, unknown)))
                    raise ValueError(
                        f"Unknown value(s) for '{column}': {values}"
                    )
            df[column] = encoded

    # Ensure we only use features that the model was trained on
    df_for_pred = df[feature_names].copy()

    # Make prediction
    prediction = model.predict(df_for_pred)[0]
    probabilities = model.predict_proba(df_for_pred)[0]

    # predict_proba columns are ordered like model.classes_, so look up the
    # position of the predicted class rather than using its value as an index.
    classes = getattr(model, 'classes_', None)
    if classes is not None:
        class_index = list(classes).index(prediction)
    else:
        class_index = int(prediction)

    # Get income label
    income_map_inverse = {v: k for k, v in encoding_maps['income'].items()}
    prediction_label = income_map_inverse[prediction]

    return {
        'prediction': int(prediction),
        'prediction_label': prediction_label,
        'probability': float(probabilities[class_index]),
        'probabilities': probabilities.tolist()
    }

def validate_inputs(data):
    """Check the numeric fields of ``data`` against their allowed ranges.

    Args:
        data: Mapping that provides ``age``, ``education_num``,
            ``hours_per_week``, ``capital_gain``, ``capital_loss`` and
            ``fnlwgt``.

    Returns:
        A list of human-readable messages, one per field outside its
        inclusive range, in the order the fields are checked. The list is
        empty when every field is in range.
    """
    # (field, lowest allowed, highest allowed, message when out of range)
    range_rules = (
        ('age', 17, 90, "Age should be between 17 and 90"),
        ('education_num', 1, 16, "Education number should be between 1 and 16"),
        ('hours_per_week', 1, 99, "Hours per week should be between 1 and 99"),
        ('capital_gain', 0, 99999, "Capital gain should be between 0 and 99999"),
        ('capital_loss', 0, 4356, "Capital loss should be between 0 and 4356"),
        ('fnlwgt', 12285, 1484705, "Final weight should be between 12285 and 1484705"),
    )

    return [
        message
        for field, lowest, highest, message in range_rules
        if data[field] < lowest or data[field] > highest
    ]

def export_prediction(data, result):
    """Serialize one prediction (inputs plus outcome) as a JSON string.

    Args:
        data: The input record that was scored; embedded as ``input_data``.
        result: Mapping providing ``prediction_label``, ``probability`` and
            ``prediction``.

    Returns:
        A JSON document (2-space indent) with the keys ``timestamp`` (local
        time in ISO format), ``input_data`` and ``prediction``.
    """
    stamp = datetime.now().isoformat()
    outcome = {
        'class': result['prediction_label'],
        'confidence': result['probability'],
        'raw_prediction': result['prediction'],
    }
    payload = dict(timestamp=stamp, input_data=data, prediction=outcome)
    return json.dumps(payload, indent=2)

def reset_session_state():
    """Remove every form-input key from ``st.session_state``.

    Keys that are not currently present are skipped.
    """
    widget_keys = (
        'age', 'workclass', 'fnlwgt', 'education_num', 'marital_status',
        'occupation', 'relationship', 'race', 'sex', 'capital_gain',
        'capital_loss', 'hours_per_week', 'native_country',
    )
    state = st.session_state
    for name in widget_keys:
        if name not in state:
            continue
        del state[name]

# Load model
# Executed at module level, i.e. on every script run; load_model() is wrapped
# in st.cache_resource and stops the script itself if loading fails.
model_components = load_model()

# Choices offered by the select boxes (from the original notebook).
# NOTE(review): spellings such as 'Columbia', 'Trinadad&Tobago' and
# 'Holand-Netherlands' presumably mirror the category labels the encoders
# were fitted on - confirm against encoding_maps before "correcting" them.
workclass_options = [
    'State-gov',
    'Self-emp-not-inc',
    'Private',
    'Federal-gov',
    'Local-gov',
    'Self-emp-inc',
    'Without-pay',
    'Never-worked',
]

marital_status_options = [
    'Never-married',
    'Married-civ-spouse',
    'Divorced',
    'Married-spouse-absent',
    'Separated',
    'Married-AF-spouse',
    'Widowed',
]

occupation_options = [
    'Adm-clerical',
    'Exec-managerial',
    'Handlers-cleaners',
    'Prof-specialty',
    'Other-service',
    'Sales',
    'Craft-repair',
    'Transport-moving',
    'Farming-fishing',
    'Machine-op-inspct',
    'Tech-support',
    'Protective-serv',
    'Armed-Forces',
    'Priv-house-serv',
]

relationship_options = [
    'Not-in-family',
    'Husband',
    'Wife',
    'Own-child',
    'Unmarried',
    'Other-relative',
]

race_options = [
    'White',
    'Black',
    'Asian-Pac-Islander',
    'Amer-Indian-Eskimo',
    'Other',
]

sex_options = ['Male', 'Female']

native_country_options = [
    'United-States',
    'Cuba',
    'Jamaica',
    'India',
    'Mexico',
    'South',
    'Puerto-Rico',
    'Honduras',
    'England',
    'Canada',
    'Germany',
    'Iran',
    'Philippines',
    'Italy',
    'Poland',
    'Columbia',
    'Cambodia',
    'Thailand',
    'Ecuador',
    'Laos',
    'Taiwan',
    'Haiti',
    'Portugal',
    'Dominican-Republic',
    'El-Salvador',
    'France',
    'Guatemala',
    'China',
    'Japan',
    'Yugoslavia',
    'Peru',
    'Outlying-US(Guam-USVI-etc)',
    'Scotland',
    'Trinadad&Tobago',
    'Greece',
    'Nicaragua',
    'Vietnam',
    'Hong',
    'Ireland',
    'Hungary',
    'Holand-Netherlands',
]

# Main app
st.title("💰 Income Prediction App - Dr. Eng. Farrikh Alzami, M.Kom")
st.markdown("Predict whether income exceeds $50K/year based on demographic data")

# Create two columns for layout: inputs on the left (col1, twice as wide),
# results on the right (col2, filled in by the prediction handler).
col1, col2 = st.columns([2, 1])

with col1:
    st.subheader("📝 Input Features")

    # Create form for inputs; every widget key matches an entry in
    # reset_session_state() so the Reset button can clear it.
    with st.form("prediction_form"):
        # Demographic Information
        st.markdown("**Demographic Information**")
        col_demo1, col_demo2 = st.columns(2)

        with col_demo1:
            age = st.number_input("Age", min_value=17, max_value=90, value=39, key="age")
            sex = st.selectbox("Sex", sex_options, key="sex")
            race = st.selectbox("Race", race_options, key="race")

        with col_demo2:
            marital_status = st.selectbox("Marital Status", marital_status_options, key="marital_status")
            relationship = st.selectbox("Relationship", relationship_options, key="relationship")
            native_country = st.selectbox("Native Country", native_country_options, key="native_country")

        st.divider()

        # Work Information
        st.markdown("**Work Information**")
        col_work1, col_work2 = st.columns(2)

        with col_work1:
            workclass = st.selectbox("Work Class", workclass_options, key="workclass")
            occupation = st.selectbox("Occupation", occupation_options, key="occupation")
            hours_per_week = st.number_input("Hours per Week", min_value=1, max_value=99, value=40, key="hours_per_week")

        with col_work2:
            education_num = st.number_input("Education Level (Years)", min_value=1, max_value=16, value=10, key="education_num")
            fnlwgt = st.number_input("Final Weight", min_value=12285, max_value=1484705, value=77516, key="fnlwgt")

        st.divider()

        # Financial Information
        st.markdown("**Financial Information**")
        col_fin1, col_fin2 = st.columns(2)

        with col_fin1:
            capital_gain = st.number_input("Capital Gain", min_value=0, max_value=99999, value=0, key="capital_gain")

        with col_fin2:
            capital_loss = st.number_input("Capital Loss", min_value=0, max_value=4356, value=0, key="capital_loss")

        # Buttons: all three are form submit buttons; the handlers further
        # down check which one was pressed.
        col_btn1, col_btn2, col_btn3 = st.columns(3)
        with col_btn1:
            predict_button = st.form_submit_button("🔮 Predict", type="primary")
        with col_btn2:
            reset_button = st.form_submit_button("🔄 Reset")
        with col_btn3:
            export_button = st.form_submit_button("📤 Export Last Result")

# Handle reset button
# Drops the stored widget values and reruns the script; presumably each input
# then falls back to the default `value=` declared on its widget.
if reset_button:
    reset_session_state()
    st.rerun()

# Handle prediction
if predict_button:
    # Collect input data; keys are the column names handed to predict_income().
    input_data = {
        'age': age,
        'workclass': workclass,
        'fnlwgt': fnlwgt,
        'education_num': education_num,
        'marital_status': marital_status,
        'occupation': occupation,
        'relationship': relationship,
        'race': race,
        'sex': sex,
        'capital_gain': capital_gain,
        'capital_loss': capital_loss,
        'hours_per_week': hours_per_week,
        'native_country': native_country
    }

    # Validate inputs (same bounds as the number_input widgets above)
    validation_errors = validate_inputs(input_data)

    if validation_errors:
        with col2:
            st.error("❌ Validation Errors:")
            for error in validation_errors:
                st.error(f"• {error}")
    else:
        # Make prediction
        try:
            result = predict_income(input_data, model_components)

            # Store result in session state for export
            st.session_state['last_prediction'] = {
                'input_data': input_data,
                'result': result
            }

            with col2:
                st.subheader("🎯 Prediction Results")

                # Display prediction; the colour doubles as the gauge bar colour.
                prediction_color = "green" if result['prediction_label'] == '>50K' else "orange"
                st.markdown(f"**Predicted Income:** :{prediction_color}[{result['prediction_label']}]")

                # Confidence level with gauge (probability of the predicted
                # class, scaled to a percentage)
                confidence = result['probability'] * 100

                fig_gauge = go.Figure(go.Indicator(
                    mode = "gauge+number+delta",
                    value = confidence,
                    domain = {'x': [0, 1], 'y': [0, 1]},
                    title = {'text': "Confidence Level (%)"},
                    gauge = {
                        'axis': {'range': [None, 100]},
                        'bar': {'color': prediction_color},
                        'steps': [
                            {'range': [0, 50], 'color': "lightgray"},
                            {'range': [50, 80], 'color': "yellow"},
                            {'range': [80, 100], 'color': "lightgreen"}
                        ],
                        'threshold': {
                            'line': {'color': "red", 'width': 4},
                            'thickness': 0.75,
                            'value': 90
                        }
                    }
                ))
                fig_gauge.update_layout(height=300, margin=dict(l=20, r=20, t=40, b=20))
                st.plotly_chart(fig_gauge, use_container_width=True)

                # Probability breakdown
                # NOTE(review): the labels assume result['probabilities'] is
                # ordered (<=50K, >50K) - confirm against the model's classes.
                prob_df = pd.DataFrame({
                    'Class': ['≤50K', '>50K'],
                    'Probability': result['probabilities']
                })

                fig_bar = px.bar(
                    prob_df,
                    x='Class',
                    y='Probability',
                    title='Probability Distribution',
                    color='Probability',
                    color_continuous_scale=['orange', 'green']
                )
                fig_bar.update_layout(height=300, margin=dict(l=20, r=20, t=40, b=20))
                st.plotly_chart(fig_bar, use_container_width=True)

        except Exception as e:
            # UI boundary: surface any prediction/plotting failure to the user
            # instead of crashing the page.
            with col2:
                st.error(f"❌ Prediction Error: {str(e)}")

# Feature Importance section
st.subheader("📊 Feature Importance")

if 'model' in model_components:
    try:
        feature_names = model_components['feature_names']
        # Raises AttributeError for estimators without feature_importances_;
        # that is reported through the except branch below.
        feature_importance = model_components['model'].feature_importances_

        # Ascending sort so the most important feature ends up at the top of
        # the horizontal bar chart.
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': feature_importance
        }).sort_values('Importance', ascending=True)

        fig_importance = px.bar(
            importance_df,
            x='Importance',
            y='Feature',
            orientation='h',
            title='Feature Importance in Decision Tree Model',
            color='Importance',
            color_continuous_scale='viridis'
        )
        fig_importance.update_layout(height=400, margin=dict(l=20, r=20, t=40, b=20))
        st.plotly_chart(fig_importance, use_container_width=True)

    except Exception as e:
        st.error(f"Error displaying feature importance: {str(e)}")

# Handle export
# Uses the prediction stored in session state by the prediction handler, so
# the export reflects the last successful prediction of this session.
if export_button:
    if 'last_prediction' in st.session_state:
        export_data = export_prediction(
            st.session_state['last_prediction']['input_data'],
            st.session_state['last_prediction']['result']
        )

        st.download_button(
            label="📥 Download Prediction Results",
            data=export_data,
            file_name=f"income_prediction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
            mime="application/json"
        )
    else:
        st.warning("⚠️ No prediction results to export. Please make a prediction first.")

# Footer: horizontal rule followed by an italic credit line
st.markdown("---")
st.markdown("*Built with Streamlit • Dr. Eng. Farrikh Alzami, M.Kom*")