Spaces:

alzami
/

ti_data_mining

Sleeping

File size: 13,674 Bytes

8dfedb7
 
 
835b9ef
 
8dfedb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f1dd18
8dfedb7

'''
Income prediction Streamlit app.

Course material by Dr. Eng. Farrikh Alzami, M.Kom - Universitas Dian Nuswantoro
'''
import streamlit as st

# Page configuration - MUST be first Streamlit command.
# This is why the call sits directly after `import streamlit as st`, ahead of
# the remaining imports and of every other `st.*` call in this script.
st.set_page_config(
    page_title="Income Prediction App - Materi Dr.Eng. Farrikh Alzami, M.Kom",
    page_icon="💰",
    layout="wide",
    initial_sidebar_state="collapsed"
)

import pandas as pd
import numpy as np
import joblib
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import json

# Load model components
@st.cache_resource
def load_model():
    """Load the trained model components from the joblib file.

    Returns:
        Whatever object ``joblib.load`` yields for
        ``./src/income_prediction_components.joblib``. The rest of this
        script indexes it with the keys ``'model'``, ``'encoding_maps'`` and
        ``'feature_names'``.

    On any loading failure an error is reported with ``st.error`` and
    ``st.stop()`` is called instead of returning.
    """
    try:
        return joblib.load('./src/income_prediction_components.joblib')
    except FileNotFoundError:
        st.error("Model file 'income_prediction_components.joblib' not found!")
    except Exception as exc:
        st.error(f"Error loading model: {str(exc)}")

    # Only reached after one of the handlers above reported a failure.
    st.stop()

def predict_income(data, model_components):
    """Make an income prediction for one record using the trained model.

    Args:
        data: Either a dict mapping feature name to raw (un-encoded) value, or
            a pandas DataFrame. The input is never modified; a dict is turned
            into a one-row DataFrame and a DataFrame is copied. Only the first
            row's prediction is reported.
        model_components: Mapping with the keys ``'model'`` (object exposing
            ``predict`` and ``predict_proba``), ``'encoding_maps'`` (per-column
            ``{raw value: encoded value}`` dicts, including one for
            ``'income'``) and ``'feature_names'`` (columns handed to the model,
            in that order).

    Returns:
        dict with:
            ``'prediction'``: encoded class as ``int``.
            ``'prediction_label'``: the ``'income'`` encoding-map key whose
                value equals the predicted class.
            ``'probability'``: probability of the predicted class as ``float``.
            ``'probabilities'``: the full first-row probability vector as a
                list.

    Raises:
        ValueError: If a column that has an encoding map contains a non-null
            value missing from that map.
        KeyError: If a required component, feature column, or the predicted
            class in the income map is missing.
    """
    # Convert to DataFrame if needed
    if isinstance(data, dict):
        df = pd.DataFrame([data])
    else:
        df = data.copy()

    # Get components
    model = model_components['model']
    encoding_maps = model_components['encoding_maps']
    feature_names = model_components['feature_names']

    # Apply encodings to categorical columns
    for column in df.columns:
        if column in encoding_maps and column != 'income':
            encoded = df[column].map(encoding_maps[column])
            # Series.map yields NaN for values absent from the mapping. Fail
            # loudly instead of silently handing NaN to the model.
            unmapped = encoded.isna() & df[column].notna()
            if unmapped.any():
                unknown_values = sorted({str(value) for value in df.loc[unmapped, column]})
                raise ValueError(
                    f"Unknown value(s) for '{column}': {', '.join(unknown_values)}"
                )
            df[column] = encoded

    # Ensure we only use features that the model was trained on
    df_for_pred = df[feature_names].copy()

    # Make prediction (first row only)
    prediction = model.predict(df_for_pred)[0]
    probabilities = model.predict_proba(df_for_pred)[0]

    # Probability columns follow the model's class order, which need not be
    # 0..n-1. Use `classes_` when the model exposes it and fall back to the
    # label-as-position behaviour otherwise.
    classes = getattr(model, 'classes_', None)
    if classes is None:
        class_position = int(prediction)
    else:
        class_position = list(classes).index(prediction)

    # Get income label
    income_map_inverse = {v: k for k, v in encoding_maps['income'].items()}
    prediction_label = income_map_inverse[prediction]

    return {
        'prediction': int(prediction),
        'prediction_label': prediction_label,
        'probability': float(probabilities[class_position]),
        'probabilities': probabilities.tolist()
    }

def validate_inputs(data):
    """Check the numeric inputs against their allowed ranges.

    Args:
        data: Mapping that contains the keys ``'age'``, ``'education_num'``,
            ``'hours_per_week'``, ``'capital_gain'``, ``'capital_loss'`` and
            ``'fnlwgt'``.

    Returns:
        A list of error messages, one per out-of-range field, in the order the
        fields are listed below. The list is empty when every value is within
        its inclusive range.

    Raises:
        KeyError: If one of the expected keys is missing from ``data``.
    """
    # (key in data, label used in the message, inclusive minimum, inclusive maximum)
    ranges = (
        ('age', "Age", 17, 90),
        ('education_num', "Education number", 1, 16),
        ('hours_per_week', "Hours per week", 1, 99),
        ('capital_gain', "Capital gain", 0, 99999),
        ('capital_loss', "Capital loss", 0, 4356),
        ('fnlwgt', "Final weight", 12285, 1484705),
    )

    return [
        f"{label} should be between {lowest} and {highest}"
        for key, label, lowest, highest in ranges
        if data[key] < lowest or data[key] > highest
    ]

def export_prediction(data, result):
    """Serialise one prediction and its inputs as a JSON string.

    Args:
        data: The input record; embedded unchanged under ``'input_data'``.
        result: Mapping with the keys ``'prediction_label'``,
            ``'probability'`` and ``'prediction'``.

    Returns:
        A JSON document (2-space indent) with the keys ``'timestamp'`` (local
        ``datetime.now()`` in ISO format), ``'input_data'`` and
        ``'prediction'`` (holding ``'class'``, ``'confidence'`` and
        ``'raw_prediction'``).
    """
    generated_at = datetime.now().isoformat()

    prediction_summary = {
        'class': result['prediction_label'],
        'confidence': result['probability'],
        'raw_prediction': result['prediction'],
    }

    payload = {
        'timestamp': generated_at,
        'input_data': data,
        'prediction': prediction_summary,
    }
    return json.dumps(payload, indent=2)

def reset_session_state():
    """Remove every input widget's key from ``st.session_state``.

    Only keys that are currently present are deleted; the stored
    ``'last_prediction'`` entry is not touched.
    """
    input_keys = (
        'age', 'workclass', 'fnlwgt', 'education_num', 'marital_status',
        'occupation', 'relationship', 'race', 'sex', 'capital_gain',
        'capital_loss', 'hours_per_week', 'native_country',
    )
    # presumably dropping a widget's key makes it fall back to its default
    # `value` on the next run - TODO confirm against Streamlit's behaviour.
    for stale_key in [key for key in input_keys if key in st.session_state]:
        del st.session_state[stale_key]

# Load model
# load_model() reports the problem and calls st.stop() when loading fails.
model_components = load_model()

# Define mappings (from the original notebook)
# Each list below is offered as the options of one selectbox in the input
# form, and the selected string is later translated through the matching
# entry of model_components['encoding_maps'] inside predict_income().
# NOTE(review): the strings therefore presumably have to match the keys of
# those encoding maps exactly. Do not "correct" spellings such as 'Columbia',
# 'Trinadad&Tobago', 'Holand-Netherlands', 'South' or 'Hong' without checking
# the maps first.
workclass_options = ['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
                    'Local-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked']

marital_status_options = ['Never-married', 'Married-civ-spouse', 'Divorced',
                         'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed']

occupation_options = ['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners', 'Prof-specialty',
                     'Other-service', 'Sales', 'Craft-repair', 'Transport-moving',
                     'Farming-fishing', 'Machine-op-inspct', 'Tech-support',
                     'Protective-serv', 'Armed-Forces', 'Priv-house-serv']

relationship_options = ['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative']

race_options = ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']

sex_options = ['Male', 'Female']

native_country_options = ['United-States', 'Cuba', 'Jamaica', 'India', 'Mexico', 'South',
                         'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany', 'Iran',
                         'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia', 'Thailand', 'Ecuador',
                         'Laos', 'Taiwan', 'Haiti', 'Portugal', 'Dominican-Republic', 'El-Salvador',
                         'France', 'Guatemala', 'China', 'Japan', 'Yugoslavia', 'Peru',
                         'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago', 'Greece',
                         'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary', 'Holand-Netherlands']

# Main app
st.title("💰 Income Prediction App - Dr. Eng. Farrikh Alzami, M.Kom")
st.markdown("Predict whether income exceeds $50K/year based on demographic data")

# Create two columns for layout
# col1 (weight 2) holds the input form; col2 (weight 1) is filled later with
# validation errors or prediction results.
col1, col2 = st.columns([2, 1])

with col1:
    st.subheader("📝 Input Features")

    # Create form for inputs
    # Every widget key below also appears in reset_session_state()'s key list,
    # and the numeric min/max values repeat the bounds in validate_inputs().
    # Keep all three in sync when changing a field.
    with st.form("prediction_form"):
        # Demographic Information
        st.markdown("**Demographic Information**")
        col_demo1, col_demo2 = st.columns(2)

        with col_demo1:
            age = st.number_input("Age", min_value=17, max_value=90, value=39, key="age")
            sex = st.selectbox("Sex", sex_options, key="sex")
            race = st.selectbox("Race", race_options, key="race")

        with col_demo2:
            marital_status = st.selectbox("Marital Status", marital_status_options, key="marital_status")
            relationship = st.selectbox("Relationship", relationship_options, key="relationship")
            native_country = st.selectbox("Native Country", native_country_options, key="native_country")

        st.divider()

        # Work Information
        st.markdown("**Work Information**")
        col_work1, col_work2 = st.columns(2)

        with col_work1:
            workclass = st.selectbox("Work Class", workclass_options, key="workclass")
            occupation = st.selectbox("Occupation", occupation_options, key="occupation")
            hours_per_week = st.number_input("Hours per Week", min_value=1, max_value=99, value=40, key="hours_per_week")

        with col_work2:
            education_num = st.number_input("Education Level (Years)", min_value=1, max_value=16, value=10, key="education_num")
            fnlwgt = st.number_input("Final Weight", min_value=12285, max_value=1484705, value=77516, key="fnlwgt")

        st.divider()

        # Financial Information
        st.markdown("**Financial Information**")
        col_fin1, col_fin2 = st.columns(2)

        with col_fin1:
            capital_gain = st.number_input("Capital Gain", min_value=0, max_value=99999, value=0, key="capital_gain")

        with col_fin2:
            capital_loss = st.number_input("Capital Loss", min_value=0, max_value=4356, value=0, key="capital_loss")

        # Buttons
        # All three are form submit buttons; the flags they return are checked
        # by the top-level handlers further down the script (reset, predict,
        # export).
        col_btn1, col_btn2, col_btn3 = st.columns(3)
        with col_btn1:
            predict_button = st.form_submit_button("🔮 Predict", type="primary")
        with col_btn2:
            reset_button = st.form_submit_button("🔄 Reset")
        with col_btn3:
            export_button = st.form_submit_button("📤 Export Last Result")

# Handle reset button
# Drop the stored widget keys, then rerun the script immediately so the rest
# of this run is skipped.
if reset_button:
    reset_session_state()
    st.rerun()

# Handle prediction
# Runs only on the script run triggered by the "Predict" submit button. The
# outcome (validation errors, results, or a prediction error) is rendered in
# the right-hand column `col2`.
if predict_button:
    # Collect input data
    # Keys are the raw feature names looked up by validate_inputs() and
    # predict_income().
    input_data = {
        'age': age,
        'workclass': workclass,
        'fnlwgt': fnlwgt,
        'education_num': education_num,
        'marital_status': marital_status,
        'occupation': occupation,
        'relationship': relationship,
        'race': race,
        'sex': sex,
        'capital_gain': capital_gain,
        'capital_loss': capital_loss,
        'hours_per_week': hours_per_week,
        'native_country': native_country
    }

    # Validate inputs
    validation_errors = validate_inputs(input_data)

    if validation_errors:
        with col2:
            st.error("❌ Validation Errors:")
            for error in validation_errors:
                st.error(f"• {error}")
    else:
        # Make prediction
        try:
            result = predict_income(input_data, model_components)

            # Store result in session state for export
            st.session_state['last_prediction'] = {
                'input_data': input_data,
                'result': result
            }

            with col2:
                st.subheader("🎯 Prediction Results")

                # Display prediction
                # The same colour name is used for the markdown label and the
                # gauge bar.
                prediction_color = "green" if result['prediction_label'] == '>50K' else "orange"
                st.markdown(f"**Predicted Income:** :{prediction_color}[{result['prediction_label']}]")

                # Confidence level with gauge
                confidence = result['probability'] * 100

                # The mode omits "delta": no delta reference is configured, so
                # Plotly would compare the value with itself and always show a
                # change of 0.
                fig_gauge = go.Figure(go.Indicator(
                    mode = "gauge+number",
                    value = confidence,
                    domain = {'x': [0, 1], 'y': [0, 1]},
                    title = {'text': "Confidence Level (%)"},
                    gauge = {
                        'axis': {'range': [None, 100]},
                        'bar': {'color': prediction_color},
                        'steps': [
                            {'range': [0, 50], 'color': "lightgray"},
                            {'range': [50, 80], 'color': "yellow"},
                            {'range': [80, 100], 'color': "lightgreen"}
                        ],
                        'threshold': {
                            'line': {'color': "red", 'width': 4},
                            'thickness': 0.75,
                            'value': 90
                        }
                    }
                ))
                fig_gauge.update_layout(height=300, margin=dict(l=20, r=20, t=40, b=20))
                st.plotly_chart(fig_gauge, use_container_width=True)

                # Probability breakdown
                # NOTE(review): the labels assume exactly two probabilities,
                # ordered as (<=50K, >50K) - confirm against the model's class
                # order.
                prob_df = pd.DataFrame({
                    'Class': ['≤50K', '>50K'],
                    'Probability': result['probabilities']
                })

                fig_bar = px.bar(
                    prob_df,
                    x='Class',
                    y='Probability',
                    title='Probability Distribution',
                    color='Probability',
                    color_continuous_scale=['orange', 'green']
                )
                fig_bar.update_layout(height=300, margin=dict(l=20, r=20, t=40, b=20))
                st.plotly_chart(fig_bar, use_container_width=True)

        except Exception as e:
            # Top-level UI boundary: any failure in prediction or rendering is
            # shown to the user instead of crashing the script run.
            with col2:
                st.error(f"❌ Prediction Error: {str(e)}")

# Feature Importance section
st.subheader("📊 Feature Importance")

# The chart is attempted only when the loaded components contain a model. Any
# failure while building it (for example a model without
# `feature_importances_`) is shown as an error message.
if 'model' in model_components:
    try:
        # One row per feature, sorted by ascending importance.
        importance_df = pd.DataFrame({
            'Feature': model_components['feature_names'],
            'Importance': model_components['model'].feature_importances_,
        })
        importance_df = importance_df.sort_values(by='Importance')

        # Horizontal bars, coloured by the importance value itself.
        fig_importance = px.bar(
            importance_df,
            x='Importance',
            y='Feature',
            orientation='h',
            title='Feature Importance in Decision Tree Model',
            color='Importance',
            color_continuous_scale='viridis',
        )
        fig_importance.update_layout(
            height=400,
            margin={'l': 20, 'r': 20, 't': 40, 'b': 20},
        )
        st.plotly_chart(fig_importance, use_container_width=True)

    except Exception as e:
        st.error(f"Error displaying feature importance: {str(e)}")

# Handle export
# Runs only on the script run triggered by the "Export Last Result" submit
# button. It offers the most recent stored prediction as a JSON download, or
# warns when nothing has been predicted yet.
if export_button:
    if 'last_prediction' not in st.session_state:
        st.warning("⚠️ No prediction results to export. Please make a prediction first.")
    else:
        last_prediction = st.session_state['last_prediction']
        export_data = export_prediction(
            last_prediction['input_data'],
            last_prediction['result'],
        )

        # The file name carries the local time of this run.
        download_name = f"income_prediction_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        st.download_button(
            label="📥 Download Prediction Results",
            data=export_data,
            file_name=download_name,
            mime="application/json",
        )

# Footer
st.markdown("---")
st.markdown("*Built with Streamlit • Dr. Eng. Farrikh Alzami, M.Kom*")