# SIC / src/streamlit_app.py — diabetes classification training script
# (Hugging Face page residue kept as a comment: "wawan17's picture /
#  Update src/streamlit_app.py / d7aa20d verified")
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence sklearn/pandas warnings in the notebook output.
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, precision_score, recall_score, f1_score)

# Plot style. NOTE: matplotlib/seaborn were previously imported a second
# time here; the duplicate imports have been removed (no behavior change).
plt.style.use('seaborn-v0_8')
print("Library berhasil diimpor!")
"""## 2. Data Loading & Exploration
"""
# Load dataset
print("Memuat dataset diabetes...")
try:
df = pd.read_csv("datasets.csv")
print(f"Dataset berhasil dimuat: {df.shape[0]:,} baris, {df.shape[1]} kolom")
except FileNotFoundError:
print("File 'datasets.csv' tidak ditemukan!")
print("Silakan upload file dataset ke Google Colab terlebih dahulu.")
# Basic dataset information
print(f"\nInformasi Dataset:")
print(f" • Ukuran: {df.shape}")
print(f" • Penggunaan memori: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f" • Nilai kosong: {df.isnull().sum().sum()}")
# Display first few rows
print(f"\n5 Baris Pertama:")
display(df.head())
# Dataset statistics
print(f"\nStatistik Dataset:")
display(df.describe())
"""## 3. Target Variable Analysis
"""
# Target variable distribution
target_column = 'Diabetes_012'
print(f"Variabel target: {target_column}")
target_counts = df[target_column].value_counts().sort_index()
target_percentages = df[target_column].value_counts(normalize=True).sort_index() * 100
# Create target analysis dataframe
target_analysis = pd.DataFrame({
'Kelas': ['Tidak Diabetes', 'Diabetes Tipe 1', 'Diabetes Tipe 2'],
'Jumlah': target_counts.values,
'Persentase': target_percentages.values
})
print("Distribusi:")
display(target_analysis)
# Visualization of target distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Pie chart
colors = ['#2E8B57', '#DC143C', '#FF8C00']
ax1.pie(target_counts.values, labels=target_analysis['Kelas'],
autopct='%1.1f%%', colors=colors, startangle=90)
ax1.set_title('Distribusi Variabel Target', fontsize=14, fontweight='bold')
# Bar chart
bars = ax2.bar(target_analysis['Kelas'], target_analysis['Jumlah'], color=colors)
ax2.set_title('Jumlah Sampel per Kelas', fontsize=14, fontweight='bold')
ax2.set_ylabel('Jumlah Sampel')
ax2.set_xlabel('Kelas Diabetes')
# Add value labels on bars
for bar, count in zip(bars, target_analysis['Jumlah']):
ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1000,
f'{count:,}', ha='center', va='bottom', fontweight='bold')
plt.tight_layout()
plt.show()
"""## 4. Data Preprocessing & Model Training
"""
# Prepare data
feature_columns = [col for col in df.columns if col != target_column]
X = df[feature_columns]
y = df[target_column]
print(f"Ukuran data:")
print(f" • Fitur (X): {X.shape}")
print(f" • Target (y): {y.shape}")
# Feature scaling
print(f"\nMenerapkan StandardScaler...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=feature_columns)
print(f"Fitur berhasil dinormalisasi!")
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X_scaled, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nHasil Pembagian Data:")
print(f" • Data latih: {X_train.shape[0]:,} sampel")
print(f" • Data uji: {X_test.shape[0]:,} sampel")
# Train Random Forest
print(f"\nMelatih Random Forest Classifier...")
rf_classifier = RandomForestClassifier(
n_estimators=100,
random_state=42,
max_depth=10,
min_samples_split=5,
min_samples_leaf=2,
n_jobs=-1
)
rf_classifier.fit(X_train, y_train)
print("Pelatihan model selesai!")
# Make predictions
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)
print(f"\nPrediksi selesai!")
print(f" • Ukuran prediksi: {y_pred.shape}")
print(f" • Ukuran probabilitas: {y_pred_proba.shape}")
"""## 5. Model Evaluation & Visualization
"""
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Metrik Performa:")
print(f" • Akurasi: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f" • Presisi (terbobot): {precision:.4f}")
print(f" • Recall (terbobot): {recall:.4f}")
print(f" • F1-Score (terbobot): {f1:.4f}")
# Detailed classification report
print(f"\nLaporan Klasifikasi Detail:")
print(classification_report(y_test, y_pred,
target_names=['Tidak Diabetes', 'Diabetes Tipe 1', 'Diabetes Tipe 2']))
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f"\nMatriks Konfusi:")
print(cm)
# Visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Confusion Matrix Heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
xticklabels=['Tidak Diabetes', 'Diabetes Tipe 1', 'Diabetes Tipe 2'],
yticklabels=['Tidak Diabetes', 'Diabetes Tipe 1', 'Diabetes Tipe 2'])
ax1.set_title('Matriks Konfusi', fontsize=14, fontweight='bold')
ax1.set_xlabel('Prediksi')
ax1.set_ylabel('Aktual')
# Feature Importance
feature_importance = pd.DataFrame({
'feature': feature_columns,
'importance': rf_classifier.feature_importances_
}).sort_values('importance', ascending=False)
# Top 10 features
top_features = feature_importance.head(10)
ax2.barh(top_features['feature'], top_features['importance'], color='skyblue')
ax2.set_title('10 Fitur Terpenting', fontsize=14, fontweight='bold')
ax2.set_xlabel('Tingkat Kepentingan')
plt.tight_layout()
plt.show()
# Display feature importance table
print(f"\nTingkat Kepentingan Fitur (10 Teratas):")
display(feature_importance.head(10))
"""## 6. Model Inference Example
"""
# Example prediction
print("Contoh: Memprediksi diabetes untuk pasien baru...")
# Sample patient data
sample_patient = {
'HighBP': 1.0, # Tekanan darah tinggi: Ya
'HighChol': 0.0, # Kolesterol tinggi: Tidak
'CholCheck': 1.0, # Pengecekan kolesterol: Ya
'BMI': 28.5, # BMI: 28.5 (kelebihan berat badan)
'Smoker': 0.0, # Perokok: Tidak
'Stroke': 0.0, # Riwayat stroke: Tidak
'HeartDiseaseorAttack': 0.0, # Penyakit jantung: Tidak
'PhysActivity': 1.0, # Aktivitas fisik: Ya
'Fruits': 1.0, # Konsumsi buah: Ya
'Veggies': 1.0, # Konsumsi sayuran: Ya
'HvyAlcoholConsump': 0.0, # Konsumsi alkohol berat: Tidak
'AnyHealthcare': 1.0, # Akses layanan kesehatan: Ya
'NoDocbcCost': 0.0, # Tidak ke dokter karena biaya: Tidak
'GenHlth': 3.0, # Kesehatan umum: 3 (cukup)
'MentHlth': 5.0, # Hari kesehatan mental buruk: 5
'PhysHlth': 5.0, # Hari kesehatan fisik buruk: 5
'DiffWalk': 0.0, # Kesulitan berjalan: Tidak
'Sex': 0.0, # Jenis kelamin: Perempuan
'Age': 9.0, # Kategori usia: 9 (45-49)
'Education': 4.0, # Pendidikan: 4 (kuliah sebagian)
'Income': 3.0 # Pendapatan: 3 ($15k-$20k)
}
print(f"Data pasien contoh:")
for key, value in sample_patient.items():
print(f" • {key}: {value}")
# Create DataFrame for prediction
patient_df = pd.DataFrame([sample_patient])
patient_scaled = scaler.transform(patient_df)
patient_scaled_df = pd.DataFrame(patient_scaled, columns=feature_columns)
# Make prediction
prediction = rf_classifier.predict(patient_scaled_df)[0]
prediction_proba = rf_classifier.predict_proba(patient_scaled_df)[0]
print(f"\nHasil Prediksi:")
print(f" • Kelas yang diprediksi: {prediction}")
print(f" • Nama kelas: {['Tidak Diabetes', 'Diabetes Tipe 1', 'Diabetes Tipe 2'][int(prediction)]}")
print(f"\nProbabilitas Prediksi:")
for i, prob in enumerate(prediction_proba):
class_name = ['Tidak Diabetes', 'Diabetes Tipe 1', 'Diabetes Tipe 2'][i]
print(f" • {class_name}: {prob:.4f} ({prob*100:.2f}%)")
"""## 7. Save Model & Summary
"""
# Save model and scaler
import joblib
print("💾 Menyimpan model dan scaler...")
# Save the trained model
model_filename = 'diabetes_rf_model.joblib'
joblib.dump(rf_classifier, model_filename)
print(f" Model disimpan sebagai: {model_filename}")
# Save the scaler
scaler_filename = 'diabetes_scaler.joblib'
joblib.dump(scaler, scaler_filename)
print(f"Scaler disimpan sebagai: {scaler_filename}")
print(f"\n🎉 Pelatihan dan evaluasi model berhasil diselesaikan!")
print(f" File yang dibuat:")
print(f" • {model_filename}")
print(f" • {scaler_filename}")
# Summary
print(f"\n Ringkasan Performa Model:")
print(f" • Dataset: {df.shape[0]:,} sampel, {df.shape[1]} fitur")
print(f" • Model: Random Forest Classifier")
print(f" • Akurasi: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f" • F1-Score: {f1:.4f}")
print(f" • Kelas: 3 (Tidak Diabetes, Diabetes Tipe 1, Diabetes Tipe 2)")
print(f"\n 5 Fitur Terpenting:")
for i, (_, row) in enumerate(feature_importance.head(5).iterrows()):
print(f" {i+1}. {row['feature']}: {row['importance']:.4f}")
print(f"\n Model siap untuk deployment!")