# Environment setup for the diabetes classification notebook:
# third-party imports, global warning filter and plot styling.
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Blanket filter: every warning raised after this point is suppressed.
# It sits before the scikit-learn imports, exactly as in the original order.
warnings.filterwarnings('ignore')

# scikit-learn pieces: data splitting, scaling, the classifier and its metrics.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Apply the seaborn-flavoured matplotlib style sheet to every figure below.
plt.style.use('seaborn-v0_8')

print("Library berhasil diimpor!")


"""## 2. Data Loading & Exploration

"""


# Load the dataset from the working directory into `df`.
print("Memuat dataset diabetes...")
try:
    df = pd.read_csv("datasets.csv")
except FileNotFoundError:
    print("File 'datasets.csv' tidak ditemukan!")
    print("Silakan upload file dataset ke Google Colab terlebih dahulu.")
    # Every later step reads `df`; re-raise so the run stops here with the
    # real cause instead of an unrelated NameError a few lines further down.
    raise
else:
    print(f"Dataset berhasil dimuat: {df.shape[0]:,} baris, {df.shape[1]} kolom")


# Basic facts about the loaded frame: shape, memory footprint (in MB) and
# the total number of missing cells across all columns.
print("\nInformasi Dataset:")
print(f" • Ukuran: {df.shape}")
print(f" • Penggunaan memori: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f" • Nilai kosong: {df.isnull().sum().sum()}")


# NOTE(review): `display` is not imported anywhere in this file; presumably it
# is the helper injected by the notebook (IPython/Colab) runtime — confirm
# before running this as a plain script.
print("\n5 Baris Pertama:")
display(df.head())


# Summary statistics of the dataset's columns.
print("\nStatistik Dataset:")
display(df.describe())


"""## 3. Target Variable Analysis

"""


# Name of the column holding the class label.
target_column = 'Diabetes_012'
print(f"Variabel target: {target_column}")


# Per-class sample counts and percentages, both ordered by class value so the
# two series line up row by row.
target_counts = df[target_column].value_counts().sort_index()
target_percentages = df[target_column].value_counts(normalize=True).sort_index() * 100


# Display name and colour per class VALUE (not per position), so a class that
# is absent from the data can neither shift the labels nor break the table
# with a length mismatch. Unknown values fall back to their string form / grey.
# NOTE(review): the public BRFSS 2015 "diabetes health indicators" data
# documents Diabetes_012 as 0 = no diabetes, 1 = prediabetes, 2 = diabetes.
# If datasets.csv is that file, 'Diabetes Tipe 1' / 'Diabetes Tipe 2' mislabel
# classes 1 and 2 — confirm with the data owner before changing the strings.
class_labels = {0: 'Tidak Diabetes', 1: 'Diabetes Tipe 1', 2: 'Diabetes Tipe 2'}
class_palette = {0: '#2E8B57', 1: '#DC143C', 2: '#FF8C00'}

target_analysis = pd.DataFrame({
    'Kelas': [class_labels.get(value, str(value)) for value in target_counts.index],
    'Jumlah': target_counts.values,
    'Persentase': target_percentages.values,
})


print("Distribusi:")
display(target_analysis)


# Side-by-side figure: class share (pie) and absolute counts (bars).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))


# One colour per observed class, in the same order as the table rows.
colors = [class_palette.get(value, '#808080') for value in target_counts.index]
ax1.pie(target_counts.values, labels=target_analysis['Kelas'],
        autopct='%1.1f%%', colors=colors, startangle=90)
ax1.set_title('Distribusi Variabel Target', fontsize=14, fontweight='bold')


bars = ax2.bar(target_analysis['Kelas'], target_analysis['Jumlah'], color=colors)
ax2.set_title('Jumlah Sampel per Kelas', fontsize=14, fontweight='bold')
ax2.set_ylabel('Jumlah Sampel')
ax2.set_xlabel('Kelas Diabetes')


# Annotate each bar with its count. `bar_label` pads in points, so the label
# sits just above the bar whatever the dataset size (the previous fixed
# "+ 1000" offset was in data units and only suited very large counts).
ax2.bar_label(
    bars,
    labels=[f'{count:,}' for count in target_analysis['Jumlah']],
    padding=3,
    fontweight='bold',
)


plt.tight_layout()
plt.show()


"""## 4. Data Preprocessing & Model Training

"""


# Every column except the target is used as a model feature.
feature_columns = [col for col in df.columns if col != target_column]
X = df[feature_columns]
y = df[target_column]


print("Ukuran data:")
print(f" • Fitur (X): {X.shape}")
print(f" • Target (y): {y.shape}")


# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training rows only, otherwise test-set statistics leak into preprocessing.
# The split is stratified on `y` and seeded, so it is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


print("\nHasil Pembagian Data:")
print(f" • Data latih: {X_train_raw.shape[0]:,} sampel")
print(f" • Data uji: {X_test_raw.shape[0]:,} sampel")


# Fit the scaler on the training split, then apply the same transform to the
# test split. Results are wrapped back into DataFrames so column names (and
# the original row index) are preserved.
print("\nMenerapkan StandardScaler...")
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=feature_columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=feature_columns,
    index=X_test_raw.index,
)
# Kept for backward compatibility with any later cell that still reads the
# fully scaled frame; it now uses the training-only statistics.
X_scaled = pd.DataFrame(scaler.transform(X), columns=feature_columns)


print("Fitur berhasil dinormalisasi!")


# Random forest with a fixed seed; n_jobs=-1 uses all available cores.
print("\nMelatih Random Forest Classifier...")
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
)


rf_classifier.fit(X_train, y_train)
print("Pelatihan model selesai!")


# Hard predictions and per-class probabilities for the held-out split.
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)


print("\nPrediksi selesai!")
print(f" • Ukuran prediksi: {y_pred.shape}")
print(f" • Ukuran probabilitas: {y_pred_proba.shape}")


"""## 5. Model Evaluation & Visualization

"""


# Display names, in the order of the fitted model's classes.
class_names = ['Tidak Diabetes', 'Diabetes Tipe 1', 'Diabetes Tipe 2']
# Label values exactly as the model learned them; passing them explicitly
# keeps report rows and confusion-matrix axes aligned with `class_names`.
class_values = rf_classifier.classes_


# Support-weighted metrics. `zero_division=0` makes the handling of a class
# that is never predicted explicit (same value as the default, but no longer
# dependent on warnings being globally silenced).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)


print("Metrik Performa:")
print(f" • Akurasi: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f" • Presisi (terbobot): {precision:.4f}")
print(f" • Recall (terbobot): {recall:.4f}")
print(f" • F1-Score (terbobot): {f1:.4f}")


print("\nLaporan Klasifikasi Detail:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_values,
    target_names=class_names,
    zero_division=0,
))


# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=class_values)
print("\nMatriks Konfusi:")
print(cm)


# Side-by-side figure: confusion matrix heatmap and top feature importances.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))


sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
            xticklabels=class_names,
            yticklabels=class_names)
ax1.set_title('Matriks Konfusi', fontsize=14, fontweight='bold')
ax1.set_xlabel('Prediksi')
ax1.set_ylabel('Aktual')


# Importance of each feature as reported by the fitted forest, highest first.
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_classifier.feature_importances_,
}).sort_values('importance', ascending=False)


top_features = feature_importance.head(10)
ax2.barh(top_features['feature'], top_features['importance'], color='skyblue')
# barh draws the first row at the bottom; flip the axis so the most important
# feature is at the top, matching the order of the table printed below.
ax2.invert_yaxis()
ax2.set_title('10 Fitur Terpenting', fontsize=14, fontweight='bold')
ax2.set_xlabel('Tingkat Kepentingan')


plt.tight_layout()
plt.show()


print("\nTingkat Kepentingan Fitur (10 Teratas):")
display(feature_importance.head(10))


"""## 6. Model Inference Example

"""


print("Contoh: Memprediksi diabetes untuk pasien baru...")


# One hand-written patient record, keyed by feature name.
sample_patient = {
    'HighBP': 1.0,
    'HighChol': 0.0,
    'CholCheck': 1.0,
    'BMI': 28.5,
    'Smoker': 0.0,
    'Stroke': 0.0,
    'HeartDiseaseorAttack': 0.0,
    'PhysActivity': 1.0,
    'Fruits': 1.0,
    'Veggies': 1.0,
    'HvyAlcoholConsump': 0.0,
    'AnyHealthcare': 1.0,
    'NoDocbcCost': 0.0,
    'GenHlth': 3.0,
    'MentHlth': 5.0,
    'PhysHlth': 5.0,
    'DiffWalk': 0.0,
    'Sex': 0.0,
    'Age': 9.0,
    'Education': 4.0,
    'Income': 3.0,
}


print("Data pasien contoh:")
for key, value in sample_patient.items():
    print(f" • {key}: {value}")


# Select the columns in the exact order used for training instead of relying
# on the dict's insertion order; a missing feature raises KeyError here rather
# than producing a silently misaligned row.
patient_df = pd.DataFrame([sample_patient])[feature_columns]
patient_scaled = scaler.transform(patient_df)
patient_scaled_df = pd.DataFrame(patient_scaled, columns=feature_columns)


prediction = rf_classifier.predict(patient_scaled_df)[0]
prediction_proba = rf_classifier.predict_proba(patient_scaled_df)[0]


# Display names indexed by class value (0, 1, 2).
class_names = ['Tidak Diabetes', 'Diabetes Tipe 1', 'Diabetes Tipe 2']

print("\nHasil Prediksi:")
print(f" • Kelas yang diprediksi: {prediction}")
print(f" • Nama kelas: {class_names[int(prediction)]}")


# predict_proba columns follow `rf_classifier.classes_`; pair each probability
# with its class value so the name stays correct even if a class was absent
# from the training data.
print("\nProbabilitas Prediksi:")
for class_value, prob in zip(rf_classifier.classes_, prediction_proba):
    class_name = class_names[int(class_value)]
    print(f" • {class_name}: {prob:.4f} ({prob*100:.2f}%)")


"""## 7. Save Model & Summary

"""


import joblib


print("💾 Menyimpan model dan scaler...")


# Persist the fitted classifier to the working directory.
model_filename = 'diabetes_rf_model.joblib'
joblib.dump(rf_classifier, model_filename)
print(f" Model disimpan sebagai: {model_filename}")


# Persist the fitted scaler as well; inference must apply the same transform.
scaler_filename = 'diabetes_scaler.joblib'
joblib.dump(scaler, scaler_filename)
print(f"Scaler disimpan sebagai: {scaler_filename}")


print("\n🎉 Pelatihan dan evaluasi model berhasil diselesaikan!")
print(" File yang dibuat:")
print(f" • {model_filename}")
print(f" • {scaler_filename}")


# Final recap of the run.
print("\n Ringkasan Performa Model:")
# `df` still contains the target column, so the feature count comes from
# `feature_columns` rather than `df.shape[1]` (which would be one too many).
print(f" • Dataset: {df.shape[0]:,} sampel, {len(feature_columns)} fitur")
print(" • Model: Random Forest Classifier")
print(f" • Akurasi: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f" • F1-Score: {f1:.4f}")
print(" • Kelas: 3 (Tidak Diabetes, Diabetes Tipe 1, Diabetes Tipe 2)")


# `feature_importance` is already sorted by importance, highest first.
print("\n 5 Fitur Terpenting:")
for rank, row in enumerate(feature_importance.head(5).itertuples(index=False), start=1):
    print(f" {rank}. {row.feature}: {row.importance:.4f}")


print("\n Model siap untuk deployment!")