# Diabetes classification pipeline (notebook export).
#
# Loads the diabetes health-indicator CSV, explores the target distribution,
# trains a RandomForestClassifier on standardized features, evaluates it,
# runs one example inference, and saves the model and scaler with joblib.
#
# NOTE: `display(...)` is the notebook (IPython / Colab) builtin; this script
# is expected to run inside a notebook environment.

# Import libraries
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Machine Learning Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Visualization
plt.style.use('seaborn-v0_8')

print("Library berhasil diimpor!")

# Single source of truth for the human-readable class names, keyed by the
# numeric label stored in the target column. Previously the same list was
# repeated (and indexed by position) in six places.
# NOTE(review): the public BRFSS "Diabetes_012" column is documented as
# 0 = no diabetes, 1 = prediabetes, 2 = diabetes. If this CSV is that dataset,
# "Diabetes Tipe 1" / "Diabetes Tipe 2" are misleading labels -- confirm with
# the data owner and rename here if so.
CLASS_NAMES = {
    0: 'Tidak Diabetes',
    1: 'Diabetes Tipe 1',
    2: 'Diabetes Tipe 2',
}


def class_label(value):
    """Return the display name for a numeric class label.

    Falls back to the stringified label when the value is not a key of
    CLASS_NAMES, so an unexpected class never raises or shifts the names
    of the other classes.
    """
    return CLASS_NAMES.get(int(value), str(value))


"""## 2. Data Loading & Exploration"""

# Load dataset
print("Memuat dataset diabetes...")
try:
    df = pd.read_csv("datasets.csv")
except FileNotFoundError:
    print("File 'datasets.csv' tidak ditemukan!")
    print("Silakan upload file dataset ke Google Colab terlebih dahulu.")
    # Re-raise: everything below needs `df`. Swallowing the error here only
    # postponed the failure to a confusing NameError on the next statement.
    raise
else:
    print(f"Dataset berhasil dimuat: {df.shape[0]:,} baris, {df.shape[1]} kolom")

# Basic dataset information
print("\nInformasi Dataset:")
print(f" • Ukuran: {df.shape}")
print(f" • Penggunaan memori: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f" • Nilai kosong: {df.isnull().sum().sum()}")

# Display first few rows
print("\n5 Baris Pertama:")
display(df.head())

# Dataset statistics
print("\nStatistik Dataset:")
display(df.describe())

"""## 3. Target Variable Analysis"""

# Target variable distribution
target_column = 'Diabetes_012'
if target_column not in df.columns:
    raise KeyError(f"Kolom target '{target_column}' tidak ada di dataset")
print(f"Variabel target: {target_column}")

target_counts = df[target_column].value_counts().sort_index()
target_percentages = target_counts / target_counts.sum() * 100

# Create target analysis dataframe. Names are derived from the labels that
# are actually present, so a missing class no longer causes a length
# mismatch when building the frame.
target_analysis = pd.DataFrame({
    'Kelas': [class_label(label) for label in target_counts.index],
    'Jumlah': target_counts.values,
    'Persentase': target_percentages.values,
})

print("Distribusi:")
display(target_analysis)

# Visualization of target distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Pie chart
colors = ['#2E8B57', '#DC143C', '#FF8C00']
ax1.pie(target_counts.values, labels=target_analysis['Kelas'],
        autopct='%1.1f%%', colors=colors, startangle=90)
ax1.set_title('Distribusi Variabel Target', fontsize=14, fontweight='bold')

# Bar chart
bars = ax2.bar(target_analysis['Kelas'], target_analysis['Jumlah'], color=colors)
ax2.set_title('Jumlah Sampel per Kelas', fontsize=14, fontweight='bold')
ax2.set_ylabel('Jumlah Sampel')
ax2.set_xlabel('Kelas Diabetes')

# Add value labels on bars. `bar_label` pads in points, so the label position
# no longer depends on a hard-coded data-unit offset (+1000) that only looked
# right for one particular dataset size.
ax2.bar_label(
    bars,
    labels=[f'{count:,}' for count in target_analysis['Jumlah']],
    padding=3,
    fontweight='bold',
)

plt.tight_layout()
plt.show()

"""## 4. Data Preprocessing & Model Training"""

# Prepare data
feature_columns = [col for col in df.columns if col != target_column]
X = df[feature_columns]
y = df[target_column]

print("Ukuran data:")
print(f" • Fitur (X): {X.shape}")
print(f" • Target (y): {y.shape}")

# Split data BEFORE scaling so the scaler never sees the test rows. Fitting
# the scaler on the full dataset leaks test-set statistics into training and
# into the scaler that is later saved for deployment.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nHasil Pembagian Data:")
print(f" • Data latih: {X_train_raw.shape[0]:,} sampel")
print(f" • Data uji: {X_test_raw.shape[0]:,} sampel")

# Feature scaling: fit on the training split only, then apply to both splits.
print("\nMenerapkan StandardScaler...")
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=feature_columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=feature_columns,
    index=X_test_raw.index,
)
print("Fitur berhasil dinormalisasi!")

# Train Random Forest
print("\nMelatih Random Forest Classifier...")
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
)
rf_classifier.fit(X_train, y_train)
print("Pelatihan model selesai!")

# Make predictions
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)

print("\nPrediksi selesai!")
print(f" • Ukuran prediksi: {y_pred.shape}")
print(f" • Ukuran probabilitas: {y_pred_proba.shape}")

"""## 5. Model Evaluation & Visualization"""

# Labels in the order the model uses them (this is also the column order of
# predict_proba), with their display names.
class_ids = list(rf_classifier.classes_)
class_labels = [class_label(label) for label in class_ids]

# Calculate metrics. zero_division=0 makes explicit what the blanket warning
# filter was hiding: a class that is never predicted contributes 0.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("Metrik Performa:")
print(f" • Akurasi: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f" • Presisi (terbobot): {precision:.4f}")
print(f" • Recall (terbobot): {recall:.4f}")
print(f" • F1-Score (terbobot): {f1:.4f}")

# Detailed classification report
print("\nLaporan Klasifikasi Detail:")
print(classification_report(
    y_test, y_pred,
    labels=class_ids,
    target_names=class_labels,
    zero_division=0,
))

# Confusion Matrix (rows/columns pinned to class_ids so the tick labels below
# are guaranteed to line up with the matrix).
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print("\nMatriks Konfusi:")
print(cm)

# Visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Confusion Matrix Heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
            xticklabels=class_labels, yticklabels=class_labels)
ax1.set_title('Matriks Konfusi', fontsize=14, fontweight='bold')
ax1.set_xlabel('Prediksi')
ax1.set_ylabel('Aktual')

# Feature Importance
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_classifier.feature_importances_,
}).sort_values('importance', ascending=False)

# Top 10 features
top_features = feature_importance.head(10)
ax2.barh(top_features['feature'], top_features['importance'], color='skyblue')
ax2.set_title('10 Fitur Terpenting', fontsize=14, fontweight='bold')
ax2.set_xlabel('Tingkat Kepentingan')

plt.tight_layout()
plt.show()

# Display feature importance table
print("\nTingkat Kepentingan Fitur (10 Teratas):")
display(feature_importance.head(10))

"""## 6. Model Inference Example"""

# Example prediction
print("Contoh: Memprediksi diabetes untuk pasien baru...")

# Sample patient data
# NOTE(review): the code meanings below are translated from the original
# author's notes; the Age and Education glosses do not obviously match the
# public BRFSS codebook (e.g. age code 9 is usually 60-64) -- confirm.
sample_patient = {
    'HighBP': 1.0,                # High blood pressure: yes
    'HighChol': 0.0,              # High cholesterol: no
    'CholCheck': 1.0,             # Cholesterol check: yes
    'BMI': 28.5,                  # BMI: 28.5 (overweight)
    'Smoker': 0.0,                # Smoker: no
    'Stroke': 0.0,                # History of stroke: no
    'HeartDiseaseorAttack': 0.0,  # Heart disease: no
    'PhysActivity': 1.0,          # Physical activity: yes
    'Fruits': 1.0,                # Eats fruit: yes
    'Veggies': 1.0,               # Eats vegetables: yes
    'HvyAlcoholConsump': 0.0,     # Heavy alcohol consumption: no
    'AnyHealthcare': 1.0,         # Has healthcare access: yes
    'NoDocbcCost': 0.0,           # Skipped doctor because of cost: no
    'GenHlth': 3.0,               # General health: 3 (fair)
    'MentHlth': 5.0,              # Days of poor mental health: 5
    'PhysHlth': 5.0,              # Days of poor physical health: 5
    'DiffWalk': 0.0,              # Difficulty walking: no
    'Sex': 0.0,                   # Sex: female
    'Age': 9.0,                   # Age category: 9 (original note: 45-49)
    'Education': 4.0,             # Education: 4 (original note: some college)
    'Income': 3.0,                # Income: 3 ($15k-$20k)
}

print("Data pasien contoh:")
for key, value in sample_patient.items():
    print(f" • {key}: {value}")

# Create DataFrame for prediction. Fail loudly on missing features and force
# the training column order instead of relying on dict insertion order.
missing_features = [col for col in feature_columns if col not in sample_patient]
if missing_features:
    raise ValueError(f"Fitur pasien tidak lengkap: {missing_features}")
patient_df = pd.DataFrame([sample_patient])[feature_columns]
patient_scaled = scaler.transform(patient_df)
patient_scaled_df = pd.DataFrame(patient_scaled, columns=feature_columns)

# Make prediction
prediction = rf_classifier.predict(patient_scaled_df)[0]
prediction_proba = rf_classifier.predict_proba(patient_scaled_df)[0]

print("\nHasil Prediksi:")
print(f" • Kelas yang diprediksi: {prediction}")
print(f" • Nama kelas: {class_label(prediction)}")

print("\nProbabilitas Prediksi:")
# predict_proba columns follow rf_classifier.classes_, so pair them explicitly
# rather than assuming position i corresponds to label i.
for class_id, prob in zip(class_ids, prediction_proba):
    class_name = class_label(class_id)
    print(f" • {class_name}: {prob:.4f} ({prob*100:.2f}%)")

"""## 7. Save Model & Summary"""

# Save model and scaler
print("💾 Menyimpan model dan scaler...")

# Save the trained model
model_filename = 'diabetes_rf_model.joblib'
joblib.dump(rf_classifier, model_filename)
print(f" Model disimpan sebagai: {model_filename}")

# Save the scaler
scaler_filename = 'diabetes_scaler.joblib'
joblib.dump(scaler, scaler_filename)
print(f"Scaler disimpan sebagai: {scaler_filename}")

print("\n🎉 Pelatihan dan evaluasi model berhasil diselesaikan!")
print(" File yang dibuat:")
print(f" • {model_filename}")
print(f" • {scaler_filename}")

# Summary
print("\n Ringkasan Performa Model:")
# Report the number of feature columns; df.shape[1] also counts the target.
print(f" • Dataset: {df.shape[0]:,} sampel, {len(feature_columns)} fitur")
print(" • Model: Random Forest Classifier")
print(f" • Akurasi: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f" • F1-Score: {f1:.4f}")
print(f" • Kelas: {len(class_labels)} ({', '.join(class_labels)})")

print("\n 5 Fitur Terpenting:")
top_five = feature_importance.head(5).itertuples(index=False)
for rank, row in enumerate(top_five, start=1):
    print(f" {rank}. {row.feature}: {row.importance:.4f}")

print("\n Model siap untuk deployment!")