Upload 21 files
Browse files- Prediksi Performa Akademik/edtech/backend/data/processed/cleaned_education_data.csv +0 -0
- Prediksi Performa Akademik/edtech/backend/data/raw/personalized_education_data.csv +0 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/data_processor.pkl +3 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/model_metrics.json +8 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model.pkl +3 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model_20250709_221148_params.json +10 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model_shap_values.npy +3 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/actual_vs_predicted.png +0 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/feature_importance.csv +5 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/feature_importance.png +0 -0
- Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/residual_plot.png +0 -0
- Prediksi Performa Akademik/edtech/backend/src/app.py +431 -0
- Prediksi Performa Akademik/edtech/backend/src/models/recommenders/collaborative/collab_model.joblib +3 -0
- Prediksi Performa Akademik/edtech/backend/src/models/recommenders/content_based/content_model.joblib +3 -0
- Prediksi Performa Akademik/edtech/backend/src/models/recommenders/hybrid/hybrid_model.joblib +3 -0
- Prediksi Performa Akademik/edtech/backend/src/performance_prediction/__init__.py +11 -0
- Prediksi Performa Akademik/edtech/backend/src/performance_prediction/data_processor.py +180 -0
- Prediksi Performa Akademik/edtech/backend/src/performance_prediction/evaluator.py +255 -0
- Prediksi Performa Akademik/edtech/backend/src/performance_prediction/model_trainer.py +412 -0
- Prediksi Performa Akademik/edtech/backend/src/performance_prediction/predictor.py +289 -0
- Prediksi Performa Akademik/edtech/backend/src/train_performance_predictor.py +164 -0
Prediksi Performa Akademik/edtech/backend/data/processed/cleaned_education_data.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prediksi Performa Akademik/edtech/backend/data/raw/personalized_education_data.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/data_processor.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9510e6be685fb7b5fdfd38517a116a0403faa75c922b2e81a9f215921ac2e0be
|
| 3 |
+
size 217195
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/model_metrics.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mse": 0.05486344948816889,
|
| 3 |
+
"rmse": 0.23422948039939143,
|
| 4 |
+
"mae": 0.1660625786187038,
|
| 5 |
+
"r2": 0.29007536468986816,
|
| 6 |
+
"max_error": 0.7487417459487915,
|
| 7 |
+
"mape": 27499842257.400738
|
| 8 |
+
}
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c0de80b55e64dc99f1e1ffcc69f7ff3799341b393000728bcf641c64ea02b27
|
| 3 |
+
size 50035
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model_20250709_221148_params.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"subsample": 0.8,
|
| 3 |
+
"reg_lambda": 10,
|
| 4 |
+
"reg_alpha": 1,
|
| 5 |
+
"min_child_weight": 1,
|
| 6 |
+
"max_depth": 9,
|
| 7 |
+
"learning_rate": 0.1,
|
| 8 |
+
"gamma": 0.2,
|
| 9 |
+
"colsample_bytree": 1.0
|
| 10 |
+
}
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/trained_model/performance_model_shap_values.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de8239f80d8fb314e44e471031c7d93c12ed854bcb692f041b89d65ad19c136c
|
| 3 |
+
size 9728
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/actual_vs_predicted.png
ADDED
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/feature_importance.csv
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
feature,importance
|
| 2 |
+
f18,22.0
|
| 3 |
+
f4,19.0
|
| 4 |
+
f12,3.0
|
| 5 |
+
f3,1.0
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/feature_importance.png
ADDED
|
Prediksi Performa Akademik/edtech/backend/models/performance_predictor/training_logs/residual_plot.png
ADDED
|
Prediksi Performa Akademik/edtech/backend/src/app.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException, Depends, status
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from pydantic import BaseModel, Field, field_validator
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import joblib
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import numpy as np
|
| 9 |
+
import sys
|
| 10 |
+
import logging
|
| 11 |
+
import time
|
| 12 |
+
from prometheus_fastapi_instrumentator import Instrumentator
|
| 13 |
+
import uvicorn
|
| 14 |
+
import xgboost as xgb
|
| 15 |
+
import shap
|
| 16 |
+
import json
|
| 17 |
+
from contextlib import asynccontextmanager
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
import os
|
| 20 |
+
|
| 21 |
+
# Setup logging: INFO level, timestamped records, emitted both to the
# console and to a plain file "api.log" in the current working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('api.log')
    ]
)
# Module-level logger used throughout this service.
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
# Filesystem layout: app.py lives in backend/src/, so parent.parent is the
# backend/ directory, and the model artifacts live under backend/models/.
BASE_DIR = Path(__file__).parent.parent  # matches the location of app.py
MODEL_DIR = BASE_DIR / "models" / "performance_predictor" / "trained_model"
MODEL_PATH = MODEL_DIR / "performance_model.pkl"        # trained XGBoost model
PREPROCESSOR_PATH = MODEL_DIR / "data_processor.pkl"    # fitted preprocessor + feature order
METRICS_PATH = MODEL_DIR / "model_metrics.json"         # persisted evaluation metrics

# Make sure the model directory exists (no-op when already present).
os.makedirs(MODEL_DIR, exist_ok=True)
|
| 41 |
+
|
| 42 |
+
# Lifespan handler for application startup/shutdown management
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load model components and metrics at startup, then serve requests.

    BUG FIX: the original wrapped ``yield`` inside the try/except, so any
    exception raised while the app was *running* or shutting down was
    mislabeled as a "Startup error" and converted to an HTTPException.
    An HTTPException is also meaningless outside a request context, so a
    genuine startup failure now raises RuntimeError, which makes the
    server refuse to start.
    """
    try:
        # Load model, preprocessor and feature names once at startup.
        app.state.model_components = await load_components()

        # Load persisted evaluation metrics; fall back to the values
        # recorded at training time if the file is missing.
        if METRICS_PATH.exists():
            with open(METRICS_PATH) as f:
                app.state.model_metrics = json.load(f)
        else:
            app.state.model_metrics = {
                "mse": 0.05486344948816889,
                "rmse": 0.23422948039939143,
                "mae": 0.1660625786187038,
                "r2": 0.29007536468986816,
                "max_error": 0.7487417459487915
            }
            logger.warning("File metrik model tidak ditemukan, menggunakan nilai default")

        logger.info("Aplikasi siap menerima request")
    except Exception as e:
        logger.error(f"Startup error: {str(e)}")
        # Fail fast: do not yield a half-initialized application.
        raise RuntimeError("Gagal memulai aplikasi") from e

    # Startup succeeded; hand control to the server for the app's lifetime.
    yield
|
| 73 |
+
|
| 74 |
+
# FastAPI application instance; ``lifespan`` loads the model on startup.
app = FastAPI(
    title="EdTech Performance Prediction API",
    description="API untuk memprediksi performa akademik siswa menggunakan model XGBoost",
    version="2.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan
)

# Enable CORS for the known frontend origins only (local dev + LAN address).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3024", "http://192.168.56.1:3024"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Setup Prometheus metrics (exposed at /metrics by the instrumentator).
Instrumentator().instrument(app).expose(app)
|
| 95 |
+
|
| 96 |
+
# Definisi Model Pydantic
|
| 97 |
+
class FeatureInput(BaseModel):
    """One student's feature vector for quiz-score prediction.

    Field constraints mirror the value ranges used at training time:
    proportions live in [0, 1], ratings in [1, 5], encoded categoricals
    are non-negative integers.
    """
    grade: float = Field(..., gt=0, le=12, description="Kelas siswa (1-12)")
    tech_savvy: int = Field(..., ge=1, le=5, description="Kemampuan teknologi (skala 1-5)")
    duration_minutes: float = Field(..., gt=0, description="Durasi belajar dalam menit")
    engagement_score: float = Field(..., ge=0, le=1, description="Skor engagement (0-1)")
    completion_rate: float = Field(..., ge=0, le=1, description="Tingkat penyelesaian materi (0-1)")
    material_rating: float = Field(..., ge=1, le=5, description="Rating materi (skala 1-5)")
    interaction_duration: float = Field(..., gt=0, description="Durasi interaksi dengan materi")
    material_engagement_score: float = Field(..., ge=0, le=1, description="Skor engagement dengan materi")
    feature_engagement: float = Field(..., ge=0, le=1, description="Engagement dengan fitur platform")
    jam_belajar: float = Field(..., ge=0, le=24, description="Jam belajar (0-24)")
    hari_dalam_minggu: float = Field(..., ge=0, le=6, description="Hari dalam minggu (0-6)")
    akhir_pekan: float = Field(..., ge=0, le=1, description="Indikator akhir pekan (0/1)")
    efisiensi_belajar: float = Field(..., ge=0, description="Indeks efisiensi belajar")
    rasio_penyelesaian: float = Field(..., ge=0, le=1, description="Rasio penyelesaian tugas")
    interaksi_total: float = Field(..., ge=0, description="Total interaksi dengan platform")
    preferensi_materi: float = Field(..., ge=0, le=1, description="Preferensi jenis materi")
    jumlah_pengakses: float = Field(..., ge=0, description="Jumlah pengakses materi")
    engagement_rata2: float = Field(..., ge=0, le=1, description="Rata-rata engagement")
    performance_label_encoded: int = Field(..., ge=0, description="Label performa (encoded)")
    learning_speed_encoded: int = Field(..., ge=0, description="Kecepatan belajar (encoded)")
    student_feedback_encoded: int = Field(..., ge=0, description="Feedback siswa (encoded)")
    achievement_status_encoded: int = Field(..., ge=0, description="Status pencapaian (encoded)")

    # NOTE(review): for most of these fields this validator merely duplicates
    # the ge=0/le=1 Field constraints (changing only the error message).
    # However it ALSO caps `efisiensi_belajar` at 1 even though its Field
    # declaration has no upper bound — confirm which bound is intended.
    @field_validator('engagement_score', 'completion_rate', 'material_engagement_score',
                     'feature_engagement', 'efisiensi_belajar', 'rasio_penyelesaian',
                     'preferensi_materi', 'engagement_rata2')
    @classmethod
    def check_proportion(cls, v):
        if not 0 <= v <= 1:
            raise ValueError("Nilai harus antara 0 dan 1")
        return v
|
| 129 |
+
|
| 130 |
+
class PredictionInput(BaseModel):
    """Request body for /predict: a single student's feature vector."""
    features: FeatureInput
|
| 132 |
+
|
| 133 |
+
class BatchPredictionInput(BaseModel):
    """Request body for /predict/batch: one feature vector per student."""
    samples: List[FeatureInput]
|
| 135 |
+
|
| 136 |
+
class FeatureContribution(BaseModel):
    """A single feature's SHAP contribution to one prediction."""
    feature: str          # feature name
    value: float          # the input value supplied for this feature
    contribution: float   # signed SHAP value (positive pushes prediction up)
|
| 140 |
+
|
| 141 |
+
class PredictionResponse(BaseModel):
    """Response body for /predict."""
    prediction: float = Field(..., description="Nilai prediksi skor kuis")
    confidence_interval: List[float] = Field(..., description="Interval kepercayaan prediksi")
    # None when SHAP explanation was unavailable for the loaded model.
    feature_contributions: Optional[List[FeatureContribution]] = Field(
        None,
        description="Kontribusi masing-masing fitur terhadap prediksi"
    )
    execution_time_ms: float = Field(..., description="Waktu eksekusi dalam milidetik")
    model_version: str = Field(..., description="Versi model yang digunakan")
|
| 150 |
+
|
| 151 |
+
class BatchPredictionResponse(BaseModel):
    """Response body for /predict/batch; lists are index-aligned per sample."""
    predictions: List[float]
    confidence_intervals: List[List[float]]
    # None when SHAP explanation was unavailable for the loaded model.
    feature_contributions: Optional[List[List[FeatureContribution]]]
    execution_time_ms: float
    model_version: str
    total_samples: int
    avg_time_per_sample_ms: float
|
| 159 |
+
|
| 160 |
+
class HealthCheckResponse(BaseModel):
    """Response body for /health."""
    status: str            # "healthy" when the service is up
    model_version: str
    model_metrics: dict    # metrics loaded at startup (mse, rmse, mae, r2, ...)
    uptime_seconds: float
|
| 165 |
+
|
| 166 |
+
class ModelInfoResponse(BaseModel):
    """Response body for /model/info."""
    features: List[str]            # feature names in model input order
    model_type: str
    training_date: Optional[str]   # ISO-8601; derived from model file mtime
    performance_metrics: dict
|
| 171 |
+
|
| 172 |
+
# Dependency that loads the model artifacts from disk
async def load_components():
    """Memuat model dan preprocessor dari file.

    Returns a dict with:
      - "model": the trained estimator (xgb.Booster or sklearn-style wrapper)
      - "preprocessor": the fitted transformer
      - "feature_names": ordered feature list the model was trained with
      - "load_time": load duration in seconds
      - "start_timestamp": epoch time at which loading started (FIX: added so
        /health can compute real uptime — "load_time" is a duration, not a
        timestamp, and subtracting it from time.time() gives nonsense)

    Raises HTTPException(500) when an artifact is missing or unreadable.
    """
    try:
        start_time = time.time()

        # Fail early with a clear message if either artifact is missing.
        if not MODEL_PATH.exists():
            raise FileNotFoundError(f"File model tidak ditemukan di {MODEL_PATH}")
        if not PREPROCESSOR_PATH.exists():
            raise FileNotFoundError(f"File preprocessor tidak ditemukan di {PREPROCESSOR_PATH}")

        # Load model
        model = joblib.load(MODEL_PATH)
        logger.info(f"Model berhasil dimuat dari {MODEL_PATH}")

        # The preprocessor artifact bundles the fitted transformer and the
        # feature order expected by the model.
        processor_data = joblib.load(PREPROCESSOR_PATH)
        preprocessor = processor_data['preprocessor']
        feature_names = processor_data['feature_names']
        logger.info(f"Preprocessor berhasil dimuat dari {PREPROCESSOR_PATH}")

        load_time = time.time() - start_time
        logger.info(f"Komponen model berhasil dimuat dalam {load_time:.2f} detik")

        return {
            "model": model,
            "preprocessor": preprocessor,
            "feature_names": feature_names,
            "load_time": load_time,
            # Backward-compatible addition: epoch timestamp for uptime.
            "start_timestamp": start_time
        }
    except FileNotFoundError as e:
        logger.error(f"File tidak ditemukan: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"File model/preprocessor tidak ditemukan: {str(e)}"
        )
    except Exception as e:
        logger.error(f"Gagal memuat model/preprocessor: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Gagal memuat komponen model: {str(e)}"
        )
|
| 215 |
+
|
| 216 |
+
# Endpoint Utama
|
| 217 |
+
@app.get("/", include_in_schema=False)
async def root():
    """Landing payload: welcome message, API version, endpoint directory."""
    endpoint_map = {
        "docs": "/docs",
        "health": "/health",
        "model_info": "/model/info",
        "predict": "/predict",
        "batch_predict": "/predict/batch",
    }
    return {
        "message": "Selamat datang di EdTech Performance Prediction API",
        "version": app.version,
        "endpoints": endpoint_map,
    }
|
| 231 |
+
|
| 232 |
+
@app.get("/health", response_model=HealthCheckResponse)
async def health_check():
    """Endpoint untuk health check dan monitoring.

    BUG FIX: the original computed uptime as
    ``time.time() - components["load_time"]`` — but "load_time" is the load
    *duration* in seconds, not an epoch timestamp, so the reported uptime was
    approximately the current Unix time. We now use the "start_timestamp" key
    (epoch time recorded by load_components) and report 0.0 when that key is
    absent (e.g. older artifacts).
    """
    components = app.state.model_components
    started_at = components.get("start_timestamp")
    uptime = time.time() - started_at if started_at is not None else 0.0
    return {
        "status": "healthy",
        "model_version": app.version,
        "model_metrics": app.state.model_metrics,
        "uptime_seconds": uptime
    }
|
| 241 |
+
|
| 242 |
+
@app.get("/model/info", response_model=ModelInfoResponse)
async def model_info():
    """Report the model's feature list, type, training date and metrics.

    The training date is approximated by the model file's modification time.
    """
    components = app.state.model_components
    trained_at = datetime.fromtimestamp(MODEL_PATH.stat().st_mtime)
    return {
        "features": components["feature_names"],
        "model_type": "XGBoost Regressor",
        "training_date": trained_at.isoformat(),
        "performance_metrics": app.state.model_metrics,
    }
|
| 251 |
+
|
| 252 |
+
@app.post("/predict", response_model=PredictionResponse)
async def predict_performance(
    input_data: PredictionInput
):
    """Endpoint untuk prediksi tunggal performa siswa.

    Validates the feature vector, runs it through the fitted preprocessor and
    the XGBoost model, and returns the prediction together with a 95%
    confidence interval derived from the training MSE, plus (best-effort)
    per-feature SHAP contributions.

    Raises HTTPException 422 on invalid input, 400 on prediction failure.
    """
    start_time = time.time()

    try:
        components = app.state.model_components
        model = components["model"]
        preprocessor = components["preprocessor"]
        feature_names = components["feature_names"]

        # Convert the validated Pydantic model into a single-row DataFrame.
        # FIX: .dict() is deprecated in Pydantic v2 (this file already uses
        # v2's field_validator); model_dump() is the supported equivalent.
        input_dict = input_data.features.model_dump()
        input_df = pd.DataFrame([input_dict])

        # Ensure every feature the model expects is present.
        missing_cols = set(feature_names) - set(input_df.columns)
        if missing_cols:
            raise ValueError(f"Kolom berikut tidak ditemukan: {missing_cols}")

        # Reorder columns to the exact order the model was trained with.
        input_df = input_df[feature_names]

        # Apply the fitted preprocessing pipeline.
        processed_input = preprocessor.transform(input_df)

        # A raw xgb.Booster needs a DMatrix; sklearn-style models do not.
        if isinstance(model, xgb.Booster):
            dmatrix = xgb.DMatrix(processed_input)
            prediction = model.predict(dmatrix)[0]
        else:
            prediction = model.predict(processed_input)[0]

        # 95% interval from training MSE, clamped to the [0, 1] score range.
        std_dev = np.sqrt(app.state.model_metrics.get('mse', 0.05486344948816889))
        confidence = [max(0, prediction - 1.96*std_dev), min(1, prediction + 1.96*std_dev)]

        # SHAP contributions are best-effort: attempted only for sklearn-style
        # models (detected via feature_names_in_) and skipped with a warning
        # if the explainer fails.
        feature_contributions = None
        if hasattr(model, 'feature_names_in_'):
            try:
                explainer = shap.Explainer(model)
                shap_values = explainer(processed_input)

                feature_contributions = []
                for i, feature in enumerate(feature_names):
                    feature_contributions.append({
                        "feature": feature,
                        "value": input_df.iloc[0][feature],
                        "contribution": float(shap_values[0].values[i])
                    })
                # Largest absolute contribution first.
                feature_contributions.sort(key=lambda x: abs(x["contribution"]), reverse=True)
            except Exception as e:
                logger.warning(f"Tidak dapat menghitung SHAP values: {str(e)}")

        exec_time = (time.time() - start_time) * 1000  # milliseconds

        return {
            "prediction": float(prediction),
            "confidence_interval": confidence,
            "feature_contributions": feature_contributions,
            "execution_time_ms": exec_time,
            "model_version": app.version
        }

    except ValueError as e:
        logger.error(f"Input validation error: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Input tidak valid: {str(e)}"
        )
    except Exception as e:
        logger.error(f"Error dalam prediksi: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Error dalam prediksi: {str(e)}"
        )
|
| 333 |
+
|
| 334 |
+
@app.post("/predict/batch", response_model=BatchPredictionResponse)
async def batch_predict_performance(
    input_data: BatchPredictionInput
):
    """Endpoint untuk prediksi batch performa siswa.

    Same pipeline as /predict applied to a list of samples; the response
    lists are index-aligned with the request's ``samples``.

    Raises HTTPException 422 on invalid/empty input, 400 on prediction
    failure.
    """
    start_time = time.time()

    try:
        components = app.state.model_components
        model = components["model"]
        preprocessor = components["preprocessor"]
        feature_names = components["feature_names"]

        # FIX: .dict() is deprecated in Pydantic v2; model_dump() is the
        # supported equivalent.
        samples = [sample.model_dump() for sample in input_data.samples]
        # Explicit guard: an empty batch previously surfaced as a confusing
        # "missing columns" error (and would later divide by zero).
        if not samples:
            raise ValueError("Daftar samples tidak boleh kosong")
        input_df = pd.DataFrame(samples)

        # Ensure every feature the model expects is present.
        missing_cols = set(feature_names) - set(input_df.columns)
        if missing_cols:
            raise ValueError(f"Kolom berikut tidak ditemukan: {missing_cols}")

        # Reorder columns to the training order, then preprocess.
        input_df = input_df[feature_names]
        processed_input = preprocessor.transform(input_df)

        # A raw xgb.Booster needs a DMatrix; sklearn-style models do not.
        if isinstance(model, xgb.Booster):
            dmatrix = xgb.DMatrix(processed_input)
            predictions = model.predict(dmatrix)
        else:
            predictions = model.predict(processed_input)

        # 95% interval per sample from training MSE, clamped to [0, 1].
        std_dev = np.sqrt(app.state.model_metrics.get('mse', 0.05486344948816889))
        conf_intervals = [
            [max(0, p - 1.96*std_dev), min(1, p + 1.96*std_dev)]
            for p in predictions
        ]

        # Best-effort SHAP contributions (sklearn-style models only).
        feature_contributions_list = None
        if hasattr(model, 'feature_names_in_'):
            try:
                explainer = shap.Explainer(model)
                shap_values = explainer(processed_input)

                feature_contributions_list = []
                for i in range(len(predictions)):
                    contributions = []
                    for j, feature in enumerate(feature_names):
                        contributions.append({
                            "feature": feature,
                            "value": input_df.iloc[i][feature],
                            "contribution": float(shap_values[i].values[j])
                        })
                    # Largest absolute contribution first.
                    contributions.sort(key=lambda x: abs(x["contribution"]), reverse=True)
                    feature_contributions_list.append(contributions)
            except Exception as e:
                logger.warning(f"Tidak dapat menghitung SHAP values untuk batch: {str(e)}")

        exec_time = (time.time() - start_time) * 1000  # milliseconds
        avg_time_per_sample = exec_time / len(predictions)

        return {
            "predictions": [float(p) for p in predictions],
            "confidence_intervals": conf_intervals,
            "feature_contributions": feature_contributions_list,
            "execution_time_ms": exec_time,
            "model_version": app.version,
            "total_samples": len(predictions),
            "avg_time_per_sample_ms": avg_time_per_sample
        }

    except ValueError as e:
        logger.error(f"Input validation error: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Input tidak valid: {str(e)}"
        )
    except Exception as e:
        logger.error(f"Error dalam batch prediction: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Error dalam batch prediction: {str(e)}"
        )
|
| 424 |
+
|
| 425 |
+
if __name__ == "__main__":
    # Development entry point.
    # NOTE(review): host is pinned to a host-only adapter IP and reload=True
    # is a dev-only setting — confirm before deploying.
    uvicorn.run(
        "app:app",
        host="192.168.56.1",
        port=8024,
        reload=True
    )
|
Prediksi Performa Akademik/edtech/backend/src/models/recommenders/collaborative/collab_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4aef73c6272415cb11002c1ff5c96f65587498acaa7c86ad4f7167d1d73fe48
|
| 3 |
+
size 6080
|
Prediksi Performa Akademik/edtech/backend/src/models/recommenders/content_based/content_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63d1a2f5acb72fa4e6c3825586d578da46d850c31d82883ef50f618789722977
|
| 3 |
+
size 5211833
|
Prediksi Performa Akademik/edtech/backend/src/models/recommenders/hybrid/hybrid_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d173427052471e467df306ab61013e0599cfb0a80ff3805e464f9b7a25166933
|
| 3 |
+
size 32
|
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .data_processor import PerformanceDataProcessor
|
| 2 |
+
from .model_trainer import PerformanceModelTrainer
|
| 3 |
+
from .evaluator import PerformanceEvaluator
|
| 4 |
+
from .predictor import PerformancePredictor
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
'PerformanceDataProcessor',
|
| 8 |
+
'PerformanceModelTrainer',
|
| 9 |
+
'PerformanceEvaluator',
|
| 10 |
+
'PerformancePredictor'
|
| 11 |
+
]
|
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/data_processor.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# backend/src/performance_prediction/data_processor.py
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
|
| 7 |
+
from sklearn.impute import SimpleImputer
|
| 8 |
+
from sklearn.pipeline import Pipeline
|
| 9 |
+
from sklearn.compose import ColumnTransformer
|
| 10 |
+
import joblib
|
| 11 |
+
import os
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import logging
|
| 15 |
+
import json
|
| 16 |
+
|
| 17 |
+
class PerformanceDataProcessor:
|
| 18 |
+
    def __init__(self, data_path, config_path=None):
        """Create a processor for the CSV at *data_path*.

        config_path: optional JSON file whose "features" key overrides the
        default feature list used by prepare_features_target().
        """
        self.data_path = data_path
        self.config_path = config_path
        # Populated later by prepare_features_target().
        self.features = None
        self.target = None
        self.preprocessor = None
        self.logger = self._setup_logger()
|
| 25 |
+
|
| 26 |
+
def _setup_logger(self):
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
logger.setLevel(logging.INFO)
|
| 29 |
+
handler = logging.StreamHandler()
|
| 30 |
+
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 31 |
+
handler.setFormatter(formatter)
|
| 32 |
+
logger.addHandler(handler)
|
| 33 |
+
return logger
|
| 34 |
+
|
| 35 |
+
def load_data(self):
|
| 36 |
+
"""Memuat data dengan penanganan error yang lebih baik"""
|
| 37 |
+
try:
|
| 38 |
+
data = pd.read_csv(self.data_path)
|
| 39 |
+
|
| 40 |
+
# Log informasi dasar data
|
| 41 |
+
self.logger.info(f"Data berhasil dimuat. Shape: {data.shape}")
|
| 42 |
+
self.logger.info(f"Kolom yang tersedia: {list(data.columns)}")
|
| 43 |
+
self.logger.info(f"Contoh data:\n{data.head(2)}")
|
| 44 |
+
|
| 45 |
+
return data
|
| 46 |
+
except Exception as e:
|
| 47 |
+
self.logger.error(f"Gagal memuat data: {str(e)}")
|
| 48 |
+
raise
|
| 49 |
+
|
| 50 |
+
def prepare_features_target(self, data, target_col='quiz_score'):
    """Select feature columns, engineer interaction features, and build the
    preprocessing pipeline.

    Parameters:
        data: input DataFrame (two engineered columns are added in place).
        target_col: name of the target column (default 'quiz_score').

    Returns:
        (features, target) as a DataFrame / Series pair.

    Raises:
        ValueError: if the target column is missing.
    """
    try:
        # Load feature configuration if available, else fall back to defaults.
        if self.config_path:
            with open(self.config_path) as f:
                config = json.load(f)
            relevant_features = config.get('features', [])
        else:
            relevant_features = [
                'grade', 'tech_savvy', 'duration_minutes', 'engagement_score',
                'completion_rate', 'material_rating', 'interaction_duration',
                'material_engagement_score', 'feature_engagement', 'jam_belajar',
                'hari_dalam_minggu', 'akhir_pekan', 'efisiensi_belajar',
                'rasio_penyelesaian', 'interaksi_total', 'preferensi_materi',
                'jumlah_pengakses', 'engagement_rata2', 'performance_label_encoded',
                'learning_speed_encoded', 'student_feedback_encoded',
                'achievement_status_encoded'
            ]

        # Engineered interaction features (epsilon avoids division by zero).
        data['efisiensi_engagement'] = data['engagement_score'] / (data['duration_minutes'] + 1e-6)
        data['learning_consistency'] = data['completion_rate'] * data['material_rating']
        relevant_features.extend(['efisiensi_engagement', 'learning_consistency'])

        # Target column must exist.
        if target_col not in data.columns:
            raise ValueError(f"Kolom target '{target_col}' tidak ditemukan")

        # BUGFIX: selecting absent columns raised KeyError; keep only the
        # features actually present and warn about the rest.
        missing = [c for c in relevant_features if c not in data.columns]
        if missing:
            self.logger.warning(f"Fitur tidak ditemukan dan dilewati: {missing}")
        relevant_features = [c for c in relevant_features if c in data.columns]

        # BUGFIX: DataFrame.median() fails on non-numeric columns; impute
        # only numeric ones here and let the pipeline's most_frequent
        # imputer handle categoricals.
        numeric_cols = data[relevant_features].select_dtypes(include='number').columns
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

        self.features = data[relevant_features]
        self.target = data[target_col]

        # Build separate numeric / categorical preprocessing branches.
        numeric_features = self.features.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = self.features.select_dtypes(include=['object', 'category']).columns

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())  # more robust to outliers than StandardScaler
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        return self.features, self.target

    except Exception as e:
        self.logger.error(f"Error dalam menyiapkan fitur: {str(e)}")
        raise
|
| 113 |
+
|
| 114 |
+
def split_data(self, test_size=0.2, val_size=0.2, random_state=42):
    """Split features/target into train, validation and test partitions,
    then fit and apply the preprocessing pipeline.

    Parameters:
        test_size: fraction of the full dataset held out for testing.
        val_size: fraction of the full dataset held out for validation.
        random_state: seed for reproducible splits.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    try:
        if self.features is None or self.target is None:
            raise ValueError("Fitur atau target belum disiapkan")

        # First carve off the test partition.
        X_rest, X_test, y_rest, y_test = train_test_split(
            self.features, self.target,
            test_size=test_size,
            random_state=random_state,
        )

        # Rescale val_size so it still refers to the original dataset size.
        adjusted_val = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_rest, y_rest,
            test_size=adjusted_val,
            random_state=random_state,
        )

        # Fit the preprocessor on training data only; transform the rest.
        X_train = self.preprocessor.fit_transform(X_train)
        X_val = self.preprocessor.transform(X_val)
        X_test = self.preprocessor.transform(X_test)

        # Sanity-check each partition before handing it out.
        for X_part, y_part in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
            self._validate_data(X_part, y_part)

        return X_train, X_val, X_test, y_train, y_val, y_test

    except Exception as e:
        self.logger.error(f"Error dalam membagi data: {str(e)}")
        raise
|
| 150 |
+
|
| 151 |
+
def _validate_data(self, X, y):
|
| 152 |
+
"""Validasi kualitas data"""
|
| 153 |
+
if isinstance(X, np.ndarray):
|
| 154 |
+
if np.any(np.isnan(X)) or np.any(np.isinf(X)):
|
| 155 |
+
raise ValueError("Data mengandung NaN atau infinity")
|
| 156 |
+
if len(X) != len(y):
|
| 157 |
+
raise ValueError("Jumlah sampel X dan y tidak sama")
|
| 158 |
+
if len(y) == 0:
|
| 159 |
+
raise ValueError("Data target kosong")
|
| 160 |
+
|
| 161 |
+
def save_processor(self, save_dir):
    """Persist this processor and its fitted pipeline under ``save_dir``.

    The pickle filename is timestamped so repeated saves never collide.

    Returns:
        The path of the written pickle as a string.
    """
    try:
        os.makedirs(save_dir, exist_ok=True)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        save_path = Path(save_dir) / f"data_processor_{stamp}.pkl"

        # Bundle everything needed to reload and reuse the preprocessing.
        payload = {
            'processor': self,
            'preprocessor': self.preprocessor,
            'feature_names': list(self.features.columns) if self.features is not None else None,
        }
        joblib.dump(payload, save_path)

        self.logger.info(f"Processor disimpan di: {save_path}")
        return str(save_path)

    except Exception as e:
        self.logger.error(f"Gagal menyimpan processor: {str(e)}")
        raise
|
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/evaluator.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# backend/src/performance_prediction/evaluator.py
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
from sklearn.metrics import (
|
| 7 |
+
mean_squared_error,
|
| 8 |
+
mean_absolute_error,
|
| 9 |
+
r2_score,
|
| 10 |
+
explained_variance_score,
|
| 11 |
+
max_error,
|
| 12 |
+
mean_absolute_percentage_error
|
| 13 |
+
)
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import logging
|
| 16 |
+
from typing import Dict, Tuple, Optional
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
import json
|
| 19 |
+
import shap
|
| 20 |
+
|
| 21 |
+
class PerformanceEvaluator:
|
| 22 |
+
def __init__(self, y_true: np.ndarray, y_pred: np.ndarray, model=None, X_test=None):
    """Initialise the evaluator and compute metrics immediately.

    Parameters:
        y_true (np.ndarray): ground-truth values.
        y_pred (np.ndarray): model predictions.
        model (optional): fitted model, used for SHAP interpretation.
        X_test (optional): feature matrix for model interpretation.
    """
    self.y_true = y_true
    self.y_pred = y_pred
    self.model = model
    self.X_test = X_test
    self.shap_values = None  # filled lazily by _calculate_shap_values()
    self.logger = self._setup_logger()
    # Metrics are computed eagerly so they are always available.
    self.metrics = self.calculate_metrics()
|
| 39 |
+
|
| 40 |
+
def _setup_logger(self):
|
| 41 |
+
"""Setup logger untuk evaluator"""
|
| 42 |
+
logger = logging.getLogger(__name__)
|
| 43 |
+
logger.setLevel(logging.INFO)
|
| 44 |
+
handler = logging.StreamHandler()
|
| 45 |
+
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 46 |
+
handler.setFormatter(formatter)
|
| 47 |
+
logger.addHandler(handler)
|
| 48 |
+
return logger
|
| 49 |
+
|
| 50 |
+
def calculate_metrics(self) -> Dict[str, float]:
    """Compute the standard regression metric suite for y_true vs y_pred.

    Returns:
        Dict of metric name -> value. 'mape' is set to inf when y_true
        contains zeros, since relative error is undefined there.
    """
    errors = self.y_true - self.y_pred
    metrics = {
        'mse': mean_squared_error(self.y_true, self.y_pred),
        'rmse': np.sqrt(mean_squared_error(self.y_true, self.y_pred)),
        'mae': mean_absolute_error(self.y_true, self.y_pred),
        'r2': r2_score(self.y_true, self.y_pred),
        'explained_variance': explained_variance_score(self.y_true, self.y_pred),
        'max_error': max_error(self.y_true, self.y_pred),
        'mean_error': np.mean(errors),
        'std_error': np.std(errors)
    }

    # BUGFIX: sklearn's mean_absolute_percentage_error never raises
    # ValueError for zero targets -- it divides by a tiny epsilon and
    # silently returns a huge number, so the original except-branch was
    # dead. Check for zeros explicitly instead.
    if np.any(np.asarray(self.y_true) == 0):
        metrics['mape'] = np.inf
        self.logger.warning("Terdapat nilai 0 pada y_true, MAPE tidak dapat dihitung")
    else:
        metrics['mape'] = mean_absolute_percentage_error(self.y_true, self.y_pred) * 100

    # Optional interpretability metric when a model and data are supplied.
    if self.model is not None and self.X_test is not None:
        try:
            self._calculate_shap_values()
            metrics['mean_abs_shap'] = np.mean(np.abs(self.shap_values))
        except Exception as e:
            self.logger.warning(f"Tidak dapat menghitung SHAP values: {str(e)}")

    return metrics
|
| 84 |
+
|
| 85 |
+
def _calculate_shap_values(self, sample_size: int = 100):
    """Compute SHAP values on a (sampled) portion of X_test.

    Parameters:
        sample_size: maximum number of rows explained, for efficiency.

    Raises:
        ValueError: when model or X_test is missing.
    """
    if self.model is None or self.X_test is None:
        raise ValueError("Model dan X_test diperlukan untuk menghitung SHAP values")

    # Sample rows to keep SHAP computation cheap.
    if len(self.X_test) > sample_size:
        sample_idx = np.random.choice(len(self.X_test), sample_size, replace=False)
        X_sample = self.X_test[sample_idx]
    else:
        X_sample = self.X_test

    # BUGFIX (dead code): the original branched on hasattr(model,
    # 'predict_proba') but both branches were byte-identical; a single
    # code path is equivalent and clearer.
    explainer = shap.Explainer(self.model)
    self.shap_values = explainer(X_sample).values
|
| 104 |
+
|
| 105 |
+
def get_performance_report(self) -> str:
    """Render the computed metrics as a human-readable report string."""
    lines = ["\n=== MODEL PERFORMANCE REPORT ===\n"]
    lines.extend(
        f"{name.upper():<20}: {value:.4f}\n"
        for name, value in self.metrics.items()
    )
    return "".join(lines)
|
| 111 |
+
|
| 112 |
+
def plot_residuals(self, save_path: Optional[str] = None) -> Optional[plt.Figure]:
    """
    Residual plot (predicted value vs. residual) with reference lines.

    Parameters:
        save_path (optional): file path to save the plot to; when given the
            figure is written to disk and closed instead of being returned.

    Returns:
        The pyplot module when save_path is not given. NOTE(review): the
        annotation says plt.Figure but the code actually returns ``plt``.
    """
    residuals = self.y_true - self.y_pred

    plt.figure(figsize=(12, 8))
    sns.scatterplot(x=self.y_pred, y=residuals, alpha=0.6)

    # Zero-residual reference line.
    plt.axhline(y=0, color='r', linestyle='--')

    # Mean residual line -- far from zero indicates systematic bias.
    mean_residual = np.mean(residuals)
    plt.axhline(y=mean_residual, color='b', linestyle='-',
                label=f'Mean Residual: {mean_residual:.2f}')

    # Approximate 95% interval assuming roughly normal residuals.
    std_residual = np.std(residuals)
    plt.axhline(y=mean_residual + 1.96*std_residual, color='g', linestyle=':',
                label='95% Confidence Interval')
    plt.axhline(y=mean_residual - 1.96*std_residual, color='g', linestyle=':')

    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Analysis')
    plt.legend()

    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        plt.close()
        self.logger.info(f"Residual plot disimpan di: {save_path}")
    else:
        return plt
|
| 152 |
+
|
| 153 |
+
def plot_actual_vs_predicted(self, save_path: Optional[str] = None) -> Optional[plt.Figure]:
    """Scatter plot of actual vs. predicted values with an ideal-prediction
    diagonal and a fitted regression line.

    Parameters:
        save_path (optional): file path; when given the figure is saved and
            closed, otherwise the ``plt`` module is returned.
    """
    plt.figure(figsize=(12, 8))

    # Scatter of (actual, predicted) pairs.
    ax = sns.scatterplot(x=self.y_true, y=self.y_pred, alpha=0.6)

    # Ideal y = x diagonal spanning the combined data range.
    min_val = min(self.y_true.min(), self.y_pred.min())
    max_val = max(self.y_true.max(), self.y_pred.max())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Ideal Prediction')

    # Least-squares fit of predicted on actual; slope near 1 is good.
    coef = np.polyfit(self.y_true, self.y_pred, 1)
    poly1d_fn = np.poly1d(coef)
    plt.plot(self.y_true, poly1d_fn(self.y_true), 'b-',
             label=f'Regression Line (slope={coef[0]:.2f})')

    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Actual vs Predicted Values')
    plt.legend()

    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        plt.close()
        self.logger.info(f"Actual vs Predicted plot disimpan di: {save_path}")
    else:
        return plt
|
| 182 |
+
|
| 183 |
+
def plot_error_distribution(self, save_path: Optional[str] = None) -> Optional[plt.Figure]:
    """Histogram (with KDE) of prediction errors plus mean/std markers.

    Parameters:
        save_path (optional): file path; when given the figure is saved and
            closed, otherwise the ``plt`` module is returned.
    """
    errors = self.y_true - self.y_pred

    plt.figure(figsize=(12, 8))

    # Histogram with a kernel density estimate overlay.
    ax = sns.histplot(errors, kde=True, bins=30)

    # Mean and +/- one standard deviation reference lines.
    mean_error = np.mean(errors)
    std_error = np.std(errors)

    plt.axvline(mean_error, color='r', linestyle='-',
                label=f'Mean Error: {mean_error:.2f}')
    plt.axvline(mean_error + std_error, color='g', linestyle='--',
                label=f'±1 Std Dev: {std_error:.2f}')
    plt.axvline(mean_error - std_error, color='g', linestyle='--')

    plt.xlabel('Prediction Error')
    plt.ylabel('Frequency')
    plt.title('Prediction Error Distribution')
    plt.legend()

    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        plt.close()
        self.logger.info(f"Error distribution plot disimpan di: {save_path}")
    else:
        return plt
|
| 213 |
+
|
| 214 |
+
def plot_shap_summary(self, feature_names: list = None, save_path: Optional[str] = None) -> Optional[plt.Figure]:
    """SHAP summary plot of per-feature contributions.

    Requires _calculate_shap_values() to have run; returns None (after a
    warning) when SHAP values are unavailable.

    Parameters:
        feature_names: optional display names for the features.
        save_path (optional): file path; when given the figure is saved and
            closed, otherwise the ``plt`` module is returned.
    """
    if self.shap_values is None:
        self.logger.warning("SHAP values belum dihitung")
        return None

    plt.figure(figsize=(14, 8))
    # show=False keeps the figure open so we can title/save it ourselves.
    shap.summary_plot(self.shap_values, self.X_test, feature_names=feature_names, show=False)
    plt.title('SHAP Feature Importance')
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        plt.close()
        self.logger.info(f"SHAP summary plot disimpan di: {save_path}")
    else:
        return plt
|
| 231 |
+
|
| 232 |
+
def save_evaluation_results(self, save_dir: str):
    """
    Persist all evaluation artefacts (metrics JSON + plots) into save_dir.

    Parameters:
        save_dir: output directory; created if it does not exist.
    """
    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    # BUGFIX: the metrics dict may hold numpy scalars (np.float64 etc.),
    # which the json module cannot serialise -- coerce to plain float.
    serialisable = {k: float(v) for k, v in self.metrics.items()}
    with open(save_path / 'evaluation_metrics.json', 'w') as f:
        json.dump(serialisable, f, indent=4)

    # Save the diagnostic plots.
    self.plot_residuals(save_path / 'residual_plot.png')
    self.plot_actual_vs_predicted(save_path / 'actual_vs_predicted.png')
    self.plot_error_distribution(save_path / 'error_distribution.png')

    # SHAP plot only when the values were actually computed.
    if self.shap_values is not None:
        self.plot_shap_summary(save_path=save_path / 'shap_summary.png')

    self.logger.info(f"Hasil evaluasi disimpan di: {save_path}")
|
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/model_trainer.py
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# backend/src/performance_prediction/model_trainer.py
|
| 2 |
+
|
| 3 |
+
import xgboost as xgb
|
| 4 |
+
import optuna
|
| 5 |
+
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
| 6 |
+
from sklearn.model_selection import cross_val_score, KFold
|
| 7 |
+
import numpy as np
|
| 8 |
+
import joblib
|
| 9 |
+
import os
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import json
|
| 15 |
+
import logging
|
| 16 |
+
from functools import partial
|
| 17 |
+
import shap
|
| 18 |
+
import random
|
| 19 |
+
|
| 20 |
+
class PerformanceModelTrainer:
|
| 21 |
+
def __init__(self):
    """Initialise empty training state and a logger."""
    self.model = None                # fitted Booster or XGBRegressor
    self.feature_importance = None   # dict: feature name -> importance score
    self.shap_values = None          # SHAP values for interpretation
    self.best_params = None          # best hyperparameters found so far
    self.cv_results = None           # reserved for cross-validation output
    self.logger = self._setup_logger()
    self.study = None                # reserved for an Optuna study
|
| 29 |
+
|
| 30 |
+
def _setup_logger(self):
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
logger.setLevel(logging.INFO)
|
| 33 |
+
handler = logging.StreamHandler()
|
| 34 |
+
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 35 |
+
handler.setFormatter(formatter)
|
| 36 |
+
logger.addHandler(handler)
|
| 37 |
+
return logger
|
| 38 |
+
|
| 39 |
+
def objective(self, trial, X, y):
    """Optuna objective: mean negative-MSE from 3-fold CV for a sampled
    XGBoost parameter set.

    Parameters:
        trial: optuna Trial used to sample hyperparameters.
        X, y: training features and target.

    Returns:
        Mean negative MSE across folds (higher is better), or -inf when the
        trial fails so Optuna treats it as the worst possible outcome.
    """
    try:
        params = {
            'objective': 'reg:squarederror',
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            # log scale samples small learning rates more densely
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'random_state': 42,
            'n_jobs': 1
        }

        model = xgb.XGBRegressor(**params)

        # 3 splits (rather than 5) keeps each trial cheap.
        kf = KFold(n_splits=3, shuffle=True, random_state=42)

        try:
            scores = cross_val_score(
                model, X, y,
                cv=kf,
                scoring='neg_mean_squared_error',
                n_jobs=1,
                error_score='raise'
            )
            return np.mean(scores)
        except Exception as e:
            self.logger.warning(f"Trial gagal: {str(e)}")
            return float('-inf')  # worst score so the trial is discarded

    except Exception as e:
        self.logger.error(f"Error dalam objective function: {str(e)}")
        return float('-inf')
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def hyperparameter_tuning(self, X_train, y_train, n_trials=30):
    """Simple random-search tuning (fallback when Optuna is unavailable).

    Samples n_trials random combinations from a small grid, scores each by
    3-fold CV negative MSE, and stores/returns the best combination.

    Parameters:
        X_train, y_train: training data.
        n_trials: number of random combinations to evaluate.

    Returns:
        Dict of the best hyperparameters found (also kept in
        self.best_params).
    """
    param_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }

    best_score = float('-inf')
    best_params = {}

    for _ in range(n_trials):
        params = {k: random.choice(v) for k, v in param_grid.items()}
        # BUGFIX (dead code removed): the original built a 'train_params'
        # copy and popped 'n_estimators' for xgb.train, but never used it --
        # XGBRegressor below consumes `params` directly.

        model = xgb.XGBRegressor(**params, random_state=42)
        score = cross_val_score(model, X_train, y_train,
                                cv=3, scoring='neg_mean_squared_error').mean()

        if score > best_score:
            best_score = score
            best_params = params

    self.best_params = best_params
    return best_params
|
| 109 |
+
|
| 110 |
+
def train_model(self, X_train, y_train, X_val=None, y_val=None, params=None):
    """Train the final XGBoost model, with early stopping when a validation
    set is provided.

    Parameters:
        X_train, y_train: training data.
        X_val, y_val: optional validation data enabling early stopping.
        params: optional hyperparameters; defaults to self.best_params.

    Returns:
        The fitted model (an xgb.Booster when early stopping is used,
        otherwise an XGBRegressor).
    """
    try:
        self.logger.info("\n=== TRAINING FINAL MODEL ===")

        if params is None and self.best_params is not None:
            params = self.best_params

        # Baseline parameters merged under the tuned ones.
        default_params = {
            'objective': 'reg:squarederror',
            'random_state': 42,
            'verbosity': 1
        }

        # BUGFIX: work on a copy -- the original popped 'n_estimators'
        # straight out of the caller's dict (often self.best_params),
        # silently mutating it for later calls.
        params = dict(params) if params else {}
        # xgb.train controls rounds via num_boost_round, not n_estimators.
        params.pop('n_estimators', None)

        final_params = {**default_params, **params}

        if X_val is not None and y_val is not None:
            self.logger.info("Menggunakan early stopping dengan validation set")

            dtrain = xgb.DMatrix(X_train, label=y_train)
            dval = xgb.DMatrix(X_val, label=y_val)

            evals = [(dtrain, 'train'), (dval, 'val')]
            evals_result = {}
            model = xgb.train(
                final_params,
                dtrain,
                num_boost_round=1000,
                evals=evals,
                early_stopping_rounds=50,
                verbose_eval=50,
                evals_result=evals_result
            )

            # Kept for plot_learning_curve().
            self.evals_result = evals_result
        else:
            self.logger.info("Training tanpa early stopping")
            model = xgb.XGBRegressor(**final_params)
            model.fit(X_train, y_train)

        self.model = model

        # Interpretability artefacts computed on the training data.
        self._calculate_feature_importance(X_train)
        self._calculate_shap_values(X_train)

        return model
    except Exception as e:
        self.logger.error(f"Error dalam training model: {str(e)}")
        raise
|
| 166 |
+
|
| 167 |
+
def _calculate_feature_importance(self, X_train):
    """Populate self.feature_importance as a {feature: score} dict.

    Handles both model flavours that train_model() can produce.
    NOTE(review): the X_train parameter is accepted but never used here;
    plot_feature_importance() even passes feature_names in this slot --
    confirm the intended signature.
    """
    try:
        if isinstance(self.model, xgb.Booster):
            # Native Booster (from xgb.train): weight-based importance.
            importance = self.model.get_score(importance_type='weight')
            # Normalise values to plain floats for a consistent format.
            self.feature_importance = {k: float(v) for k, v in importance.items()}
        elif hasattr(self.model, 'feature_importances_'):
            # scikit-learn API model (XGBRegressor).
            self.feature_importance = dict(zip(
                self.model.get_booster().feature_names,
                self.model.feature_importances_
            ))
        else:
            self.logger.warning("Tipe model tidak dikenali untuk menghitung feature importance")
            self.feature_importance = None
    except Exception as e:
        self.logger.error(f"Gagal menghitung feature importance: {str(e)}")
        self.feature_importance = None
|
| 187 |
+
|
| 188 |
+
def _calculate_shap_values(self, X_train, sample_size=100):
    """Compute SHAP values for model interpretation; failures only warn.

    Parameters:
        X_train: feature matrix to explain.
        sample_size: row sample used on the Booster path, for speed.
    """
    try:
        if self.model is None:
            raise ValueError("Model belum dilatih")

        if isinstance(self.model, xgb.Booster):
            explainer = shap.TreeExplainer(self.model)
            # Subsample to keep the explanation cheap.
            X_sample = shap.utils.sample(X_train, sample_size)
            self.shap_values = explainer.shap_values(X_sample)
        else:
            # sklearn-API model: explain the full training matrix.
            # NOTE(review): this path stores a shap.Explanation object,
            # whereas the Booster path stores a raw ndarray -- downstream
            # consumers must handle both.
            explainer = shap.Explainer(self.model)
            self.shap_values = explainer(X_train)
    except Exception as e:
        self.logger.warning(f"Tidak dapat menghitung SHAP values: {str(e)}")
        self.shap_values = None
|
| 204 |
+
|
| 205 |
+
def evaluate_model(self, X_test, y_test):
    """Evaluate the trained model on a held-out test set.

    Parameters:
        X_test, y_test: test features and target.

    Returns:
        Dict with 'metrics', 'predictions' and 'shap_values'.

    Raises:
        ValueError: when no model has been trained yet.
    """
    try:
        if self.model is None:
            raise ValueError("Model belum dilatih")

        # BUGFIX: only the native Booster (from xgb.train) accepts a
        # DMatrix; passing one to XGBRegressor.predict raises. Branch on
        # the model type so both train_model() paths can be evaluated.
        if isinstance(self.model, xgb.Booster):
            predictions = self.model.predict(xgb.DMatrix(X_test))
        else:
            predictions = self.model.predict(X_test)

        # Full metric suite on the predictions.
        metrics = self._calculate_all_metrics(y_test, predictions)

        self.logger.info("\n=== HASIL EVALUASI MODEL ===")
        for name, value in metrics.items():
            self.logger.info(f"{name}: {value:.4f}")

        return {
            'metrics': metrics,
            'predictions': predictions,
            'shap_values': self.shap_values
        }

    except Exception as e:
        self.logger.error(f"Error dalam evaluasi model: {str(e)}")
        raise
|
| 230 |
+
|
| 231 |
+
def _calculate_all_metrics(self, y_true, y_pred):
    """Compute the full regression metric suite for one prediction set."""
    mse = mean_squared_error(y_true, y_pred)
    metrics = {
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
        'max_error': np.max(np.abs(y_true - y_pred))
    }

    # MAPE: substitute a tiny epsilon for zero targets so the division
    # never fails; those entries become extremely large instead.
    try:
        safe_denominator = np.where(y_true == 0, 1e-10, y_true)
        metrics['mape'] = np.mean(np.abs((y_true - y_pred) / safe_denominator)) * 100
    except Exception as e:
        metrics['mape'] = np.inf
        self.logger.warning(f"Tidak dapat menghitung MAPE: {str(e)}")

    return metrics
|
| 251 |
+
|
| 252 |
+
def plot_learning_curve(self, X_train, y_train, X_val, y_val, save_path=None):
    """Plot train/validation RMSE per boosting round from the stored
    evals_result (produced by train_model() with early stopping).

    NOTE(review): the X/y parameters are accepted but unused -- the curve
    is built entirely from self.evals_result.

    Parameters:
        save_path: optional path; when given the figure is saved and closed.

    Returns:
        The pyplot module when not saving, or None when no curve data is
        available.
    """
    try:
        # Early-stopped training stores per-round metrics here.
        if not hasattr(self, 'evals_result') or not self.evals_result:
            self.logger.warning("Tidak ada evals_result tersedia untuk learning curve")
            return None

        results = self.evals_result
        epochs = len(results['train']['rmse']) if 'train' in results else 0

        if epochs == 0:
            self.logger.warning("Data learning curve kosong")
            return None

        x_axis = range(0, epochs)

        fig, ax = plt.subplots(figsize=(12, 8))
        ax.plot(x_axis, results['train']['rmse'], label='Train')

        # Validation curve only when a validation set was used.
        if 'val' in results:
            ax.plot(x_axis, results['val']['rmse'], label='Validation')

        ax.legend()
        plt.ylabel('RMSE')
        plt.xlabel('Epochs')
        plt.title('XGBoost Learning Curve')

        if save_path:
            plt.savefig(save_path, bbox_inches='tight')
            plt.close()
            self.logger.info(f"Learning curve disimpan di: {save_path}")
        else:
            return plt

    except Exception as e:
        self.logger.error(f"Error membuat learning curve: {str(e)}")
        raise
|
| 290 |
+
|
| 291 |
+
def plot_feature_importance(self, feature_names=None, top_n=20, save_path=None):
    """Plot a horizontal bar chart of the top-N feature importances.

    Returns ``(plt, importance_df)`` when no save_path is given, otherwise
    saves the figure and returns ``(None, importance_df)``.

    Raises:
        ValueError: if feature importance is still unavailable after a
            recomputation attempt.
    """
    try:
        if self.feature_importance is None:
            # Retry: the importance may not have been computed yet.
            self._calculate_feature_importance(feature_names)

        if self.feature_importance is None:
            raise ValueError("Feature importance belum dihitung. Model mungkin belum dilatih atau terjadi error dalam perhitungan.")

        # Rank all features by importance, descending.
        ranked = pd.DataFrame({
            'feature': list(self.feature_importance.keys()),
            'importance': list(self.feature_importance.values())
        }).sort_values('importance', ascending=False)

        # Restrict to the caller-supplied feature set, if any.
        if feature_names is not None:
            ranked = ranked[ranked['feature'].isin(feature_names)]

        top = ranked.head(top_n)

        plt.figure(figsize=(14, 10))
        bars = plt.barh(top['feature'], top['importance'])
        plt.xlabel('Importance Score')
        plt.title('Top Feature Importance')

        # Annotate each bar with its numeric importance value.
        for rect in bars:
            w = rect.get_width()
            plt.text(w + 0.001, rect.get_y() + rect.get_height() / 2,
                     f'{w:.4f}',
                     va='center', ha='left')

        plt.gca().invert_yaxis()
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight')
            plt.close()
            self.logger.info(f"Feature importance plot disimpan di: {save_path}")
            return None, ranked
        else:
            return plt, ranked

    except Exception as e:
        self.logger.error(f"Error membuat feature importance plot: {str(e)}")
        raise
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
def plot_shap_summary(self, feature_names=None, save_path=None):
    """Render a SHAP summary plot from the precomputed SHAP values.

    Returns the ``plt`` module when no save_path is given; otherwise the
    figure is written to disk and nothing is returned.

    Raises:
        ValueError: if SHAP values have not been computed yet.
    """
    try:
        if self.shap_values is None:
            raise ValueError("SHAP values belum dihitung")

        plt.figure(figsize=(14, 10))
        # show=False keeps the figure open so we can save or return it.
        shap.summary_plot(self.shap_values, feature_names=feature_names, show=False)
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, bbox_inches='tight')
            plt.close()
            self.logger.info(f"SHAP summary plot disimpan di: {save_path}")
        else:
            return plt

    except Exception as e:
        self.logger.error(f"Error membuat SHAP summary plot: {str(e)}")
        raise
|
| 362 |
+
|
| 363 |
+
def save_model(self, save_dir, model_name=None):
    """Persist the trained model and every optional artefact alongside it.

    Writes the model pickle and best-params JSON unconditionally; CV
    results, SHAP values and the Optuna study are written only when
    present. Returns the model file path as a string.
    """
    try:
        os.makedirs(save_dir, exist_ok=True)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Default to a timestamped name so repeated saves never collide.
        model_name = model_name or f"performance_model_{stamp}"

        out_dir = Path(save_dir)
        model_path = out_dir / f"{model_name}.pkl"
        params_path = out_dir / f"{model_name}_params.json"
        cv_path = out_dir / f"{model_name}_cv_results.csv"
        shap_path = out_dir / f"{model_name}_shap_values.npy"
        study_path = out_dir / f"{model_name}_optuna_study.pkl"

        # Core artefacts: model + best hyperparameters.
        joblib.dump(self.model, model_path)
        with open(params_path, 'w') as f:
            json.dump(self.best_params, f, indent=4)

        # Optional artefacts, saved only when they exist.
        if self.cv_results is not None:
            pd.DataFrame(self.cv_results).to_csv(cv_path, index=False)
        if self.shap_values is not None:
            np.save(shap_path, self.shap_values, allow_pickle=True)
        if self.study is not None:
            joblib.dump(self.study, study_path)

        self.logger.info("\n=== MODEL DISIMPAN ===")
        self.logger.info(f"Model: {model_path}")
        self.logger.info(f"Parameter: {params_path}")
        if self.cv_results is not None:
            self.logger.info(f"Hasil CV: {cv_path}")
        if self.shap_values is not None:
            self.logger.info(f"SHAP values: {shap_path}")
        if self.study is not None:
            self.logger.info(f"Optuna study: {study_path}")

        return str(model_path)

    except Exception as e:
        self.logger.error(f"Error menyimpan model: {str(e)}")
        raise
|
Prediksi Performa Akademik/edtech/backend/src/performance_prediction/predictor.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# backend/src/performance_prediction/predictor.py
|
| 2 |
+
|
| 3 |
+
import joblib
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Union, Dict, List, Optional
|
| 9 |
+
import xgboost as xgb
|
| 10 |
+
import shap
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
|
| 13 |
+
class PerformancePredictor:
    """Serves predictions from a trained performance model.

    Loads a persisted model (and optionally its preprocessor bundle) from
    disk, exposes single and batch prediction with optional SHAP-based
    feature contributions, and can evaluate or persist itself.
    """

    def __init__(self, model_path: str, preprocessor_path: Optional[str] = None):
        """
        Initialize the predictor with a model and optional preprocessor.

        Parameters:
            model_path: Path to the trained model artefact.
            preprocessor_path: Path to the fitted preprocessor bundle (optional).
        """
        self.model_path = model_path
        self.preprocessor_path = preprocessor_path
        self.model = None
        self.preprocessor = None
        self.feature_names = None  # populated from the preprocessor bundle, if provided
        self.shap_explainer = None
        self.logger = self._setup_logger()
        self._load_components()

    def _setup_logger(self):
        """Set up (or reuse) the module logger for the predictor."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # BUGFIX: only attach a handler once. Previously every
        # PerformancePredictor instantiation added another StreamHandler to
        # the same named logger, duplicating every log line.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def _load_components(self):
        """Load the model and, if configured, the preprocessor bundle."""
        try:
            # Load model
            self.model = joblib.load(self.model_path)
            self.logger.info(f"Model berhasil dimuat dari {self.model_path}")

            # The preprocessor artefact is expected to be a dict carrying
            # both the fitted transformer and the training feature order.
            if self.preprocessor_path:
                processor_data = joblib.load(self.preprocessor_path)
                self.preprocessor = processor_data['preprocessor']
                self.feature_names = processor_data['feature_names']
                self.logger.info(f"Preprocessor berhasil dimuat dari {self.preprocessor_path}")

            # Best-effort SHAP explainer for interpretability.
            self._setup_shap_explainer()

        except Exception as e:
            self.logger.error(f"Gagal memuat komponen: {str(e)}")
            raise

    def _setup_shap_explainer(self):
        """Prepare a SHAP explainer for model interpretation (best effort)."""
        try:
            # BUGFIX: the original if/else on hasattr(model, 'predict_proba')
            # had two byte-identical branches; shap.Explainer auto-selects
            # the appropriate algorithm either way, so the branch is removed.
            self.shap_explainer = shap.Explainer(self.model)
            self.logger.info("SHAP explainer berhasil diinisialisasi")
        except Exception as e:
            self.logger.warning(f"Tidak dapat menginisialisasi SHAP explainer: {str(e)}")
            self.shap_explainer = None

    def _prepare_input(self, input_data: Union[Dict, List[Dict]], return_dataframe: bool = False) -> Union[np.ndarray, pd.DataFrame]:
        """
        Prepare input data for prediction.

        Parameters:
            input_data: Input as a dict, list of dicts, or DataFrame.
            return_dataframe: If True return a DataFrame, else an array.

        Returns:
            Processed data as an array or DataFrame.

        Raises:
            ValueError: If the input type is unsupported or expected
                columns are missing.
        """
        # Normalize the input into a DataFrame.
        if isinstance(input_data, dict):
            input_df = pd.DataFrame([input_data])
        elif isinstance(input_data, list):
            input_df = pd.DataFrame(input_data)
        elif isinstance(input_data, pd.DataFrame):
            input_df = input_data.copy()
        else:
            raise ValueError("Input harus berupa dict, list of dicts, atau DataFrame")

        # Validate and order columns to match the training feature order.
        if self.feature_names is not None:
            missing_cols = set(self.feature_names) - set(input_df.columns)
            if missing_cols:
                raise ValueError(f"Kolom berikut tidak ditemukan dalam input: {missing_cols}")

            input_df = input_df[self.feature_names]

        # Apply the preprocessor when one was loaded.
        if self.preprocessor is not None:
            processed_data = self.preprocessor.transform(input_df)
        else:
            processed_data = input_df.values if not return_dataframe else input_df

        # NOTE(review): with return_dataframe=True the *unpreprocessed*
        # DataFrame is returned even if a preprocessor ran — confirm that
        # callers relying on this flag expect raw values.
        return processed_data if not return_dataframe else input_df

    def predict(self, input_data: Union[Dict, List[Dict]],
                return_contributions: bool = False) -> Dict:
        """
        Make predictions from input data, with optional interpretation.

        Parameters:
            input_data: Input as a dict or list of dicts.
            return_contributions: If True, include per-feature contributions.

        Returns:
            Dict with predictions, confidence intervals, timing metadata
            and (optionally) feature contributions.
        """
        start_time = datetime.now()

        try:
            processed_input = self._prepare_input(input_data)

            # Native Boosters need a DMatrix; sklearn-style models take arrays.
            if isinstance(self.model, xgb.Booster):
                dmatrix = xgb.DMatrix(processed_input)
                predictions = self.model.predict(dmatrix)
            else:
                predictions = self.model.predict(processed_input)

            # Confidence intervals (simplified).
            if hasattr(self.model, 'predict_quantiles'):
                quantiles = self.model.predict_quantiles(processed_input, quantiles=(0.025, 0.975))
                confidence_intervals = list(zip(quantiles[0], quantiles[1]))
            else:
                # Fallback: +/- 1.96 * std of the batch's predictions.
                # NOTE(review): this is the spread across samples, not a
                # per-sample predictive interval — treat it as a heuristic.
                std_dev = np.std(predictions)
                confidence_intervals = [(p - 1.96*std_dev, p + 1.96*std_dev) for p in predictions]

            # Per-feature SHAP contributions, when requested and available.
            feature_contributions = None
            if return_contributions and self.shap_explainer is not None:
                feature_contributions = self._calculate_feature_contributions(processed_input)

            exec_time = (datetime.now() - start_time).total_seconds()

            # Make the payload JSON-friendly.
            if isinstance(predictions, np.ndarray) and predictions.ndim == 1:
                predictions = predictions.tolist()

            result = {
                'predictions': predictions,
                'confidence_intervals': confidence_intervals,
                'execution_time_seconds': exec_time,
                'timestamp': start_time.isoformat()
            }

            if feature_contributions is not None:
                result['feature_contributions'] = feature_contributions

            return result

        except Exception as e:
            self.logger.error(f"Error dalam prediksi: {str(e)}")
            raise

    def _calculate_feature_contributions(self, processed_input: np.ndarray) -> List[Dict]:
        """
        Compute per-feature contributions using SHAP values.

        Parameters:
            processed_input: Input data that has already been processed.

        Returns:
            Per-sample lists of feature contributions, sorted by absolute
            contribution (largest first), or None if no explainer is set.
        """
        if self.shap_explainer is None:
            return None

        # NOTE(review): this assumes self.feature_names was loaded from the
        # preprocessor bundle; without a preprocessor it is None and the
        # enumerate below raises — confirm callers always supply one.
        shap_values = self.shap_explainer(processed_input)

        contributions = []
        for i in range(len(processed_input)):
            sample_contributions = []

            for j, feature_name in enumerate(self.feature_names):
                sample_contributions.append({
                    'feature': feature_name,
                    'value': processed_input[i][j] if isinstance(processed_input, np.ndarray) else processed_input.iloc[i][j],
                    'contribution': float(shap_values.values[i][j]),
                    'abs_contribution': float(np.abs(shap_values.values[i][j]))
                })

            # Largest absolute impact first.
            sample_contributions.sort(key=lambda x: x['abs_contribution'], reverse=True)
            contributions.append(sample_contributions)

        return contributions

    def batch_predict(self, input_data: List[Dict], batch_size: int = 100,
                      return_contributions: bool = False) -> Dict:
        """
        Predict in batches for efficiency.

        Parameters:
            input_data: List of dicts with input data.
            batch_size: Number of samples per prediction batch.
            return_contributions: If True, compute feature contributions.

        Returns:
            Dict with all predictions plus timing metadata.
        """
        start_time = datetime.now()
        total_samples = len(input_data)
        results = []

        self.logger.info(f"Memulai batch prediction untuk {total_samples} sampel (batch_size={batch_size})")

        for i in range(0, total_samples, batch_size):
            batch = input_data[i:i+batch_size]
            try:
                batch_result = self.predict(batch, return_contributions)
                results.extend(batch_result['predictions'])
            except Exception as e:
                self.logger.error(f"Error pada batch {i//batch_size}: {str(e)}")
                raise

        exec_time = (datetime.now() - start_time).total_seconds()
        # BUGFIX: guard against ZeroDivisionError on an empty input list.
        avg_time_per_sample = exec_time / total_samples if total_samples else 0.0

        self.logger.info(
            f"Batch prediction selesai. Total waktu: {exec_time:.2f} detik "
            f"({avg_time_per_sample:.4f} detik/sampel)"
        )

        return {
            'predictions': results,
            'total_samples': total_samples,
            'total_time_seconds': exec_time,
            'avg_time_per_sample': avg_time_per_sample,
            'timestamp': start_time.isoformat()
        }

    def evaluate_model(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict:
        """
        Evaluate the model on a test dataset.

        Parameters:
            X_test: Test feature data.
            y_test: Test targets.

        Returns:
            Dict of evaluation metrics.
        """
        from .evaluator import PerformanceEvaluator

        evaluator = PerformanceEvaluator(y_test, self.predict(X_test)['predictions'],
                                         self.model, X_test)
        return evaluator.metrics

    def save_predictor(self, save_dir: str):
        """
        Persist this predictor object for later reuse.

        Parameters:
            save_dir: Directory to save the predictor into.

        Returns:
            Path of the written pickle file, as a string.
        """
        save_path = Path(save_dir)
        save_path.mkdir(parents=True, exist_ok=True)

        # Timestamped file name to avoid clobbering earlier saves.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        save_file = save_path / f"predictor_{timestamp}.pkl"

        joblib.dump(self, save_file)
        self.logger.info(f"Predictor disimpan di: {save_file}")

        return str(save_file)
|
Prediksi Performa Akademik/edtech/backend/src/train_performance_predictor.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# backend/src/train_performance_predictor.py
|
| 2 |
+
import numpy as np
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import joblib
|
| 10 |
+
import matplotlib.pyplot as plt
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
|
| 13 |
+
# Setup logging: mirror every log record to stdout and to training.log
# (created in the current working directory).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('training.log')
    ]
)
logger = logging.getLogger(__name__)

# Setup paths: add this file's directory to sys.path so the sibling
# performance_prediction package is importable when run as a script.
current_dir = Path(__file__).parent
sys.path.append(str(current_dir))
|
| 27 |
+
|
| 28 |
+
from performance_prediction.data_processor import PerformanceDataProcessor
|
| 29 |
+
from performance_prediction.model_trainer import PerformanceModelTrainer
|
| 30 |
+
from performance_prediction.evaluator import PerformanceEvaluator
|
| 31 |
+
|
| 32 |
+
def main():
    """Run the full training pipeline.

    Steps: resolve paths, load and split the data, train the XGBoost model,
    evaluate it, write metrics and diagnostic plots, then persist the model
    and the fitted data processor. Exits with status 1 on any failure.
    """
    try:
        logger.info("=== MEMULAI PELATIHAN MODEL PREDIKSI PERFORMA ===")

        # Resolve project-relative paths.
        BASE_DIR = current_dir.parent.parent
        DATA_PATH = BASE_DIR / "backend/data/processed/cleaned_education_data.csv"  # Ensure this is the correct path
        MODEL_SAVE_DIR = BASE_DIR / "models/performance_predictor/trained_model"
        LOG_DIR = BASE_DIR / "models/performance_predictor/training_logs"
        CONFIG_PATH = BASE_DIR / "config/model_config.json"

        # Create output directories if they do not exist yet.
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
        os.makedirs(LOG_DIR, exist_ok=True)

        # 1. Data preparation
        logger.info("\n=== MEMUAT DAN MEMPROSES DATA ===")
        processor = PerformanceDataProcessor(DATA_PATH, CONFIG_PATH)
        data = processor.load_data()

        if data is None or data.empty:
            logger.error("Data kosong atau gagal dimuat")
            # BUGFIX: exit non-zero so the failure is visible to callers
            # (previously this returned normally and the script exited 0).
            sys.exit(1)

        # Build the feature matrix and target vector.
        features, target = processor.prepare_features_target(data)

        # Train / validation / test split.
        X_train, X_val, X_test, y_train, y_val, y_test = processor.split_data(
            test_size=0.2,
            val_size=0.2
        )

        # Optionally subsample for quick smoke tests:
        # X_train, y_train = X_train[:1000], y_train[:1000]
        # X_val, y_val = X_val[:1000], y_val[:1000]

        # 2. Model training
        logger.info("\n=== MELATIH MODEL ===")
        trainer = PerformanceModelTrainer()

        # Conservative default hyperparameters (no tuning run here).
        best_params = {
            'max_depth': 6,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'min_child_weight': 1,
            'gamma': 0
        }

        model = trainer.train_model(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            params=best_params
        )

        # 3. Model evaluation
        logger.info("\n=== EVALUASI MODEL ===")
        evaluation = trainer.evaluate_model(X_test, y_test)

        logger.info("\n=== DETAIL EVALUASI ===")
        logger.info(f"Contoh 5 prediksi pertama: {evaluation['predictions'][:5]}")
        logger.info(f"Contoh 5 nilai sebenarnya: {y_test[:5]}")
        logger.info(f"Perbedaan prediksi dan aktual: {np.abs(y_test[:5] - evaluation['predictions'][:5])}")

        # Persist the evaluation metrics.
        metrics = evaluation['metrics']
        # BUGFIX(style): the open/json.dump was crammed onto one line.
        with open(MODEL_SAVE_DIR / "model_metrics.json", 'w') as f:
            json.dump(metrics, f, indent=4)

        # Evaluation visualisations.
        evaluator = PerformanceEvaluator(y_test, evaluation['predictions'])

        plots = {
            "residual_plot": evaluator.plot_residuals(),
            "actual_vs_predicted": evaluator.plot_actual_vs_predicted(),
            "error_distribution": evaluator.plot_error_distribution()
        }

        for name, plot in plots.items():
            plot_path = LOG_DIR / f"{name}.png"
            plot.savefig(plot_path, bbox_inches='tight')
            plt.close()
            logger.info(f"Plot {name} disimpan di: {plot_path}")

        # Plots produced by the trainer itself.
        trainer.plot_learning_curve(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            save_path=LOG_DIR / "learning_curve.png"
        )

        feature_plot, importance_df = trainer.plot_feature_importance(
            feature_names=processor.features.columns,
            save_path=LOG_DIR / "feature_importance.png"
        )

        # Persist the feature importance table.
        importance_df.to_csv(LOG_DIR / "feature_importance.csv", index=False)

        # SHAP summary plot (best effort; skipped with a warning on failure).
        try:
            trainer.plot_shap_summary(
                feature_names=processor.features.columns,
                save_path=LOG_DIR / "shap_summary.png"
            )
        except Exception as e:
            logger.warning(f"Tidak dapat membuat SHAP plot: {str(e)}")

        # 4. Persist model and processor
        logger.info("\n=== MENYIMPAN MODEL ===")
        model_path = trainer.save_model(MODEL_SAVE_DIR)
        processor_path = processor.save_processor(MODEL_SAVE_DIR)

        logger.info("\n=== PELATIHAN SELESAI ===")
        logger.info(f"Model disimpan di: {model_path}")
        logger.info(f"Processor disimpan di: {processor_path}")
        print(f"Log dan visualisasi disimpan di: {LOG_DIR}")

    except Exception as e:
        # BUGFIX: log the full traceback and exit non-zero so schedulers/CI
        # see the failure (previously the error was swallowed and the script
        # exited with status 0).
        logger.exception(f"Terjadi kesalahan saat melatih model: {str(e)}")
        sys.exit(1)
|
| 162 |
+
|
| 163 |
+
# Script entry point: run the full training pipeline when executed directly.
if __name__ == "__main__":
    main()
|