Spaces:
Sleeping
Sleeping
Commit ·
48b5cb1
0
Parent(s):
awal bukanlah akhir
Browse files- .gitignore +13 -0
- app.py +335 -0
- config.py +170 -0
- data_loader.py +88 -0
- data_processor.py +350 -0
- data_validator.py +467 -0
- evaluator.py +103 -0
- prophet_predictor.py +253 -0
- requirements.txt +10 -0
- utils.py +23 -0
.gitignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
article/
|
| 3 |
+
docs/
|
| 4 |
+
output/
|
| 5 |
+
validation_output/
|
| 6 |
+
venv/
|
| 7 |
+
backtesting.py
|
| 8 |
+
data_exploration.ipynb
|
| 9 |
+
main.py
|
| 10 |
+
optimize_data.py
|
| 11 |
+
WORKFLOW.md
|
| 12 |
+
data/
|
| 13 |
+
hf_cache/
|
app.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging
import os
import sys
from pathlib import Path

import gradio as gr
import pandas as pd
from typing import Optional, Tuple

from config import Config
from data_processor import DataProcessor
from evaluator import Evaluator
from prophet_predictor import ProphetPredictor
from utils import setup_logging

setup_logging("INFO")
logger = logging.getLogger("GradioApp")

# Module-level singletons, populated once by initialize_system() at startup
# and read by the Gradio callbacks below.
_processor: Optional[DataProcessor] = None
_predictor: Optional[ProphetPredictor] = None
_config: Optional[Config] = None
_df_enrollment: Optional[pd.DataFrame] = None
_elective_codes: Optional[set] = None
# Cached backtest metrics so the expensive backtest runs only once per process.
_backtest_metrics: Optional[dict] = None
def initialize_system() -> bool:
    """Initialize the prediction system (called once at startup).

    Loads and preprocesses the enrollment workbook, trains the
    student-population model, and fills the module-level singletons.

    Returns:
        True on success; False on any failure (the error is logged,
        never raised, so the app can still start and show an error).
    """
    global _processor, _predictor, _config, _df_enrollment, _elective_codes, _backtest_metrics

    try:
        logger.info("Initializing prediction system...")
        _config = Config()

        # Load all Excel sheets and derive the elective course codes.
        _processor = DataProcessor(_config)
        _df_enrollment, _elective_codes = _processor.load_and_process()

        # Train the student-population model on the yearly headcount sheet.
        _predictor = ProphetPredictor(_config)
        _predictor.train_student_population_model(
            _processor.raw_data["students_yearly"]
        )

        logger.info("✓ System initialized successfully")
        return True
    except Exception as e:
        logger.error(f"Failed to initialize system: {e}", exc_info=True)
        return False
def _format_prediction_table(df: pd.DataFrame, columns: list, headers: list) -> pd.DataFrame:
    """Shape a prediction DataFrame for display.

    Selects ``columns`` from ``df``, renames them to ``headers``, rounds
    'Predicted Students' to one decimal, casts 'Recommended Quota' to int,
    and sorts by predicted demand (descending).

    Args:
        df: Prediction rows containing at least ``columns``.
        columns: Source column names to keep, in order.
        headers: Display names for those columns; must include
            'Predicted Students' and 'Recommended Quota'.

    Returns:
        A new, display-ready DataFrame (the input is not mutated).
    """
    display = df[columns].copy()
    display.columns = headers
    display['Predicted Students'] = display['Predicted Students'].round(1)
    display['Recommended Quota'] = display['Recommended Quota'].astype(int)
    return display.sort_values('Predicted Students', ascending=False)


def generate_predictions(year: int, semester: int) -> Tuple[str, Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """
    Generate enrollment predictions for a given year and semester.

    Args:
        year: Target year (e.g., 2025)
        semester: Target semester (1 = Ganjil/Odd, 2 = Genap/Even)

    Returns:
        Tuple of (summary_text, recommendations_df, all_predictions_df).
        On validation or runtime error the first element is an error
        message and the DataFrames are None.
    """
    global _processor, _predictor, _config, _df_enrollment, _elective_codes, _backtest_metrics

    try:
        # Input validation happens before any global state is touched.
        if semester not in [1, 2]:
            return "❌ Error: Semester must be 1 (Ganjil) or 2 (Genap)", None, None

        if year < 2020 or year > 2030:
            return "❌ Error: Year must be between 2020 and 2030", None, None

        if _config is None or _predictor is None or _processor is None or _df_enrollment is None or _elective_codes is None:
            return "❌ Error: System not initialized. Please restart the app.", None, None

        logger.info(f"Generating predictions for {year} Semester {semester}...")

        _config.prediction.PREDICT_YEAR = year
        _config.prediction.PREDICT_SEMESTER = semester

        # The backtest is expensive; run it once and cache the metrics.
        if _backtest_metrics is None:
            logger.info("Running backtest for the first time...")
            evaluator = Evaluator(_config)
            backtest_results = evaluator.run_backtest(_df_enrollment, _predictor)

            if backtest_results is None or len(backtest_results) == 0:
                logger.warning("Backtest returned no results, using defaults")
                _backtest_metrics = {'mae': 0, 'rmse': 0}
            else:
                _backtest_metrics = evaluator.generate_metrics(backtest_results)
                if _backtest_metrics is None:
                    logger.warning("Metrics calculation failed, using defaults")
                    _backtest_metrics = {'mae': 0, 'rmse': 0}
        else:
            logger.info("Using cached backtest metrics")

        metrics = _backtest_metrics

        predictions = _predictor.generate_batch_predictions(
            _df_enrollment,
            _processor.raw_data["courses"],
            _elective_codes,
            year,
            semester,
        )

        # "BUKA" (open) marks courses the model recommends offering.
        recommended = predictions[predictions["recommendation"] == "BUKA"].copy()

        semester_name = "Ganjil (Odd)" if semester == 1 else "Genap (Even)"
        summary = f"""
## 📊 Prediction Summary for {year} Semester {semester_name}

### Model Performance (Backtest)
- **Mean Absolute Error (MAE)**: {metrics['mae']:.2f} students
- **Root Mean Squared Error (RMSE)**: {metrics['rmse']:.2f} students

### Recommendations
- **Courses to Open**: {len(recommended)}
- **Total Seats Needed**: {int(recommended['recommended_quota'].sum()) if not recommended.empty else 0}
- **Estimated Students**: {int(recommended['predicted_enrollment'].sum()) if not recommended.empty else 0}

### Top Course
"""

        if not recommended.empty:
            top_course = recommended.iloc[0]
            summary += f"- **{top_course['nama_mk']}** ({top_course['kode_mk']})\n"
            summary += f"  - Predicted: {top_course['predicted_enrollment']:.0f} students\n"
            summary += f"  - Recommended Quota: {top_course['recommended_quota']:.0f} seats"
        else:
            summary += "- No courses recommended to open"

        # Shared formatting for both tables (was duplicated inline).
        if not recommended.empty:
            recommended_display = _format_prediction_table(
                recommended,
                ['kode_mk', 'nama_mk', 'predicted_enrollment', 'recommended_quota', 'strategy'],
                ['Course Code', 'Course Name', 'Predicted Students', 'Recommended Quota', 'Prediction Strategy'],
            )
        else:
            recommended_display = pd.DataFrame()

        # All predictions
        all_predictions_display = _format_prediction_table(
            predictions,
            ['kode_mk', 'nama_mk', 'predicted_enrollment', 'recommended_quota', 'recommendation', 'strategy'],
            ['Course Code', 'Course Name', 'Predicted Students', 'Recommended Quota', 'Recommendation', 'Strategy'],
        )

        logger.info("✓ Predictions generated successfully")
        return summary, recommended_display, all_predictions_display

    except Exception as e:
        error_msg = f"❌ Error generating predictions: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return error_msg, None, None
def get_data_info() -> str:
    """Get information about the loaded dataset.

    Returns:
        A Markdown report of the course catalog and student-population
        sheets, or a short error string when the system/data is missing.
    """
    global _processor, _config

    try:
        if _processor is None or _config is None:
            return "❌ System not initialized"

        courses = _processor.raw_data.get("courses")
        students = _processor.raw_data.get("students_yearly")

        if courses is None or students is None:
            return "❌ Data not loaded"

        # Get elective courses
        # kategori_mk == 'P' marks electives (see config.ELECTIVE_CATEGORY).
        elective_courses = courses[courses["kategori_mk"] == "P"]

        info = f"""
## 📁 Dataset Information

### Course Catalog
- **Total Courses**: {len(courses)}
- **Elective Courses**: {len(elective_courses)}
- **Mandatory Courses**: {len(courses) - len(elective_courses)}

### Student Population
- **Years Available**: {students['thn'].min()} - {students['thn'].max()}
- **Total Records**: {len(students)}

### Data Source
- File: `{_config.data.FILE_PATH}`
- Last Updated: October 8, 2025
"""
        return info

    except Exception as e:
        return f"❌ Error getting data info: {str(e)}"
# Initialize system at startup
logger.info("Starting Gradio app...")
init_success = initialize_system()

if not init_success:
    # The UI is still built so the user sees the error messages from callbacks.
    logger.error("Failed to initialize system. App may not work correctly.")

# Create Gradio Interface
with gr.Blocks(title="SKS Enrollment Predictor") as demo:

    # Show disclaimer banner if using demo data
    if os.getenv("DEMO_MODE", "false").lower() == "true":
        gr.Markdown(
            """
<div style='padding: 15px; background-color: #fff3cd; border-left: 5px solid #ffc107; margin-bottom: 20px;'>
<h3 style='margin-top: 0; color: #856404;'>⚠️ Demo Version - Anonymized Data</h3>
<p style='margin-bottom: 0; color: #856404;'>
This demonstration uses <strong>anonymized enrollment data</strong> to protect student privacy.
All predictions and functionality are identical to the production version.
</p>
<details style='margin-top: 10px;'>
<summary style='cursor: pointer; color: #856404;'><strong>Changes made for demo:</strong></summary>
<ul style='color: #856404;'>
<li>Student IDs replaced with anonymous codes (STU000001, STU000002, ...)</li>
<li>Population counts have ±3% random noise added</li>
<li>Course information and enrollment patterns fully preserved</li>
</ul>
</details>
</div>
            """,
            sanitize_html=False
        )

    with gr.Tabs():
        with gr.Tab("Generate Predictions"):

            with gr.Row():
                with gr.Column(scale=1):
                    # Prediction inputs: year and semester selectors.
                    year_input = gr.Number(
                        label="Target Year",
                        value=2025,
                        precision=0,
                        minimum=2020,
                        maximum=2030,
                        info="Masukkan tahun yang ingin diprediksi"
                    )

                    semester_input = gr.Radio(
                        choices=[1, 2],
                        label="Semester",
                        value=2,
                        info="1 = Ganjil, 2 = Genap"
                    )

                    predict_btn = gr.Button(
                        "Generate Predictions",
                        variant="primary",
                        size="lg"
                    )

                with gr.Column(scale=2):
                    summary_output = gr.Markdown(
                        label="Summary",
                        value="Click 'Generate Predictions' to start"
                    )

            gr.Markdown("### Recommended Courses to Open")
            recommended_output = gr.Dataframe(
                label="Courses Recommended to Open",
                wrap=True,
                interactive=False
            )

            with gr.Accordion("View All Predictions", open=False):
                all_predictions_output = gr.Dataframe(
                    label="All Elective Courses",
                    wrap=True,
                    interactive=False
                )

        with gr.Tab("Data Information"):
            # NOTE(review): empty Markdown component — looks like a leftover
            # placeholder; confirm whether content was intended here.
            gr.Markdown(
            )

            data_info_btn = gr.Button("Refresh Data Info", variant="secondary")
            data_info_output = gr.Markdown()

            data_info_btn.click(
                fn=get_data_info,
                inputs=[],
                outputs=data_info_output
            )

    # Populate the data-info tab automatically when the page loads.
    demo.load(fn=get_data_info, inputs=[], outputs=data_info_output)


    predict_btn.click(
        fn=generate_predictions,
        inputs=[year_input, semester_input],
        outputs=[summary_output, recommended_output, all_predictions_output]
    )

    # Footer
    if os.getenv("DEMO_MODE", "false").lower() == "true":
        gr.Markdown(
            """
---
<div style='text-align: center; color: #666; font-size: 0.9em;'>
📊 Demo Version with Anonymized Data | For Educational Purposes
</div>
            """
        )
    else:
        gr.Markdown(
            """
---
<div style='text-align: center; color: #666; font-size: 0.9em;'>
🔒 Private & Confidential | For Authorized Use Only
</div>
            """
        )

# Launch the app
if __name__ == "__main__":
    # 0.0.0.0 so the app is reachable inside the Spaces container on port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
config.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from dataclasses import dataclass, field
from typing import Dict, List
import os

# Import data loader for private HF dataset support
try:
    from data_loader import load_data_file
    DATA_LOADER_AVAILABLE = True
except ImportError:
    # data_loader.py missing: fall back to the local workbook path so the
    # rest of the configuration still works.
    DATA_LOADER_AVAILABLE = False
    def load_data_file() -> str:
        """Fallback if data_loader not available."""
        return "data/optimized_data.xlsx"
def _get_data_file_path() -> str:
    """
    Resolve the enrollment data file path from the environment.

    Resolution order:
    1. ``HF_TOKEN`` set       -> private HF dataset (muhalwan/optimized_data_mhs)
    2. ``DEMO_MODE=true``     -> anonymized demo workbook
    3. otherwise              -> local optimized_data.xlsx
    """
    if os.getenv("HF_TOKEN"):
        # Delegates to data_loader (or its import-error fallback shim).
        return load_data_file()

    demo_enabled = os.getenv("DEMO_MODE", "false").lower() == "true"
    if demo_enabled:
        return "data/demo_data.xlsx"

    return "data/optimized_data.xlsx"
@dataclass
class DataConfig:
    """Data source configuration and validation rules."""

    # Data file path - automatically determined based on environment
    # (HF dataset, demo workbook, or local file; see _get_data_file_path).
    FILE_PATH: str = field(default_factory=_get_data_file_path)

    # Sheet mappings (Excel sheet names inside the workbook)
    SHEET_COURSES: str = "tabel1_data_matkul"
    SHEET_OFFERINGS: str = "tabel2_data_matkul_dibuka"
    SHEET_STUDENTS_YEARLY: str = "tabel3_data_mahasiswa_per_tahun"
    SHEET_STUDENTS_INDIVIDUAL: str = "tabel4_data_individu_mahasiswa"

    # Standardization: rename offering-sheet columns to match the other
    # sheets (tahun -> thn, semester -> smt).
    OFFERINGS_RENAME: Dict[str, str] = field(
        default_factory=lambda: {"tahun": "thn", "semester": "smt"}
    )

    # Elective Course Identification
    # IMPORTANT: Elective courses are identified by kategori_mk = 'P' in tabel1
    # Mandatory/Required courses have kategori_mk = 'W'
    ELECTIVE_CATEGORY: str = "P"
    MANDATORY_CATEGORY: str = "W"

    # Valid category values (will be normalized to uppercase)
    VALID_CATEGORIES: List[str] = field(default_factory=lambda: ["P", "W"])
@dataclass
class ModelConfig:
    """Prophet model hyperparameters and prediction strategies."""

    # Prophet Hyperparameters
    GROWTH_MODE: str = "logistic"
    CHANGEPOINT_SCALE: float = 0.01
    SEASONALITY_MODE: str = "multiplicative"
    YEARLY_SEASONALITY: bool = True
    # Default enrollment used when a model cannot produce a prediction.
    FALLBACK_DEFAULT: int = 20

    # Prediction safety limits
    # Maximum multiplier of historical max enrollment before flagging as unrealistic
    SANITY_CHECK_MAX_MULTIPLIER: float = 3.0
    # Minimum historical data points required for reliable prediction
    MIN_HISTORY_POINTS: int = 3
@dataclass
class PredictionConfig:
    """Business logic for predictions."""

    # Default prediction target; overridden at runtime by the app UI.
    PREDICT_YEAR: int = 2025
    PREDICT_SEMESTER: int = 2

    # Buffer Calculations
    BUFFER_PERCENT: float = 0.20
    MIN_QUOTA_OPEN: int = 25
    MIN_PREDICT_THRESHOLD: int = 15

    # Hard caps on recommended capacity.
    MAX_CAPACITY_MULTIPLIER: float = 2.0
    ABSOLUTE_MAX_STUDENTS: int = 400

    # Maps semester number to an 'MM-DD' anchor (1 -> Sep 1, 2 -> Mar 1);
    # presumably used to build time-series dates — confirm in prophet_predictor.py.
    SEMESTER_TO_MONTH: Dict[int, str] = field(
        default_factory=lambda: {
            1: "09-01",
            2: "03-01",
        }
    )
@dataclass
class OutputConfig:
    """Output settings."""

    # Directory for generated artifacts.
    OUTPUT_DIR: str = "output"
    LOG_LEVEL: str = "INFO"
    # How many rows to show in summary displays.
    TOP_N_DISPLAY: int = 30
@dataclass
class BacktestConfig:
    """Backtest settings and validation."""

    # Historical window over which backtests may run.
    START_YEAR: int = 2010
    END_YEAR: int = 2024
    VERBOSE: bool = True

    # Minimum elective enrollments required for backtesting
    MIN_ELECTIVE_ENROLLMENTS: int = 1
    # Minimum unique courses required for backtesting
    MIN_UNIQUE_COURSES: int = 1
class Config:
    """
    Master Config Object.

    ELECTIVE COURSE DEFINITION:
    ---------------------------
    Elective courses are identified by kategori_mk = 'P' in tabel1_data_matkul.
    This is the ONLY source of truth for course categories.

    Examples of elective courses (kategori_mk = 'P'):
    - EF234607: Keamanan Aplikasi
    - EF234613: Game Edukasi dan Simulasi
    - UG234922: Kebudayaan dan Kebangsaan
    - IW184301: Sistem Basis Data
    - KI series: Various computer science electives

    Mandatory courses have kategori_mk = 'W' (Wajib).

    DATA REQUIREMENTS FOR BACKTESTING:
    -----------------------------------
    To backtest a semester, you need:
    1. Course catalog (tabel1) with kategori_mk properly set
    2. ACTUAL student enrollments (tabel4) for that semester
    3. At least one elective course with enrollments

    Note: Course offerings (tabel2) alone are NOT sufficient for backtesting.
    You must have actual enrollment data (tabel4) to validate predictions.
    """

    def __init__(self):
        # One sub-config per concern; see the dataclasses defined above.
        self.data: DataConfig = DataConfig()
        self.model: ModelConfig = ModelConfig()
        self.prediction: PredictionConfig = PredictionConfig()
        self.output: OutputConfig = OutputConfig()
        self.backtest: BacktestConfig = BacktestConfig()

    def get_prediction_target_name(self) -> str:
        """Return the target as e.g. '2025 Semester Genap'."""
        sem = "Ganjil" if self.prediction.PREDICT_SEMESTER == 1 else "Genap"
        return f"{self.prediction.PREDICT_YEAR} Semester {sem}"

    def get_elective_filter_description(self) -> str:
        """Get human-readable description of elective identification."""
        return f"kategori_mk = '{self.data.ELECTIVE_CATEGORY}' in {self.data.SHEET_COURSES}"
default_config = Config()
|
data_loader.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def load_data_file() -> str:
    """Resolve and return the path to the enrollment workbook.

    Prefers downloading from the private Hugging Face dataset when
    ``HF_TOKEN`` is set; otherwise (or on download failure) falls back
    to the local file. Raises FileNotFoundError when neither source
    is available.
    """
    log = logging.getLogger(__name__)
    token = os.getenv("HF_TOKEN")

    if token:
        try:
            from huggingface_hub import hf_hub_download

            log.info("🔐 Loading data from private Hugging Face dataset...")
            log.info("   Dataset: muhalwan/optimized_data_mhs")

            cached_path = hf_hub_download(
                repo_id="muhalwan/optimized_data_mhs",
                filename="optimized_data.xlsx",
                repo_type="dataset",
                token=token,
                cache_dir="./hf_cache",
            )

            log.info("✓ Data loaded successfully from HF dataset")
            log.info(f"   Cached at: {cached_path}")
            return cached_path

        except ImportError:
            log.error("huggingface_hub not installed. Install with: pip install huggingface_hub")
            raise

        except Exception as exc:
            log.error(f"Failed to download from HF dataset: {exc}")
            log.error("Falling back to local file if available...")

    fallback = "data/optimized_data.xlsx"

    if Path(fallback).exists():
        log.info(f"📁 Loading data from local file: {fallback}")
        return fallback

    message = (
        "No data file found!\n"
        "Options:\n"
        "1. Set HF_TOKEN environment variable to load from private dataset\n"
        "2. Place optimized_data.xlsx in data/ folder for local development\n"
    )
    log.error(message)
    raise FileNotFoundError(message)
def get_data_source_info() -> dict:
    """Report which data sources are available and which one will be used."""
    token_present = bool(os.getenv("HF_TOKEN"))
    local_available = Path("data/optimized_data.xlsx").exists()

    info = {
        "hf_token_available": token_present,
        "local_file_available": local_available,
        # The HF dataset always wins when a token is present.
        "will_use_hf_dataset": token_present,
        "will_use_local": not token_present and local_available,
        "dataset_repo": "muhalwan/optimized_data_mhs" if token_present else None,
        "local_path": "data/optimized_data.xlsx" if local_available else None,
    }
    return info
if __name__ == "__main__":
    # Manual smoke test: report the available sources, then attempt a load.
    logging.basicConfig(level=logging.INFO)

    print("=" * 80)
    print("Data Source Information")
    print("=" * 80)

    info = get_data_source_info()
    for key, value in info.items():
        print(f"  {key}: {value}")

    print("\n" + "=" * 80)
    print("Attempting to load data...")
    print("=" * 80)

    try:
        file_path = load_data_file()
        print(f"\n✓ Success! Data file: {file_path}")
    except Exception as e:
        print(f"\n✗ Failed: {e}")
data_processor.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Dict, Set, Tuple
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from config import Config
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DataProcessor:
|
| 13 |
+
    def __init__(self, config: Config):
        """Store configuration and initialize empty data containers."""
        self.config = config
        # Raw sheets keyed by logical name ("courses", "offerings",
        # "students_yearly", "students_ind"), filled by _load_excel().
        self.raw_data: Dict[str, pd.DataFrame] = {}
        self.processed_data: pd.DataFrame = pd.DataFrame()
        # Codes of courses identified as electives (kategori_mk == 'P').
        self.elective_codes: Set[str] = set()
    def load_and_process(self) -> Tuple[pd.DataFrame, Set[str]]:
        """Load the Excel workbook, validate it, and preprocess.

        Returns:
            Tuple of (processed enrollment DataFrame, elective course codes).
        """
        self._load_excel()
        self._validate_raw_data()
        return self._preprocess()
    def _load_excel(self):
        """Read all four configured sheets from the workbook into raw_data."""
        logger.info(f"Loading {self.config.data.FILE_PATH}...")
        try:
            # sheet_name=None reads every sheet as a {name: DataFrame} dict.
            sheets = pd.read_excel(self.config.data.FILE_PATH, sheet_name=None)
            self.raw_data = {
                "courses": sheets[self.config.data.SHEET_COURSES],
                "offerings": sheets[self.config.data.SHEET_OFFERINGS],
                "students_yearly": sheets[self.config.data.SHEET_STUDENTS_YEARLY],
                "students_ind": sheets[self.config.data.SHEET_STUDENTS_INDIVIDUAL],
            }
        except Exception as e:
            # Log for operators, then re-raise so callers can abort startup.
            logger.error(f"Failed to load Excel: {e}")
            raise
    def _validate_raw_data(self):
        """Validate required columns and log data quality metrics.

        Raises:
            ValueError: if any sheet is missing one of its required columns.
        """
        # Minimum columns each sheet must provide for downstream processing.
        req_cols = {
            "courses": ["kode_mk", "kategori_mk"],
            "students_ind": ["kode_mk", "thn", "smt", "kode_mhs"],
            "students_yearly": ["thn", "smt", "jumlah_aktif"],
        }

        for key, cols in req_cols.items():
            if not all(col in self.raw_data[key].columns for col in cols):
                raise ValueError(f"Missing columns in {key}: {cols}")

        # Log data quality metrics
        self._log_data_quality()
def _log_data_quality(self):
|
| 54 |
+
"""Log data quality metrics for monitoring."""
|
| 55 |
+
courses_df = self.raw_data["courses"]
|
| 56 |
+
students_df = self.raw_data["students_ind"]
|
| 57 |
+
|
| 58 |
+
logger.info("=" * 60)
|
| 59 |
+
logger.info("Data Quality Report:")
|
| 60 |
+
logger.info(f" Courses (tabel1): {len(courses_df)} records")
|
| 61 |
+
logger.info(f" - Unique courses: {courses_df['kode_mk'].nunique()}")
|
| 62 |
+
logger.info(
|
| 63 |
+
f" - Duplicates: {len(courses_df) - courses_df['kode_mk'].nunique()}"
|
| 64 |
+
)
|
| 65 |
+
logger.info(f" Students (tabel4): {len(students_df)} records")
|
| 66 |
+
logger.info(f" - Unique students: {students_df['kode_mhs'].nunique()}")
|
| 67 |
+
logger.info("=" * 60)
|
| 68 |
+
|
| 69 |
+
def _clean_courses_data(self, courses: pd.DataFrame) -> pd.DataFrame:
|
| 70 |
+
"""
|
| 71 |
+
Clean and standardize course catalog data.
|
| 72 |
+
|
| 73 |
+
Cleaning steps:
|
| 74 |
+
1. Remove exact duplicates
|
| 75 |
+
2. Standardize kategori_mk values (uppercase, strip whitespace)
|
| 76 |
+
3. Remove courses with invalid/missing data
|
| 77 |
+
4. Keep first occurrence for duplicate course codes
|
| 78 |
+
5. Validate kategori_mk values
|
| 79 |
+
"""
|
| 80 |
+
initial_count = len(courses)
|
| 81 |
+
|
| 82 |
+
# Step 1: Remove exact duplicate rows
|
| 83 |
+
courses = courses.drop_duplicates()
|
| 84 |
+
if len(courses) < initial_count:
|
| 85 |
+
logger.info(
|
| 86 |
+
f" Removed {initial_count - len(courses)} exact duplicate rows"
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# Step 2: Standardize kategori_mk
|
| 90 |
+
courses["kategori_mk"] = (
|
| 91 |
+
courses["kategori_mk"]
|
| 92 |
+
.astype(str)
|
| 93 |
+
.str.upper()
|
| 94 |
+
.str.strip()
|
| 95 |
+
.replace("", np.nan)
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
# Step 3: Remove rows with missing critical data
|
| 99 |
+
before_dropna = len(courses)
|
| 100 |
+
courses = courses.dropna(subset=["kode_mk", "kategori_mk"])
|
| 101 |
+
if len(courses) < before_dropna:
|
| 102 |
+
logger.info(
|
| 103 |
+
f" Removed {before_dropna - len(courses)} rows with missing kode_mk or kategori_mk"
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
# Step 4: Validate kategori_mk values (should be P or W)
|
| 107 |
+
valid_categories = {"P", "W"}
|
| 108 |
+
invalid_mask = ~courses["kategori_mk"].isin(valid_categories)
|
| 109 |
+
if invalid_mask.any():
|
| 110 |
+
invalid_cats = courses[invalid_mask]["kategori_mk"].unique()
|
| 111 |
+
logger.warning(
|
| 112 |
+
f" Found {invalid_mask.sum()} courses with invalid categories: {invalid_cats}"
|
| 113 |
+
)
|
| 114 |
+
logger.warning(" Keeping only valid categories (P, W)")
|
| 115 |
+
courses = courses[~invalid_mask]
|
| 116 |
+
|
| 117 |
+
# Step 5: Remove duplicate course codes (keep first)
|
| 118 |
+
before_dedup = len(courses)
|
| 119 |
+
courses = courses.drop_duplicates(subset="kode_mk", keep="first")
|
| 120 |
+
if len(courses) < before_dedup:
|
| 121 |
+
logger.info(
|
| 122 |
+
f" Removed {before_dedup - len(courses)} duplicate course codes (kept first occurrence)"
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
logger.info(f" Final course catalog: {len(courses)} unique courses")
|
| 126 |
+
|
| 127 |
+
return courses
|
| 128 |
+
|
| 129 |
+
def _clean_students_data(self, students: pd.DataFrame) -> pd.DataFrame:
|
| 130 |
+
"""
|
| 131 |
+
Clean and validate student enrollment data.
|
| 132 |
+
|
| 133 |
+
Cleaning steps:
|
| 134 |
+
1. Remove rows with missing critical data
|
| 135 |
+
2. Standardize data types
|
| 136 |
+
3. Remove invalid year/semester values
|
| 137 |
+
4. Remove duplicate enrollment records
|
| 138 |
+
"""
|
| 139 |
+
initial_count = len(students)
|
| 140 |
+
|
| 141 |
+
# Step 1: Remove rows with missing critical data
|
| 142 |
+
students = students.dropna(subset=["kode_mk", "thn", "smt", "kode_mhs"])
|
| 143 |
+
if len(students) < initial_count:
|
| 144 |
+
logger.info(
|
| 145 |
+
f" Removed {initial_count - len(students)} rows with missing critical data"
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
# Step 2: Ensure correct data types
|
| 149 |
+
students["thn"] = pd.to_numeric(students["thn"], errors="coerce")
|
| 150 |
+
students["smt"] = pd.to_numeric(students["smt"], errors="coerce")
|
| 151 |
+
|
| 152 |
+
# Step 3: Remove rows with invalid year/semester after conversion
|
| 153 |
+
before_invalid = len(students)
|
| 154 |
+
students = students.dropna(subset=["thn", "smt"])
|
| 155 |
+
if len(students) < before_invalid:
|
| 156 |
+
logger.info(
|
| 157 |
+
f" Removed {before_invalid - len(students)} rows with invalid year/semester values"
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
# Step 4: Validate semester values (should be 1, 2, or 3)
|
| 161 |
+
valid_semesters = {1, 2, 3}
|
| 162 |
+
invalid_sem = ~students["smt"].isin(valid_semesters)
|
| 163 |
+
if invalid_sem.any():
|
| 164 |
+
logger.warning(
|
| 165 |
+
f" Found {invalid_sem.sum()} records with invalid semester values"
|
| 166 |
+
)
|
| 167 |
+
students = students[~invalid_sem]
|
| 168 |
+
|
| 169 |
+
# Step 5: Validate year range (reasonable academic years)
|
| 170 |
+
current_year = pd.Timestamp.now().year
|
| 171 |
+
invalid_year = (students["thn"] < 2000) | (students["thn"] > current_year + 1)
|
| 172 |
+
if invalid_year.any():
|
| 173 |
+
logger.warning(
|
| 174 |
+
f" Found {invalid_year.sum()} records with unreasonable year values"
|
| 175 |
+
)
|
| 176 |
+
students = students[~invalid_year]
|
| 177 |
+
|
| 178 |
+
# Step 6: Remove exact duplicate enrollments (same student, course, semester)
|
| 179 |
+
before_dedup = len(students)
|
| 180 |
+
students = students.drop_duplicates(
|
| 181 |
+
subset=["kode_mhs", "kode_mk", "thn", "smt"], keep="first"
|
| 182 |
+
)
|
| 183 |
+
if len(students) < before_dedup:
|
| 184 |
+
logger.info(
|
| 185 |
+
f" Removed {before_dedup - len(students)} duplicate enrollment records"
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
logger.info(f" Final enrollment records: {len(students)}")
|
| 189 |
+
|
| 190 |
+
return students
|
| 191 |
+
|
| 192 |
+
def _clean_yearly_population(self, yearly_pop: pd.DataFrame) -> pd.DataFrame:
|
| 193 |
+
"""
|
| 194 |
+
Clean and validate yearly student population data.
|
| 195 |
+
|
| 196 |
+
Cleaning steps:
|
| 197 |
+
1. Remove duplicates
|
| 198 |
+
2. Validate and fill missing population data
|
| 199 |
+
3. Ensure chronological order
|
| 200 |
+
"""
|
| 201 |
+
# Remove duplicate year-semester combinations
|
| 202 |
+
before_dedup = len(yearly_pop)
|
| 203 |
+
yearly_pop = yearly_pop.drop_duplicates(subset=["thn", "smt"], keep="first")
|
| 204 |
+
if len(yearly_pop) < before_dedup:
|
| 205 |
+
logger.info(
|
| 206 |
+
f" Removed {before_dedup - len(yearly_pop)} duplicate year-semester records"
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
# Ensure jumlah_aktif is numeric and positive
|
| 210 |
+
yearly_pop["jumlah_aktif"] = pd.to_numeric(
|
| 211 |
+
yearly_pop["jumlah_aktif"], errors="coerce"
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
# Replace zero or negative values with NaN (will be filled later)
|
| 215 |
+
yearly_pop.loc[yearly_pop["jumlah_aktif"] <= 0, "jumlah_aktif"] = np.nan
|
| 216 |
+
|
| 217 |
+
# Sort by year and semester
|
| 218 |
+
yearly_pop = yearly_pop.sort_values(["thn", "smt"]).reset_index(drop=True)
|
| 219 |
+
|
| 220 |
+
logger.info(f" Yearly population records: {len(yearly_pop)}")
|
| 221 |
+
|
| 222 |
+
return yearly_pop
|
| 223 |
+
|
| 224 |
+
def _preprocess(self) -> Tuple[pd.DataFrame, Set[str]]:
|
| 225 |
+
"""Clean, merge, and aggregate data with comprehensive cleaning."""
|
| 226 |
+
logger.info("Preprocessing data...")
|
| 227 |
+
logger.info("-" * 60)
|
| 228 |
+
|
| 229 |
+
# Step 1: Clean course catalog
|
| 230 |
+
logger.info("Step 1: Cleaning course catalog...")
|
| 231 |
+
courses = self._clean_courses_data(self.raw_data["courses"].copy())
|
| 232 |
+
|
| 233 |
+
# Step 2: Identify elective courses
|
| 234 |
+
elective_category = self.config.data.ELECTIVE_CATEGORY
|
| 235 |
+
self.elective_codes = set(
|
| 236 |
+
courses[courses["kategori_mk"] == elective_category]["kode_mk"]
|
| 237 |
+
)
|
| 238 |
+
logger.info(f"Step 2: Identified {len(self.elective_codes)} elective courses")
|
| 239 |
+
|
| 240 |
+
if len(self.elective_codes) == 0:
|
| 241 |
+
logger.warning(
|
| 242 |
+
f"No elective courses found! Check if kategori_mk = '{elective_category}' exists in data."
|
| 243 |
+
)
|
| 244 |
+
logger.warning(
|
| 245 |
+
f"Elective identification rule: {self.config.get_elective_filter_description()}"
|
| 246 |
+
)
|
| 247 |
+
return pd.DataFrame(), set()
|
| 248 |
+
|
| 249 |
+
# Step 3: Clean student enrollment data
|
| 250 |
+
logger.info("Step 3: Cleaning student enrollment data...")
|
| 251 |
+
students = self._clean_students_data(self.raw_data["students_ind"].copy())
|
| 252 |
+
|
| 253 |
+
# Step 4: Filter for elective courses only
|
| 254 |
+
students = students[students["kode_mk"].isin(self.elective_codes)]
|
| 255 |
+
logger.info(f"Step 4: Filtered to {len(students)} elective enrollment records")
|
| 256 |
+
|
| 257 |
+
if len(students) == 0:
|
| 258 |
+
logger.warning("No enrollment data found for elective courses!")
|
| 259 |
+
return pd.DataFrame(), self.elective_codes
|
| 260 |
+
|
| 261 |
+
# Step 5: Aggregate enrollment by course-semester
|
| 262 |
+
logger.info("Step 5: Aggregating enrollment data...")
|
| 263 |
+
enrollment = (
|
| 264 |
+
students.groupby(["kode_mk", "thn", "smt"])["kode_mhs"]
|
| 265 |
+
.nunique()
|
| 266 |
+
.reset_index(name="enrollment")
|
| 267 |
+
)
|
| 268 |
+
logger.info(f" Created {len(enrollment)} course-semester enrollment records")
|
| 269 |
+
|
| 270 |
+
# Step 6: Clean yearly population data
|
| 271 |
+
logger.info("Step 6: Cleaning yearly population data...")
|
| 272 |
+
yearly_pop = self._clean_yearly_population(
|
| 273 |
+
self.raw_data["students_yearly"][["thn", "smt", "jumlah_aktif"]].copy()
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
# Step 7: Merge enrollment with population data
|
| 277 |
+
logger.info("Step 7: Merging enrollment with population data...")
|
| 278 |
+
df = enrollment.merge(yearly_pop, on=["thn", "smt"], how="left")
|
| 279 |
+
|
| 280 |
+
# Step 8: Handle missing population data
|
| 281 |
+
missing_pop = df["jumlah_aktif"].isna().sum()
|
| 282 |
+
if missing_pop > 0:
|
| 283 |
+
logger.warning(
|
| 284 |
+
f" {missing_pop} records missing population data - filling with interpolation"
|
| 285 |
+
)
|
| 286 |
+
df["jumlah_aktif"] = df["jumlah_aktif"].ffill().bfill()
|
| 287 |
+
|
| 288 |
+
# If still missing, use a reasonable default
|
| 289 |
+
if df["jumlah_aktif"].isna().any():
|
| 290 |
+
default_pop = 500 # Reasonable default student population
|
| 291 |
+
logger.warning(
|
| 292 |
+
f" Some population data still missing - using default: {default_pop}"
|
| 293 |
+
)
|
| 294 |
+
df["jumlah_aktif"] = df["jumlah_aktif"].fillna(default_pop)
|
| 295 |
+
|
| 296 |
+
# Step 9: Validate enrollment data
|
| 297 |
+
logger.info("Step 8: Validating final enrollment data...")
|
| 298 |
+
df = self._validate_enrollment_data(df)
|
| 299 |
+
|
| 300 |
+
# Step 10: Sort and finalize
|
| 301 |
+
df = df.sort_values(["kode_mk", "thn", "smt"]).reset_index(drop=True)
|
| 302 |
+
self.processed_data = df
|
| 303 |
+
|
| 304 |
+
logger.info("-" * 60)
|
| 305 |
+
logger.info(
|
| 306 |
+
f"✓ Preprocessing complete. {len(df)} enrollment records generated."
|
| 307 |
+
)
|
| 308 |
+
logger.info(f"✓ Year range: {df['thn'].min():.0f} - {df['thn'].max():.0f}")
|
| 309 |
+
logger.info(f"✓ Courses with data: {df['kode_mk'].nunique()}")
|
| 310 |
+
logger.info("-" * 60)
|
| 311 |
+
|
| 312 |
+
return df, self.elective_codes
|
| 313 |
+
|
| 314 |
+
def _validate_enrollment_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 315 |
+
"""
|
| 316 |
+
Validate and clean the final enrollment dataset.
|
| 317 |
+
|
| 318 |
+
Checks:
|
| 319 |
+
1. Remove records with zero enrollment
|
| 320 |
+
2. Check for outliers
|
| 321 |
+
3. Validate population data
|
| 322 |
+
"""
|
| 323 |
+
initial_count = len(df)
|
| 324 |
+
|
| 325 |
+
# Remove zero enrollments
|
| 326 |
+
df = df[df["enrollment"] > 0]
|
| 327 |
+
if len(df) < initial_count:
|
| 328 |
+
logger.info(
|
| 329 |
+
f" Removed {initial_count - len(df)} records with zero enrollment"
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
# Check for extreme outliers in enrollment
|
| 333 |
+
for course in df["kode_mk"].unique():
|
| 334 |
+
course_data = df[df["kode_mk"] == course]["enrollment"]
|
| 335 |
+
if len(course_data) > 1:
|
| 336 |
+
q75, q25 = course_data.quantile([0.75, 0.25])
|
| 337 |
+
iqr = q75 - q25
|
| 338 |
+
upper_bound = q75 + (3 * iqr) # Using 3*IQR for outliers
|
| 339 |
+
|
| 340 |
+
outliers = course_data > upper_bound
|
| 341 |
+
if outliers.any():
|
| 342 |
+
logger.debug(
|
| 343 |
+
f" Course {course} has {outliers.sum()} potential outliers (keeping them)"
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
# Ensure population is reasonable
|
| 347 |
+
if (df["jumlah_aktif"] < 50).any():
|
| 348 |
+
logger.warning(" Some semesters have very low student population (<50)")
|
| 349 |
+
|
| 350 |
+
return df
|
data_validator.py
ADDED
|
@@ -0,0 +1,467 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Validation Utility
|
| 3 |
+
|
| 4 |
+
Provides pre-flight checks and data quality validation for the enrollment prediction system.
|
| 5 |
+
This module validates data availability, quality, and completeness before processing.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Dict, List, Optional, Tuple
|
| 11 |
+
|
| 12 |
+
import pandas as pd
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class ValidationResult:
    """Result of a validation check."""

    # Whether the check passed. Checks reported with severity "WARNING"
    # may set passed=True even though they flag an issue (the issue is
    # expected to be fixed during cleaning).
    passed: bool
    # Human-readable description of the check outcome.
    message: str
    severity: str = "INFO"  # INFO, WARNING, ERROR
    # Optional structured payload (record counts, offending values, ...).
    details: Optional[Dict] = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass
class SemesterDataStatus:
    """Status of data availability for a specific semester."""

    # Academic year that was checked.
    year: int
    # Semester number that was checked.
    semester: int
    # True if tabel2 lists at least one course offering for this semester.
    has_offerings: bool
    # True if tabel4 has at least one enrollment record for this semester.
    has_enrollments: bool
    # True if at least one of those enrollments is for an elective course.
    has_elective_enrollments: bool
    # Total enrollment records in this semester (all courses).
    total_enrollments: int
    # Enrollment records restricted to elective courses.
    elective_enrollments: int
    # NOTE(review): despite the List[str] annotation, the producer
    # (DataValidator.check_semester_data_availability) assigns a dict of
    # {course_code: unique-student-count} when electives exist, and []
    # otherwise — confirm callers before tightening this type.
    elective_courses: List[str]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class DataValidator:
    """Validates data quality and availability for the enrollment prediction system.

    Loads the four expected workbook sheets (course catalog, offerings,
    yearly population, individual enrollments) and runs structural and
    content checks, accumulating ValidationResult entries. Only results
    with severity "ERROR" fail the overall validation.
    """

    def __init__(self, file_path: str):
        """
        Initialize the validator.

        Args:
            file_path: Path to the Excel data file
        """
        self.file_path = file_path
        # Accumulated results; appended to by each _validate_* check.
        self.validation_results: List[ValidationResult] = []

    def validate_all(self) -> Tuple[bool, List[ValidationResult]]:
        """
        Run all validation checks.

        Returns:
            Tuple of (all_passed, list of validation results)
        """
        logger.info("Running comprehensive data validation...")

        # Load raw data (always reloads, even if already cached on self)
        try:
            self.raw_data = self._load_raw_data()
        except Exception as e:
            self.validation_results.append(
                ValidationResult(
                    passed=False,
                    message=f"Failed to load data: {str(e)}",
                    severity="ERROR",
                )
            )
            return False, self.validation_results

        # Run validation checks
        self._validate_file_structure()
        self._validate_course_catalog()
        self._validate_elective_courses()
        self._validate_enrollment_data()
        self._validate_population_data()

        # Overall result: only ERROR-severity results gate the outcome;
        # WARNINGs and INFOs never fail validation.
        all_passed = all(
            r.passed for r in self.validation_results if r.severity == "ERROR"
        )

        return all_passed, self.validation_results

    def check_semester_data_availability(
        self, year: int, semester: int
    ) -> SemesterDataStatus:
        """
        Check data availability for a specific semester.

        Args:
            year: Academic year
            semester: Semester (1 or 2)

        Returns:
            SemesterDataStatus object with detailed availability info.
            Note: its elective_courses field is populated with a dict of
            {course_code: unique-student-count} (sorted descending) when
            elective enrollments exist, and [] otherwise.
        """
        # Lazily load the workbook if validate_all() was not run first.
        if not hasattr(self, "raw_data"):
            self.raw_data = self._load_raw_data()

        # Check course offerings (tabel2)
        offerings = self.raw_data["offerings"]
        has_offerings = (
            len(
                offerings[
                    (offerings["tahun"] == year) & (offerings["semester"] == semester)
                ]
            )
            > 0
        )

        # Check enrollments (tabel4)
        students = self.raw_data["students"]
        semester_enrollments = students[
            (students["thn"] == year) & (students["smt"] == semester)
        ]
        has_enrollments = len(semester_enrollments) > 0

        # Check elective enrollments
        elective_codes = self._get_elective_codes()
        elective_enrollments = semester_enrollments[
            semester_enrollments["kode_mk"].isin(elective_codes)
        ]
        has_elective_enrollments = len(elective_enrollments) > 0

        # Get elective courses for this semester
        elective_courses = []
        if has_elective_enrollments:
            # Dict of {course_code: unique-student-count}, busiest first.
            elective_courses = (
                elective_enrollments.groupby("kode_mk")["kode_mhs"]
                .nunique()
                .sort_values(ascending=False)
                .to_dict()
            )

        return SemesterDataStatus(
            year=year,
            semester=semester,
            has_offerings=has_offerings,
            has_enrollments=has_enrollments,
            has_elective_enrollments=has_elective_enrollments,
            total_enrollments=len(semester_enrollments),
            elective_enrollments=len(elective_enrollments),
            elective_courses=elective_courses,
        )

    def get_available_semesters_for_backtesting(self) -> List[Tuple[int, int]]:
        """
        Get list of semesters that have elective enrollment data (suitable for backtesting).

        Returns:
            List of (year, semester) tuples
        """
        # Lazily load the workbook if validate_all() was not run first.
        if not hasattr(self, "raw_data"):
            self.raw_data = self._load_raw_data()

        students = self.raw_data["students"]
        elective_codes = self._get_elective_codes()

        # Filter to elective enrollments only
        elective_students = students[students["kode_mk"].isin(elective_codes)]

        # Get unique year-semester combinations
        available = (
            elective_students.groupby(["thn", "smt"]).size().reset_index(name="count")
        )
        available = available[available["count"] > 0]

        semesters = [
            (int(row["thn"]), int(row["smt"])) for _, row in available.iterrows()
        ]
        semesters.sort(reverse=True)  # Most recent first

        return semesters

    def print_validation_summary(self):
        """Print a summary of validation results, grouped by severity."""
        if not self.validation_results:
            print("\nWARNING: No validation has been run yet.")
            return

        print("\n" + "=" * 80)
        print("DATA VALIDATION SUMMARY")
        print("=" * 80)

        errors = [r for r in self.validation_results if r.severity == "ERROR"]
        warnings = [r for r in self.validation_results if r.severity == "WARNING"]
        info = [r for r in self.validation_results if r.severity == "INFO"]

        if errors:
            print(f"\nERROR ({len(errors)}):")
            for result in errors:
                print(f" - {result.message}")

        if warnings:
            print(f"\nWARNING ({len(warnings)}):")
            for result in warnings:
                print(f" - {result.message}")

        if info:
            print(f"\nINFO ({len(info)}):")
            for result in info:
                print(f" - {result.message}")

        print("\n" + "=" * 80)
        # Overall verdict mirrors validate_all(): only ERRORs fail.
        if not errors:
            print("VALIDATION PASSED - Data is ready for processing")
        else:
            print("VALIDATION FAILED - Please fix errors before proceeding")
        print("=" * 80)

    def _load_raw_data(self) -> Dict[str, pd.DataFrame]:
        """Load raw data from Excel file.

        Note: sheet names are hard-coded here (unlike DataProcessor,
        which reads them from config) — keep them in sync.
        """
        logger.info(f"Loading data from {self.file_path}...")

        return {
            "courses": pd.read_excel(self.file_path, sheet_name="tabel1_data_matkul"),
            "offerings": pd.read_excel(
                self.file_path, sheet_name="tabel2_data_matkul_dibuka"
            ),
            "population": pd.read_excel(
                self.file_path, sheet_name="tabel3_data_mahasiswa_per_tahun"
            ),
            "students": pd.read_excel(
                self.file_path, sheet_name="tabel4_data_individu_mahasiswa"
            ),
        }

    def _validate_file_structure(self):
        """Validate that all required sheets and columns exist."""
        required_sheets = {
            "courses": ["kode_mk", "nama_mk", "kategori_mk"],
            "offerings": ["kode_mk", "tahun", "semester"],
            "students": ["kode_mk", "kode_mhs", "thn", "smt"],
            "population": ["jumlah_aktif"],  # tahun_ajaran and semester may vary
        }

        for sheet_name, required_cols in required_sheets.items():
            df = self.raw_data.get(sheet_name)
            if df is None:
                self.validation_results.append(
                    ValidationResult(
                        passed=False,
                        message=f"Sheet '{sheet_name}' not found",
                        severity="ERROR",
                    )
                )
                continue

            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                self.validation_results.append(
                    ValidationResult(
                        passed=False,
                        message=f"Missing columns in {sheet_name}: {missing_cols}",
                        severity="ERROR",
                    )
                )
            else:
                self.validation_results.append(
                    ValidationResult(
                        passed=True,
                        message=f"Sheet '{sheet_name}' has all required columns",
                        severity="INFO",
                    )
                )

    def _validate_course_catalog(self):
        """Validate course catalog (tabel1): duplicates and category values."""
        courses = self.raw_data["courses"]

        # Check for duplicates (by course code; cleaning removes them later)
        total_records = len(courses)
        unique_courses = courses["kode_mk"].nunique()
        duplicate_count = total_records - unique_courses

        if duplicate_count > 0:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"Course catalog has {duplicate_count:,} duplicate records (will be cleaned)",
                    severity="WARNING",
                    details={"total": total_records, "unique": unique_courses},
                )
            )

        # Check for category consistency (raw values, before normalization)
        categories = courses["kategori_mk"].unique()
        non_standard = [c for c in categories if c not in ["W", "P"]]
        if non_standard:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"Non-standard categories found: {non_standard} (will be normalized)",
                    severity="WARNING",
                )
            )

    def _validate_elective_courses(self):
        """Validate elective course identification (kategori_mk == 'P')."""
        courses = self.raw_data["courses"]

        # Clean and identify electives (mirrors _get_elective_codes)
        courses_clean = courses.drop_duplicates(subset="kode_mk").copy()
        courses_clean["kategori_mk"] = (
            courses_clean["kategori_mk"].astype(str).str.upper().str.strip()
        )

        electives = courses_clean[courses_clean["kategori_mk"] == "P"]
        elective_count = len(electives)

        if elective_count == 0:
            # No electives means the prediction pipeline has nothing to do.
            self.validation_results.append(
                ValidationResult(
                    passed=False,
                    message="No elective courses found (kategori_mk = 'P')",
                    severity="ERROR",
                )
            )
        else:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"Found {elective_count} elective courses",
                    severity="INFO",
                    details={"electives": electives["kode_mk"].tolist()},
                )
            )

    def _validate_enrollment_data(self):
        """Validate student enrollment data (tabel4)."""
        students = self.raw_data["students"]

        # Check for missing critical data
        critical_fields = ["kode_mk", "kode_mhs", "thn", "smt"]
        missing_data = students[critical_fields].isnull().any(axis=1).sum()

        if missing_data > 0:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"{missing_data} enrollment records have missing data (will be cleaned)",
                    severity="WARNING",
                )
            )

        # Check for duplicates (same student/course/semester)
        duplicate_enrollments = students.duplicated(
            subset=["kode_mhs", "kode_mk", "thn", "smt"]
        ).sum()

        if duplicate_enrollments > 0:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"{duplicate_enrollments:,} duplicate enrollment records (will be cleaned)",
                    severity="WARNING",
                )
            )

        # Check year range (informational only)
        min_year = students["thn"].min()
        max_year = students["thn"].max()

        self.validation_results.append(
            ValidationResult(
                passed=True,
                message=f"Enrollment data spans {int(min_year)} to {int(max_year)}",
                severity="INFO",
            )
        )

    def _validate_population_data(self):
        """Validate yearly population data (tabel3)."""
        population = self.raw_data["population"]

        if len(population) == 0:
            self.validation_results.append(
                ValidationResult(
                    passed=False,
                    message="No population data found",
                    severity="ERROR",
                )
            )
            return

        # Check for required fields (note: actual columns are tahun_ajaran/semester, not in sheet_name definition)
        if "jumlah_aktif" in population.columns:
            min_pop = population["jumlah_aktif"].min()
            max_pop = population["jumlah_aktif"].max()

            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"Population data: {len(population)} records, range {int(min_pop)}-{int(max_pop)} students",
                    severity="INFO",
                )
            )
        else:
            self.validation_results.append(
                ValidationResult(
                    passed=False,
                    message="Population data missing 'jumlah_aktif' column",
                    severity="ERROR",
                )
            )

    def _get_elective_codes(self) -> set:
        """Get set of elective course codes (deduped, category normalized)."""
        courses = self.raw_data["courses"]
        courses_clean = courses.drop_duplicates(subset="kode_mk").copy()
        courses_clean["kategori_mk"] = (
            courses_clean["kategori_mk"].astype(str).str.upper().str.strip()
        )
        return set(courses_clean[courses_clean["kategori_mk"] == "P"]["kode_mk"])
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
if __name__ == "__main__":
    # Example usage: run the full validation suite against the research
    # workbook, then report per-semester availability and backtest options.
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    validator = DataValidator(
        "data/Data Perkuliahan Mahasiswa untuk Penelitian (8 Oktober 2025).xlsx"
    )

    # Run validation
    passed, results = validator.validate_all()
    validator.print_validation_summary()

    # Check specific semesters
    print("\n" + "=" * 80)
    print("SEMESTER DATA AVAILABILITY")
    print("=" * 80)

    for year, semester in [(2024, 2), (2025, 1)]:
        status = validator.check_semester_data_availability(year, semester)
        print(f"\n{year} Semester {semester}:")
        print(f" Offerings: {'Yes' if status.has_offerings else 'No'}")
        print(
            f" Enrollments: {'Yes' if status.has_enrollments else 'No'} ({status.total_enrollments} records)"
        )
        print(
            f" Elective Enrollments: {'Yes' if status.has_elective_enrollments else 'No'} ({status.elective_enrollments} records)"
        )
        if status.elective_courses:
            # elective_courses is a {code: count} dict here (see
            # check_semester_data_availability); show the top 5.
            print(f" Elective courses: {len(status.elective_courses)}")
            for code, count in list(status.elective_courses.items())[:5]:
                print(f" - {code}: {count} students")

    # Show available semesters for backtesting
    print("\n" + "=" * 80)
    print("SEMESTERS AVAILABLE FOR BACKTESTING")
    print("=" * 80)
    available = validator.get_available_semesters_for_backtesting()
    if available:
        print(f"\nFound {len(available)} semesters with elective enrollment data:")
        for year, sem in available:
            print(f" • {year} Semester {sem}")
    else:
        print("\nERROR: No semesters with elective enrollment data found!")
|
evaluator.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import matplotlib.pyplot as plt
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import seaborn as sns
|
| 9 |
+
from sklearn.metrics import mean_absolute_error, mean_squared_error
|
| 10 |
+
|
| 11 |
+
from config import Config
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Evaluator:
    """Backtesting and reporting utilities for enrollment predictors."""

    def __init__(self, config: Config):
        self.config = config

    def run_backtest(self, full_data: pd.DataFrame, predictor):
        """Simulate past semesters to check accuracy.

        Walks every (year, semester) pair in the configured backtest window,
        uses only strictly earlier semesters as training history, and records
        one prediction per course row found in the held-out semester.
        """
        logger.info("Starting Backtest...")

        first_year: int = self.config.backtest.START_YEAR
        last_year: int = self.config.backtest.END_YEAR

        records = []
        for yr in range(first_year, last_year + 1):
            for sem in (1, 2):
                held_out = full_data[
                    (full_data["thn"] == yr) & (full_data["smt"] == sem)
                ]
                if held_out.empty:
                    continue

                # History = every row strictly before the target semester.
                earlier_years = full_data["thn"] < yr
                earlier_sems = (full_data["thn"] == yr) & (full_data["smt"] < sem)
                history = full_data[earlier_years | earlier_sems]

                try:
                    population = predictor.get_student_forecast(yr, sem)
                except Exception:
                    # Population model unavailable: use the observed mean
                    # active-student count for the semester instead.
                    population = held_out["jumlah_aktif"].mean()

                for _, course_row in held_out.iterrows():
                    outcome = predictor.predict_course(
                        course_row["kode_mk"], history, yr, sem, population
                    )

                    records.append(
                        {
                            "year": yr,
                            "semester": sem,
                            "kode_mk": course_row["kode_mk"],
                            "actual": course_row["enrollment"],
                            "predicted": outcome["val"],
                            "strategy": outcome["strategy"],
                            "error": abs(course_row["enrollment"] - outcome["val"]),
                        }
                    )

        return pd.DataFrame(records)

    def generate_metrics(self, results: pd.DataFrame):
        """Calculate and log performance metrics."""
        # NOTE: (re)writes the "error" column on the caller's frame in place,
        # matching the established behavior of this method.
        results["error"] = abs(results["predicted"] - results["actual"])

        mae = mean_absolute_error(results["actual"], results["predicted"])
        rmse = np.sqrt(mean_squared_error(results["actual"], results["predicted"]))

        separator = "=" * 40
        logger.info("\n" + separator)
        logger.info("BACKTEST METRICS")
        logger.info(separator)
        logger.info(f"Overall MAE: {mae:.2f}")
        logger.info(f"Overall RMSE: {rmse:.2f}")

        logger.info("\nPerformance by Strategy:")
        per_strategy = results.groupby("strategy")["error"].mean()
        logger.info(per_strategy.to_string())

        self._plot_results(results)

        return {
            'mae': mae,
            'rmse': rmse
        }

    def _plot_results(self, df):
        """Generate simple Actual vs Predicted scatter plot."""
        out_dir = self.config.output.OUTPUT_DIR
        Path(out_dir).mkdir(parents=True, exist_ok=True)

        plt.figure(figsize=(10, 6))
        sns.scatterplot(
            data=df, x="actual", y="predicted", hue="strategy", style="strategy"
        )

        # Diagonal reference line: points on it are perfect predictions.
        axis_limit = max(df["actual"].max(), df["predicted"].max())
        plt.plot([0, axis_limit], [0, axis_limit], "r--", alpha=0.5)

        plt.title("Actual vs Predicted Enrollment")
        plt.savefig(f"{out_dir}/backtest_scatter.png")
        plt.close()
|
prophet_predictor.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from prophet import Prophet
|
| 7 |
+
|
| 8 |
+
from config import Config
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ProphetPredictor:
    """Per-course enrollment forecasting built on Prophet.

    One Prophet model forecasts the campus-wide active-student population;
    that forecast then feeds each course-level logistic-growth model as an
    external regressor.
    """

    def __init__(self, config: Config):
        self.config = config
        # Trained lazily by train_student_population_model().
        self.student_model: Optional[Prophet] = None

    def train_student_population_model(self, population_data: pd.DataFrame):
        """Fit the active-student population model.

        Args:
            population_data: One row per (thn, smt) carrying a
                ``jumlah_aktif`` active-student count.
        """
        df = population_data.copy()
        # Map (year, semester) to a calendar date so Prophet gets a ``ds``.
        df["ds"] = pd.to_datetime(
            df["thn"].astype(str)
            + "-"
            + df["smt"].map(self.config.prediction.SEMESTER_TO_MONTH)
        )
        df["y"] = df["jumlah_aktif"]

        self.student_model = Prophet(daily_seasonality=False, weekly_seasonality=False)  # type: ignore[arg-type]
        self.student_model.fit(df)
        logger.info("Student population model trained.")

    def get_student_forecast(self, year: int, semester: int) -> float:
        """Forecast the active-student population for one semester.

        Returns:
            Predicted head count, floored at 100 so downstream regressors
            never see an implausibly small population.
        """
        assert self.student_model is not None, "Student model must be trained first"
        target_date = pd.to_datetime(
            f"{year}-{self.config.prediction.SEMESTER_TO_MONTH[semester]}"
        )
        future = pd.DataFrame({"ds": [target_date]})
        forecast = self.student_model.predict(future)
        return max(forecast["yhat"].values[0], 100)

    def predict_course(
        self,
        course_code: str,
        df_history: pd.DataFrame,
        target_year: int,
        target_smt: int,
        student_pop: float,
    ) -> dict:
        """Predict enrollment for one course in one target semester.

        Returns:
            Dict with ``val`` (predicted enrollment), ``strategy`` (which
            estimator produced it) and ``confidence`` (low/medium/high).
        """
        # Only same-semester history is comparable (odd vs even semesters
        # carry different course offerings).
        hist = df_history[
            (df_history["kode_mk"] == course_code) &
            (df_history["smt"] == target_smt)
        ].sort_values(["thn", "smt"])

        if len(hist) == 0:
            # No history at all: cold-start default from config.
            return {
                "val": self.config.model.FALLBACK_DEFAULT,
                "strategy": "cold_start",
                "confidence": "low",
            }

        return self._predict_prophet_logistic(
            hist, target_year, target_smt, student_pop
        )

    def _predict_prophet_logistic(
        self, hist: pd.DataFrame, year: int, smt: int, pop: float
    ) -> dict:
        """Fit a logistic-growth Prophet model on one course's history.

        Falls back to the historical mean (too little data, Prophet failure)
        or a trend-based estimate (unrealistic Prophet output).
        """
        df = hist.copy()
        df["ds"] = pd.to_datetime(
            df["thn"].astype(int).astype(str)
            + "-"
            + df["smt"].astype(int).map(self.config.prediction.SEMESTER_TO_MONTH)
        )
        df["y"] = df["enrollment"]

        # Prophet needs at least two observations to fit anything.
        if df["y"].notna().sum() < 2:
            return {
                "val": hist["enrollment"].mean(),
                "strategy": "fallback_mean",
                "confidence": "medium",
            }

        if "jumlah_aktif" not in df.columns:
            logger.warning(
                "jumlah_aktif column missing from historical data - cannot use regressor"
            )
            return {
                "val": hist["enrollment"].mean(),
                "strategy": "fallback_mean",
                "confidence": "low",
            }

        hist_max = df["y"].max()
        hist_mean = df["y"].mean()

        # Logistic cap: a multiple of the historical peak, bounded by the
        # absolute ceiling from config.
        cap_value = min(
            hist_max * self.config.prediction.MAX_CAPACITY_MULTIPLIER,
            self.config.prediction.ABSOLUTE_MAX_STUDENTS,
        )

        df["cap"] = cap_value
        df["floor"] = 0

        try:
            m = Prophet(
                growth=self.config.model.GROWTH_MODE,
                changepoint_prior_scale=self.config.model.CHANGEPOINT_SCALE,
                seasonality_mode=self.config.model.SEASONALITY_MODE,
                daily_seasonality=False,  # type: ignore[arg-type]
                weekly_seasonality=False,  # type: ignore[arg-type]
            )

            m.add_regressor("jumlah_aktif", mode="multiplicative")
            m.fit(df[["ds", "y", "cap", "floor", "jumlah_aktif"]])

            future_date = pd.to_datetime(
                f"{year}-{self.config.prediction.SEMESTER_TO_MONTH[smt]}"
            )

            future = pd.DataFrame(
                {
                    "ds": [future_date],
                    "cap": [cap_value],
                    "floor": [0],
                    "jumlah_aktif": [pop],
                }
            )

            forecast = m.predict(future)
            raw_pred = forecast["yhat"].values[0]

            if (
                raw_pred < 0
                or not np.isfinite(raw_pred)
                or raw_pred > hist_max * 5
                or raw_pred > cap_value * 2
            ):
                logger.warning(
                    f"Prophet prediction ({raw_pred:.1f}) unrealistic. "
                    f"Using trend-based fallback. (hist_max={hist_max}, cap={cap_value})"
                )
                # FIX: guard the population baseline. A zero mean previously
                # produced an inf ratio (silently clamped to 1.3) and an
                # all-NaN column propagated NaN into the prediction; a
                # degenerate baseline now yields a neutral growth factor.
                pop_baseline = df["jumlah_aktif"].mean()
                if pop_baseline and np.isfinite(pop_baseline):
                    pop_growth_factor = pop / pop_baseline
                else:
                    pop_growth_factor = 1.0
                # Clamp population-driven growth to a sane band.
                growth_factor = min(max(pop_growth_factor, 0.8), 1.3)

                if len(df) >= 3:
                    # Recent average scaled by population growth.
                    pred = df["y"].tail(3).mean() * growth_factor
                else:
                    pred = hist_mean * growth_factor

                pred = min(max(pred, 0), cap_value)

                return {
                    "val": pred,
                    "strategy": "trend_fallback",
                    "confidence": "medium",
                }

            pred = min(max(0, raw_pred), cap_value)

            return {
                "val": pred,
                "strategy": "prophet_logistic",
                "confidence": "high",
            }

        except Exception as e:
            logger.warning(f"Prophet failed for course. Error: {e}. Using fallback.")
            return {
                "val": hist["enrollment"].mean(),
                "strategy": "fallback_mean",
                "confidence": "medium",
            }

    def generate_batch_predictions(
        self,
        full_data: pd.DataFrame,
        course_metadata: pd.DataFrame,
        electives: set,
        year: int,
        smt: int,
    ):
        """Generate predictions for all courses."""
        student_pop = self.get_student_forecast(year, smt)
        results = []

        logger.info(
            f"Predicting for {len(electives)} courses (Pop: {int(student_pop)})..."
        )

        for code in electives:
            meta_rows = course_metadata[course_metadata["kode_mk"] == code]
            # FIX: .iloc[0] previously raised IndexError for an elective with
            # no metadata row; skip such courses with a warning instead.
            if meta_rows.empty:
                logger.warning(f"No metadata found for course {code}; skipping.")
                continue
            meta = meta_rows.iloc[0]

            pred_result = self.predict_course(code, full_data, year, smt, student_pop)
            pred_val = pred_result["val"]

            # Quota = prediction plus safety buffer, never below the minimum
            # quota required to open a class.
            rec_quota = int(
                np.ceil(pred_val * (1 + self.config.prediction.BUFFER_PERCENT))
            )
            rec_quota = max(rec_quota, self.config.prediction.MIN_QUOTA_OPEN)

            status = (
                "BUKA"
                if pred_val >= self.config.prediction.MIN_PREDICT_THRESHOLD
                else "TUTUP"
            )

            results.append(
                {
                    "kode_mk": code,
                    "nama_mk": meta["nama_mk"],
                    "sks": meta["sks_mk"],
                    "predicted_enrollment": round(pred_val, 1),
                    "recommended_quota": rec_quota if status == "BUKA" else 0,
                    "recommendation": status,
                    "strategy": pred_result["strategy"],
                    "confidence": pred_result["confidence"],
                    # Assumes a standard class size of 40 students.
                    "classes_est": int(np.ceil(rec_quota / 40))
                    if status == "BUKA"
                    else 0,
                }
            )

        # FIX: sort_values on an empty, column-less frame raised KeyError;
        # return a well-formed empty result instead.
        if not results:
            return pd.DataFrame(
                columns=[
                    "kode_mk",
                    "nama_mk",
                    "sks",
                    "predicted_enrollment",
                    "recommended_quota",
                    "recommendation",
                    "strategy",
                    "confidence",
                    "classes_est",
                ]
            )

        return pd.DataFrame(results).sort_values(
            "predicted_enrollment", ascending=False
        )

    def predict_course_enrollment(
        self,
        course_code: str,
        train_data: pd.DataFrame,
        test_year: int,
        test_semester: int,
        test_student_count: float,
    ) -> tuple[float, str]:
        """Backtesting adapter around predict_course.

        Returns:
            (prediction, strategy) with the prediction sanitized to a finite,
            non-negative float.
        """
        result = self.predict_course(
            course_code=course_code,
            df_history=train_data,
            target_year=test_year,
            target_smt=test_semester,
            student_pop=test_student_count,
        )

        val = float(result["val"])
        if not np.isfinite(val):
            # NaN/inf predictions are replaced by the configured default.
            val = self.config.model.FALLBACK_DEFAULT
            strategy = "fallback_default"
        else:
            val = max(0.0, val)
            strategy = result["strategy"]

        return val, strategy
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas>=2.0.0
|
| 2 |
+
numpy>=1.24.0
|
| 3 |
+
openpyxl>=3.1.0
|
| 4 |
+
prophet>=1.1.5
|
| 5 |
+
scikit-learn>=1.3.0
|
| 6 |
+
matplotlib>=3.7.0
|
| 7 |
+
seaborn>=0.12.0
|
| 8 |
+
gradio>=4.0.0
|
| 9 |
+
python-dateutil>=2.8.2
|
| 10 |
+
huggingface_hub>=0.20.0
|
utils.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def setup_logging(level: str):
|
| 9 |
+
logging.basicConfig(
|
| 10 |
+
level=level,
|
| 11 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| 12 |
+
datefmt="%H:%M:%S",
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
logging.getLogger("prophet").setLevel(logging.WARNING)
|
| 16 |
+
logging.getLogger("cmdstanpy").setLevel(logging.WARNING)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def save_excel(df: pd.DataFrame, filename: str, output_dir: str):
    """Write *df* as an Excel file under *output_dir*, creating it if needed.

    Args:
        df: Frame to export (index is not written).
        filename: Target file name within the output directory.
        output_dir: Directory that will hold the file.
    """
    Path(output_dir).mkdir(exist_ok=True, parents=True)
    target = os.path.join(output_dir, filename)
    df.to_excel(target, index=False)
    logging.info(f"Results saved to: {target}")