muhalwan commited on
Commit
48b5cb1
·
0 Parent(s):

awal bukanlah akhir

Browse files
Files changed (10) hide show
  1. .gitignore +13 -0
  2. app.py +335 -0
  3. config.py +170 -0
  4. data_loader.py +88 -0
  5. data_processor.py +350 -0
  6. data_validator.py +467 -0
  7. evaluator.py +103 -0
  8. prophet_predictor.py +253 -0
  9. requirements.txt +10 -0
  10. utils.py +23 -0
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ article/
3
+ docs/
4
+ output/
5
+ validation_output/
6
+ venv/
7
+ backtesting.py
8
+ data_exploration.ipynb
9
+ main.py
10
+ optimize_data.py
11
+ WORKFLOW.md
12
+ data/
13
+ hf_cache/
app.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ from typing import Optional, Tuple
9
+
10
+ from config import Config
11
+ from data_processor import DataProcessor
12
+ from evaluator import Evaluator
13
+ from prophet_predictor import ProphetPredictor
14
+ from utils import setup_logging
15
+
16
# Configure logging once at import time so every module logger inherits it.
setup_logging("INFO")
logger = logging.getLogger("GradioApp")

# Module-level singletons populated by initialize_system() at startup.
# They are globals because Gradio callbacks share them across requests.
_processor: Optional[DataProcessor] = None
_predictor: Optional[ProphetPredictor] = None
_config: Optional[Config] = None
_df_enrollment: Optional[pd.DataFrame] = None
_elective_codes: Optional[set] = None
_backtest_metrics: Optional[dict] = None  # lazily computed, then cached
25
+
26
+
27
def initialize_system():
    """Build the shared processor/predictor singletons.

    Runs once when the module is imported. Returns True on success,
    False when any part of the setup fails (the failure is logged with
    a traceback).
    """
    global _processor, _predictor, _config, _df_enrollment, _elective_codes, _backtest_metrics

    try:
        logger.info("Initializing prediction system...")

        # Configuration first: every other component depends on it.
        _config = Config()

        # Load, clean and aggregate the enrollment data.
        _processor = DataProcessor(_config)
        _df_enrollment, _elective_codes = _processor.load_and_process()

        # Fit the population model used by the course-level predictions.
        _predictor = ProphetPredictor(_config)
        _predictor.train_student_population_model(_processor.raw_data["students_yearly"])

        logger.info("✓ System initialized successfully")
        return True
    except Exception as e:
        logger.error(f"Failed to initialize system: {e}", exc_info=True)
        return False
48
+
49
+
50
def generate_predictions(year: int, semester: int) -> Tuple[str, Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """
    Generate enrollment predictions for a given year and semester.

    Args:
        year: Target year (e.g., 2025)
        semester: Target semester (1 = Ganjil/Odd, 2 = Genap/Even)

    Returns:
        Tuple of (summary_text, recommendations_df, all_predictions_df).
        On validation or runtime errors the summary carries the error
        message and both DataFrames are None.
    """
    global _processor, _predictor, _config, _df_enrollment, _elective_codes, _backtest_metrics

    try:
        if semester not in [1, 2]:
            return "❌ Error: Semester must be 1 (Ganjil) or 2 (Genap)", None, None

        if year < 2020 or year > 2030:
            return "❌ Error: Year must be between 2020 and 2030", None, None

        if _config is None or _predictor is None or _processor is None or _df_enrollment is None or _elective_codes is None:
            return "❌ Error: System not initialized. Please restart the app.", None, None

        logger.info(f"Generating predictions for {year} Semester {semester}...")

        _config.prediction.PREDICT_YEAR = year
        _config.prediction.PREDICT_SEMESTER = semester

        # The backtest is expensive, so run it once and cache the metrics
        # for all subsequent requests.
        if _backtest_metrics is None:
            logger.info("Running backtest for the first time...")
            evaluator = Evaluator(_config)
            backtest_results = evaluator.run_backtest(_df_enrollment, _predictor)

            if backtest_results is None or len(backtest_results) == 0:
                logger.warning("Backtest returned no results, using defaults")
                _backtest_metrics = {'mae': 0, 'rmse': 0}
            else:
                _backtest_metrics = evaluator.generate_metrics(backtest_results)
                if _backtest_metrics is None:
                    logger.warning("Metrics calculation failed, using defaults")
                    _backtest_metrics = {'mae': 0, 'rmse': 0}
        else:
            logger.info("Using cached backtest metrics")

        metrics = _backtest_metrics

        predictions = _predictor.generate_batch_predictions(
            _df_enrollment,
            _processor.raw_data["courses"],
            _elective_codes,
            year,
            semester,
        )

        recommended = predictions[predictions["recommendation"] == "BUKA"].copy()

        semester_name = "Ganjil (Odd)" if semester == 1 else "Genap (Even)"
        summary = f"""
## 📊 Prediction Summary for {year} Semester {semester_name}

### Model Performance (Backtest)
- **Mean Absolute Error (MAE)**: {metrics['mae']:.2f} students
- **Root Mean Squared Error (RMSE)**: {metrics['rmse']:.2f} students

### Recommendations
- **Courses to Open**: {len(recommended)}
- **Total Seats Needed**: {int(recommended['recommended_quota'].sum()) if not recommended.empty else 0}
- **Estimated Students**: {int(recommended['predicted_enrollment'].sum()) if not recommended.empty else 0}

### Top Course
"""

        if not recommended.empty:
            # BUGFIX: the "top course" must be the one with the highest
            # predicted enrollment. The previous `iloc[0]` relied on the
            # (unspecified) row order returned by the predictor, while the
            # tables shown below are explicitly sorted by prediction.
            top_course = recommended.loc[recommended["predicted_enrollment"].idxmax()]
            summary += f"- **{top_course['nama_mk']}** ({top_course['kode_mk']})\n"
            summary += f"  - Predicted: {top_course['predicted_enrollment']:.0f} students\n"
            summary += f"  - Recommended Quota: {top_course['recommended_quota']:.0f} seats"
        else:
            summary += "- No courses recommended to open"

        if not recommended.empty:
            recommended_display = recommended[[
                'kode_mk', 'nama_mk', 'predicted_enrollment',
                'recommended_quota', 'strategy'
            ]].copy()
            recommended_display.columns = [
                'Course Code', 'Course Name', 'Predicted Students',
                'Recommended Quota', 'Prediction Strategy'
            ]
            recommended_display['Predicted Students'] = recommended_display['Predicted Students'].round(1)
            recommended_display['Recommended Quota'] = recommended_display['Recommended Quota'].astype(int)
            recommended_display = recommended_display.sort_values('Predicted Students', ascending=False)
        else:
            recommended_display = pd.DataFrame()

        # All predictions
        all_predictions_display = predictions[[
            'kode_mk', 'nama_mk', 'predicted_enrollment',
            'recommended_quota', 'recommendation', 'strategy'
        ]].copy()
        all_predictions_display.columns = [
            'Course Code', 'Course Name', 'Predicted Students',
            'Recommended Quota', 'Recommendation', 'Strategy'
        ]
        all_predictions_display['Predicted Students'] = all_predictions_display['Predicted Students'].round(1)
        all_predictions_display['Recommended Quota'] = all_predictions_display['Recommended Quota'].astype(int)
        all_predictions_display = all_predictions_display.sort_values('Predicted Students', ascending=False)

        # Plain string: there is nothing to interpolate here.
        logger.info("✓ Predictions generated successfully")
        return summary, recommended_display, all_predictions_display

    except Exception as e:
        error_msg = f"❌ Error generating predictions: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return error_msg, None, None
165
+
166
+
167
def get_data_info() -> str:
    """Return a markdown report describing the loaded dataset."""
    global _processor, _config

    try:
        if _processor is None or _config is None:
            return "❌ System not initialized"

        catalog = _processor.raw_data.get("courses")
        population = _processor.raw_data.get("students_yearly")

        if catalog is None or population is None:
            return "❌ Data not loaded"

        # Electives are the rows flagged with kategori_mk == "P".
        electives = catalog[catalog["kategori_mk"] == "P"]
        n_total, n_elective = len(catalog), len(electives)

        return f"""
## 📁 Dataset Information

### Course Catalog
- **Total Courses**: {n_total}
- **Elective Courses**: {n_elective}
- **Mandatory Courses**: {n_total - n_elective}

### Student Population
- **Years Available**: {population['thn'].min()} - {population['thn'].max()}
- **Total Records**: {len(population)}

### Data Source
- File: `{_config.data.FILE_PATH}`
- Last Updated: October 8, 2025
"""

    except Exception as e:
        return f"❌ Error getting data info: {str(e)}"
204
+
205
+
206
# ---------------------------------------------------------------------------
# Startup + UI definition (module level so HF Spaces can import `demo`)
# ---------------------------------------------------------------------------
logger.info("Starting Gradio app...")
init_success = initialize_system()

if not init_success:
    logger.error("Failed to initialize system. App may not work correctly.")

# Single source of truth for the demo-mode flag (drives banner and footer).
_demo_mode = os.getenv("DEMO_MODE", "false").lower() == "true"

_DEMO_BANNER = """
<div style='padding: 15px; background-color: #fff3cd; border-left: 5px solid #ffc107; margin-bottom: 20px;'>
<h3 style='margin-top: 0; color: #856404;'>⚠️ Demo Version - Anonymized Data</h3>
<p style='margin-bottom: 0; color: #856404;'>
This demonstration uses <strong>anonymized enrollment data</strong> to protect student privacy.
All predictions and functionality are identical to the production version.
</p>
<details style='margin-top: 10px;'>
<summary style='cursor: pointer; color: #856404;'><strong>Changes made for demo:</strong></summary>
<ul style='color: #856404;'>
<li>Student IDs replaced with anonymous codes (STU000001, STU000002, ...)</li>
<li>Population counts have ±3% random noise added</li>
<li>Course information and enrollment patterns fully preserved</li>
</ul>
</details>
</div>
"""

_DEMO_FOOTER = """
---
<div style='text-align: center; color: #666; font-size: 0.9em;'>
📊 Demo Version with Anonymized Data | For Educational Purposes
</div>
"""

_PRIVATE_FOOTER = """
---
<div style='text-align: center; color: #666; font-size: 0.9em;'>
🔒 Private & Confidential | For Authorized Use Only
</div>
"""

# Create Gradio Interface
with gr.Blocks(title="SKS Enrollment Predictor") as demo:

    # Disclaimer banner shown only for the anonymized demo build.
    if _demo_mode:
        gr.Markdown(_DEMO_BANNER, sanitize_html=False)

    with gr.Tabs():
        with gr.Tab("Generate Predictions"):

            with gr.Row():
                with gr.Column(scale=1):
                    year_input = gr.Number(
                        label="Target Year",
                        value=2025,
                        precision=0,
                        minimum=2020,
                        maximum=2030,
                        info="Masukkan tahun yang ingin diprediksi"
                    )

                    semester_input = gr.Radio(
                        choices=[1, 2],
                        label="Semester",
                        value=2,
                        info="1 = Ganjil, 2 = Genap"
                    )

                    predict_btn = gr.Button(
                        "Generate Predictions",
                        variant="primary",
                        size="lg"
                    )

                with gr.Column(scale=2):
                    summary_output = gr.Markdown(
                        label="Summary",
                        value="Click 'Generate Predictions' to start"
                    )

            gr.Markdown("### Recommended Courses to Open")
            recommended_output = gr.Dataframe(
                label="Courses Recommended to Open",
                wrap=True,
                interactive=False
            )

            with gr.Accordion("View All Predictions", open=False):
                all_predictions_output = gr.Dataframe(
                    label="All Elective Courses",
                    wrap=True,
                    interactive=False
                )

        with gr.Tab("Data Information"):
            gr.Markdown()

            data_info_btn = gr.Button("Refresh Data Info", variant="secondary")
            data_info_output = gr.Markdown()

            data_info_btn.click(
                fn=get_data_info,
                inputs=[],
                outputs=data_info_output
            )

            # Populate the tab automatically when the app loads.
            demo.load(fn=get_data_info, inputs=[], outputs=data_info_output)

    predict_btn.click(
        fn=generate_predictions,
        inputs=[year_input, semester_input],
        outputs=[summary_output, recommended_output, all_predictions_output]
    )

    # Footer depends on deployment flavour.
    gr.Markdown(_DEMO_FOOTER if _demo_mode else _PRIVATE_FOOTER)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
config.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from typing import Dict, List
3
+ import os
4
+
5
# Optional dependency: data_loader handles the private HF dataset.
# When it is missing we fall back to the bundled local workbook.
try:
    from data_loader import load_data_file
except ImportError:
    DATA_LOADER_AVAILABLE = False

    def load_data_file() -> str:
        """Fallback if data_loader not available."""
        return "data/optimized_data.xlsx"
else:
    DATA_LOADER_AVAILABLE = True
14
+
15
+
16
def _get_data_file_path() -> str:
    """
    Resolve which Excel workbook to use for this run.

    Resolution order:
    1. HF_TOKEN present      -> private HF dataset (muhalwan/optimized_data_mhs)
    2. DEMO_MODE == "true"   -> anonymized demo workbook
    3. otherwise             -> local optimized_data.xlsx
    """
    if os.getenv("HF_TOKEN"):
        # data_loader downloads (and caches) the file from the Hub.
        return load_data_file()

    demo_requested = os.getenv("DEMO_MODE", "false").lower() == "true"
    return "data/demo_data.xlsx" if demo_requested else "data/optimized_data.xlsx"
31
+
32
+
33
@dataclass
class DataConfig:
    """Data source configuration and validation rules."""

    # Workbook path; resolved lazily from the environment (HF/demo/local).
    FILE_PATH: str = field(default_factory=_get_data_file_path)

    # Names of the worksheets inside the Excel file.
    SHEET_COURSES: str = "tabel1_data_matkul"
    SHEET_OFFERINGS: str = "tabel2_data_matkul_dibuka"
    SHEET_STUDENTS_YEARLY: str = "tabel3_data_mahasiswa_per_tahun"
    SHEET_STUDENTS_INDIVIDUAL: str = "tabel4_data_individu_mahasiswa"

    # Column renames that normalize the offerings sheet to the shared schema.
    OFFERINGS_RENAME: Dict[str, str] = field(
        default_factory=lambda: {"tahun": "thn", "semester": "smt"}
    )

    # Course category codes: electives are kategori_mk == 'P' in tabel1
    # (the only source of truth); mandatory courses are 'W'.
    ELECTIVE_CATEGORY: str = "P"
    MANDATORY_CATEGORY: str = "W"

    # Accepted category values after uppercasing/stripping.
    VALID_CATEGORIES: List[str] = field(default_factory=lambda: ["P", "W"])
59
+
60
+
61
@dataclass
class ModelConfig:
    """Prophet model hyperparameters and prediction strategies."""

    # Prophet hyperparameters.
    GROWTH_MODE: str = "logistic"
    CHANGEPOINT_SCALE: float = 0.01
    SEASONALITY_MODE: str = "multiplicative"
    YEARLY_SEASONALITY: bool = True
    # Prediction used when a course has no usable model.
    FALLBACK_DEFAULT: int = 20

    # Safety limits: a forecast above historical-max * this multiplier is
    # treated as unrealistic.
    SANITY_CHECK_MAX_MULTIPLIER: float = 3.0
    # Fewer than this many historical points -> prediction is unreliable.
    MIN_HISTORY_POINTS: int = 3
77
+
78
+
79
@dataclass
class PredictionConfig:
    """Business logic for predictions."""

    # Default prediction target (overridden per request by the UI).
    PREDICT_YEAR: int = 2025
    PREDICT_SEMESTER: int = 2

    # Quota sizing: buffer on top of the prediction, floor for opening a
    # class, and the minimum prediction worth opening at all.
    BUFFER_PERCENT: float = 0.20
    MIN_QUOTA_OPEN: int = 25
    MIN_PREDICT_THRESHOLD: int = 15

    # Hard caps on recommended capacity.
    MAX_CAPACITY_MULTIPLIER: float = 2.0
    ABSOLUTE_MAX_STUDENTS: int = 400

    # Month-day anchor used to place each semester on a timeline
    # (1 = odd/Ganjil -> September, 2 = even/Genap -> March).
    SEMESTER_TO_MONTH: Dict[int, str] = field(
        default_factory=lambda: {
            1: "09-01",
            2: "03-01",
        }
    )
100
+
101
+
102
@dataclass
class OutputConfig:
    """Output settings."""

    # Directory where result files are written.
    OUTPUT_DIR: str = "output"
    # Default level for the application loggers.
    LOG_LEVEL: str = "INFO"
    # How many rows to show in console summaries.
    TOP_N_DISPLAY: int = 30
109
+
110
+
111
@dataclass
class BacktestConfig:
    """Backtest settings and validation."""

    # Year window the backtest walks over.
    START_YEAR: int = 2010
    END_YEAR: int = 2024
    VERBOSE: bool = True

    # A semester needs at least this many elective enrollments ...
    MIN_ELECTIVE_ENROLLMENTS: int = 1
    # ... and at least this many distinct courses to be backtestable.
    MIN_UNIQUE_COURSES: int = 1
123
+
124
+
125
class Config:
    """
    Master configuration object aggregating all sub-configs.

    ELECTIVE COURSE DEFINITION:
    ---------------------------
    Elective courses are identified by kategori_mk = 'P' in
    tabel1_data_matkul — the ONLY source of truth for course categories.
    Mandatory courses carry kategori_mk = 'W' (Wajib).

    Examples of elective courses (kategori_mk = 'P'):
    - EF234607: Keamanan Aplikasi
    - EF234613: Game Edukasi dan Simulasi
    - UG234922: Kebudayaan dan Kebangsaan
    - IW184301: Sistem Basis Data
    - KI series: Various computer science electives

    DATA REQUIREMENTS FOR BACKTESTING:
    -----------------------------------
    A semester can only be backtested when all of the following hold:
    1. Course catalog (tabel1) with kategori_mk properly set
    2. ACTUAL student enrollments (tabel4) for that semester
    3. At least one elective course with enrollments

    Note: course offerings (tabel2) alone are NOT sufficient — actual
    enrollment data (tabel4) is required to validate predictions.
    """

    def __init__(self):
        # One instance of each sub-config, with its defaults.
        self.data: DataConfig = DataConfig()
        self.model: ModelConfig = ModelConfig()
        self.prediction: PredictionConfig = PredictionConfig()
        self.output: OutputConfig = OutputConfig()
        self.backtest: BacktestConfig = BacktestConfig()

    def get_prediction_target_name(self) -> str:
        """Human-readable label of the configured prediction target."""
        semester_label = "Ganjil" if self.prediction.PREDICT_SEMESTER == 1 else "Genap"
        return f"{self.prediction.PREDICT_YEAR} Semester {semester_label}"

    def get_elective_filter_description(self) -> str:
        """Get human-readable description of elective identification."""
        return f"kategori_mk = '{self.data.ELECTIVE_CATEGORY}' in {self.data.SHEET_COURSES}"
168
+
169
+
170
# Shared module-level instance for callers that do not need a custom config.
default_config = Config()
data_loader.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ def load_data_file() -> str:
10
+ hf_token = os.getenv("HF_TOKEN")
11
+
12
+ if hf_token:
13
+ try:
14
+ from huggingface_hub import hf_hub_download
15
+
16
+ logger.info("🔐 Loading data from private Hugging Face dataset...")
17
+ logger.info(" Dataset: muhalwan/optimized_data_mhs")
18
+
19
+ file_path = hf_hub_download(
20
+ repo_id="muhalwan/optimized_data_mhs",
21
+ filename="optimized_data.xlsx",
22
+ repo_type="dataset",
23
+ token=hf_token,
24
+ cache_dir="./hf_cache"
25
+ )
26
+
27
+ logger.info(f"✓ Data loaded successfully from HF dataset")
28
+ logger.info(f" Cached at: {file_path}")
29
+ return file_path
30
+
31
+ except ImportError:
32
+ logger.error("huggingface_hub not installed. Install with: pip install huggingface_hub")
33
+ raise
34
+
35
+ except Exception as e:
36
+ logger.error(f"Failed to download from HF dataset: {e}")
37
+ logger.error("Falling back to local file if available...")
38
+
39
+ local_path = "data/optimized_data.xlsx"
40
+
41
+ if Path(local_path).exists():
42
+ logger.info(f"📁 Loading data from local file: {local_path}")
43
+ return local_path
44
+
45
+ error_msg = (
46
+ "No data file found!\n"
47
+ "Options:\n"
48
+ "1. Set HF_TOKEN environment variable to load from private dataset\n"
49
+ "2. Place optimized_data.xlsx in data/ folder for local development\n"
50
+ )
51
+ logger.error(error_msg)
52
+ raise FileNotFoundError(error_msg)
53
+
54
+
55
def get_data_source_info() -> dict:
    """Describe which data source load_data_file() would pick right now."""
    token_present = bool(os.getenv("HF_TOKEN"))
    has_local = Path("data/optimized_data.xlsx").exists()

    return {
        "hf_token_available": token_present,
        "local_file_available": has_local,
        # The HF dataset wins whenever a token is configured.
        "will_use_hf_dataset": token_present,
        "will_use_local": (not token_present) and has_local,
        "dataset_repo": "muhalwan/optimized_data_mhs" if token_present else None,
        "local_path": "data/optimized_data.xlsx" if has_local else None
    }
67
+
68
+
69
if __name__ == "__main__":
    # Small manual smoke test: report the data-source decision, then try
    # to actually resolve the data file.
    logging.basicConfig(level=logging.INFO)

    rule = "=" * 80
    print(rule)
    print("Data Source Information")
    print(rule)

    for key, value in get_data_source_info().items():
        print(f" {key}: {value}")

    print("\n" + rule)
    print("Attempting to load data...")
    print(rule)

    try:
        file_path = load_data_file()
        print(f"\n✓ Success! Data file: {file_path}")
    except Exception as e:
        print(f"\n✗ Failed: {e}")
data_processor.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Dict, Set, Tuple
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from config import Config
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class DataProcessor:
    """Loads the Excel workbook, cleans each sheet, and produces the
    aggregated elective-enrollment table consumed by the predictor."""

    def __init__(self, config: Config):
        self.config = config
        # Raw sheets keyed by logical name ("courses", "offerings", ...).
        self.raw_data: Dict[str, pd.DataFrame] = {}
        # Final aggregated enrollment table (filled by _preprocess).
        self.processed_data: pd.DataFrame = pd.DataFrame()
        # kode_mk values whose kategori_mk matches ELECTIVE_CATEGORY.
        self.elective_codes: Set[str] = set()
18
+
19
+ def load_and_process(self) -> Tuple[pd.DataFrame, Set[str]]:
20
+ self._load_excel()
21
+ self._validate_raw_data()
22
+ return self._preprocess()
23
+
24
+ def _load_excel(self):
25
+ logger.info(f"Loading {self.config.data.FILE_PATH}...")
26
+ try:
27
+ sheets = pd.read_excel(self.config.data.FILE_PATH, sheet_name=None)
28
+ self.raw_data = {
29
+ "courses": sheets[self.config.data.SHEET_COURSES],
30
+ "offerings": sheets[self.config.data.SHEET_OFFERINGS],
31
+ "students_yearly": sheets[self.config.data.SHEET_STUDENTS_YEARLY],
32
+ "students_ind": sheets[self.config.data.SHEET_STUDENTS_INDIVIDUAL],
33
+ }
34
+ except Exception as e:
35
+ logger.error(f"Failed to load Excel: {e}")
36
+ raise
37
+
38
+ def _validate_raw_data(self):
39
+ """Validate required columns and log data quality metrics."""
40
+ req_cols = {
41
+ "courses": ["kode_mk", "kategori_mk"],
42
+ "students_ind": ["kode_mk", "thn", "smt", "kode_mhs"],
43
+ "students_yearly": ["thn", "smt", "jumlah_aktif"],
44
+ }
45
+
46
+ for key, cols in req_cols.items():
47
+ if not all(col in self.raw_data[key].columns for col in cols):
48
+ raise ValueError(f"Missing columns in {key}: {cols}")
49
+
50
+ # Log data quality metrics
51
+ self._log_data_quality()
52
+
53
+ def _log_data_quality(self):
54
+ """Log data quality metrics for monitoring."""
55
+ courses_df = self.raw_data["courses"]
56
+ students_df = self.raw_data["students_ind"]
57
+
58
+ logger.info("=" * 60)
59
+ logger.info("Data Quality Report:")
60
+ logger.info(f" Courses (tabel1): {len(courses_df)} records")
61
+ logger.info(f" - Unique courses: {courses_df['kode_mk'].nunique()}")
62
+ logger.info(
63
+ f" - Duplicates: {len(courses_df) - courses_df['kode_mk'].nunique()}"
64
+ )
65
+ logger.info(f" Students (tabel4): {len(students_df)} records")
66
+ logger.info(f" - Unique students: {students_df['kode_mhs'].nunique()}")
67
+ logger.info("=" * 60)
68
+
69
+ def _clean_courses_data(self, courses: pd.DataFrame) -> pd.DataFrame:
70
+ """
71
+ Clean and standardize course catalog data.
72
+
73
+ Cleaning steps:
74
+ 1. Remove exact duplicates
75
+ 2. Standardize kategori_mk values (uppercase, strip whitespace)
76
+ 3. Remove courses with invalid/missing data
77
+ 4. Keep first occurrence for duplicate course codes
78
+ 5. Validate kategori_mk values
79
+ """
80
+ initial_count = len(courses)
81
+
82
+ # Step 1: Remove exact duplicate rows
83
+ courses = courses.drop_duplicates()
84
+ if len(courses) < initial_count:
85
+ logger.info(
86
+ f" Removed {initial_count - len(courses)} exact duplicate rows"
87
+ )
88
+
89
+ # Step 2: Standardize kategori_mk
90
+ courses["kategori_mk"] = (
91
+ courses["kategori_mk"]
92
+ .astype(str)
93
+ .str.upper()
94
+ .str.strip()
95
+ .replace("", np.nan)
96
+ )
97
+
98
+ # Step 3: Remove rows with missing critical data
99
+ before_dropna = len(courses)
100
+ courses = courses.dropna(subset=["kode_mk", "kategori_mk"])
101
+ if len(courses) < before_dropna:
102
+ logger.info(
103
+ f" Removed {before_dropna - len(courses)} rows with missing kode_mk or kategori_mk"
104
+ )
105
+
106
+ # Step 4: Validate kategori_mk values (should be P or W)
107
+ valid_categories = {"P", "W"}
108
+ invalid_mask = ~courses["kategori_mk"].isin(valid_categories)
109
+ if invalid_mask.any():
110
+ invalid_cats = courses[invalid_mask]["kategori_mk"].unique()
111
+ logger.warning(
112
+ f" Found {invalid_mask.sum()} courses with invalid categories: {invalid_cats}"
113
+ )
114
+ logger.warning(" Keeping only valid categories (P, W)")
115
+ courses = courses[~invalid_mask]
116
+
117
+ # Step 5: Remove duplicate course codes (keep first)
118
+ before_dedup = len(courses)
119
+ courses = courses.drop_duplicates(subset="kode_mk", keep="first")
120
+ if len(courses) < before_dedup:
121
+ logger.info(
122
+ f" Removed {before_dedup - len(courses)} duplicate course codes (kept first occurrence)"
123
+ )
124
+
125
+ logger.info(f" Final course catalog: {len(courses)} unique courses")
126
+
127
+ return courses
128
+
129
+ def _clean_students_data(self, students: pd.DataFrame) -> pd.DataFrame:
130
+ """
131
+ Clean and validate student enrollment data.
132
+
133
+ Cleaning steps:
134
+ 1. Remove rows with missing critical data
135
+ 2. Standardize data types
136
+ 3. Remove invalid year/semester values
137
+ 4. Remove duplicate enrollment records
138
+ """
139
+ initial_count = len(students)
140
+
141
+ # Step 1: Remove rows with missing critical data
142
+ students = students.dropna(subset=["kode_mk", "thn", "smt", "kode_mhs"])
143
+ if len(students) < initial_count:
144
+ logger.info(
145
+ f" Removed {initial_count - len(students)} rows with missing critical data"
146
+ )
147
+
148
+ # Step 2: Ensure correct data types
149
+ students["thn"] = pd.to_numeric(students["thn"], errors="coerce")
150
+ students["smt"] = pd.to_numeric(students["smt"], errors="coerce")
151
+
152
+ # Step 3: Remove rows with invalid year/semester after conversion
153
+ before_invalid = len(students)
154
+ students = students.dropna(subset=["thn", "smt"])
155
+ if len(students) < before_invalid:
156
+ logger.info(
157
+ f" Removed {before_invalid - len(students)} rows with invalid year/semester values"
158
+ )
159
+
160
+ # Step 4: Validate semester values (should be 1, 2, or 3)
161
+ valid_semesters = {1, 2, 3}
162
+ invalid_sem = ~students["smt"].isin(valid_semesters)
163
+ if invalid_sem.any():
164
+ logger.warning(
165
+ f" Found {invalid_sem.sum()} records with invalid semester values"
166
+ )
167
+ students = students[~invalid_sem]
168
+
169
+ # Step 5: Validate year range (reasonable academic years)
170
+ current_year = pd.Timestamp.now().year
171
+ invalid_year = (students["thn"] < 2000) | (students["thn"] > current_year + 1)
172
+ if invalid_year.any():
173
+ logger.warning(
174
+ f" Found {invalid_year.sum()} records with unreasonable year values"
175
+ )
176
+ students = students[~invalid_year]
177
+
178
+ # Step 6: Remove exact duplicate enrollments (same student, course, semester)
179
+ before_dedup = len(students)
180
+ students = students.drop_duplicates(
181
+ subset=["kode_mhs", "kode_mk", "thn", "smt"], keep="first"
182
+ )
183
+ if len(students) < before_dedup:
184
+ logger.info(
185
+ f" Removed {before_dedup - len(students)} duplicate enrollment records"
186
+ )
187
+
188
+ logger.info(f" Final enrollment records: {len(students)}")
189
+
190
+ return students
191
+
192
+ def _clean_yearly_population(self, yearly_pop: pd.DataFrame) -> pd.DataFrame:
193
+ """
194
+ Clean and validate yearly student population data.
195
+
196
+ Cleaning steps:
197
+ 1. Remove duplicates
198
+ 2. Validate and fill missing population data
199
+ 3. Ensure chronological order
200
+ """
201
+ # Remove duplicate year-semester combinations
202
+ before_dedup = len(yearly_pop)
203
+ yearly_pop = yearly_pop.drop_duplicates(subset=["thn", "smt"], keep="first")
204
+ if len(yearly_pop) < before_dedup:
205
+ logger.info(
206
+ f" Removed {before_dedup - len(yearly_pop)} duplicate year-semester records"
207
+ )
208
+
209
+ # Ensure jumlah_aktif is numeric and positive
210
+ yearly_pop["jumlah_aktif"] = pd.to_numeric(
211
+ yearly_pop["jumlah_aktif"], errors="coerce"
212
+ )
213
+
214
+ # Replace zero or negative values with NaN (will be filled later)
215
+ yearly_pop.loc[yearly_pop["jumlah_aktif"] <= 0, "jumlah_aktif"] = np.nan
216
+
217
+ # Sort by year and semester
218
+ yearly_pop = yearly_pop.sort_values(["thn", "smt"]).reset_index(drop=True)
219
+
220
+ logger.info(f" Yearly population records: {len(yearly_pop)}")
221
+
222
+ return yearly_pop
223
+
224
    def _preprocess(self) -> Tuple[pd.DataFrame, Set[str]]:
        """Clean, merge, and aggregate data with comprehensive cleaning.

        Returns:
            (df, elective_codes) where df has one row per
            (kode_mk, thn, smt) with columns `enrollment` (distinct
            students) and `jumlah_aktif` (active population). Returns an
            empty DataFrame early when no electives or no elective
            enrollments exist.
        """
        logger.info("Preprocessing data...")
        logger.info("-" * 60)

        # Step 1: Clean course catalog (copy: cleaning mutates columns)
        logger.info("Step 1: Cleaning course catalog...")
        courses = self._clean_courses_data(self.raw_data["courses"].copy())

        # Step 2: Identify elective courses (kategori_mk == ELECTIVE_CATEGORY)
        elective_category = self.config.data.ELECTIVE_CATEGORY
        self.elective_codes = set(
            courses[courses["kategori_mk"] == elective_category]["kode_mk"]
        )
        logger.info(f"Step 2: Identified {len(self.elective_codes)} elective courses")

        if len(self.elective_codes) == 0:
            # Without electives there is nothing to predict; bail out early.
            logger.warning(
                f"No elective courses found! Check if kategori_mk = '{elective_category}' exists in data."
            )
            logger.warning(
                f"Elective identification rule: {self.config.get_elective_filter_description()}"
            )
            return pd.DataFrame(), set()

        # Step 3: Clean student enrollment data
        logger.info("Step 3: Cleaning student enrollment data...")
        students = self._clean_students_data(self.raw_data["students_ind"].copy())

        # Step 4: Filter for elective courses only
        students = students[students["kode_mk"].isin(self.elective_codes)]
        logger.info(f"Step 4: Filtered to {len(students)} elective enrollment records")

        if len(students) == 0:
            logger.warning("No enrollment data found for elective courses!")
            return pd.DataFrame(), self.elective_codes

        # Step 5: Aggregate enrollment by course-semester.
        # nunique() counts distinct students, so leftover duplicates
        # cannot inflate the enrollment figure.
        logger.info("Step 5: Aggregating enrollment data...")
        enrollment = (
            students.groupby(["kode_mk", "thn", "smt"])["kode_mhs"]
            .nunique()
            .reset_index(name="enrollment")
        )
        logger.info(f" Created {len(enrollment)} course-semester enrollment records")

        # Step 6: Clean yearly population data
        logger.info("Step 6: Cleaning yearly population data...")
        yearly_pop = self._clean_yearly_population(
            self.raw_data["students_yearly"][["thn", "smt", "jumlah_aktif"]].copy()
        )

        # Step 7: Merge enrollment with population data (left join keeps
        # every enrollment row even when the population is unknown).
        logger.info("Step 7: Merging enrollment with population data...")
        df = enrollment.merge(yearly_pop, on=["thn", "smt"], how="left")

        # Step 8: Handle missing population data via forward/backward fill
        # (NOTE: the log says "interpolation" but this is ffill/bfill).
        missing_pop = df["jumlah_aktif"].isna().sum()
        if missing_pop > 0:
            logger.warning(
                f" {missing_pop} records missing population data - filling with interpolation"
            )
            df["jumlah_aktif"] = df["jumlah_aktif"].ffill().bfill()

            # If still missing (i.e. the column was entirely NaN), use a default
            if df["jumlah_aktif"].isna().any():
                default_pop = 500  # Reasonable default student population
                logger.warning(
                    f" Some population data still missing - using default: {default_pop}"
                )
                df["jumlah_aktif"] = df["jumlah_aktif"].fillna(default_pop)

        # Step 9: Validate enrollment data
        logger.info("Step 8: Validating final enrollment data...")
        df = self._validate_enrollment_data(df)

        # Step 10: Sort and finalize; also cached on the instance.
        df = df.sort_values(["kode_mk", "thn", "smt"]).reset_index(drop=True)
        self.processed_data = df

        logger.info("-" * 60)
        logger.info(
            f"✓ Preprocessing complete. {len(df)} enrollment records generated."
        )
        logger.info(f"✓ Year range: {df['thn'].min():.0f} - {df['thn'].max():.0f}")
        logger.info(f"✓ Courses with data: {df['kode_mk'].nunique()}")
        logger.info("-" * 60)

        return df, self.elective_codes
313
+
314
+ def _validate_enrollment_data(self, df: pd.DataFrame) -> pd.DataFrame:
315
+ """
316
+ Validate and clean the final enrollment dataset.
317
+
318
+ Checks:
319
+ 1. Remove records with zero enrollment
320
+ 2. Check for outliers
321
+ 3. Validate population data
322
+ """
323
+ initial_count = len(df)
324
+
325
+ # Remove zero enrollments
326
+ df = df[df["enrollment"] > 0]
327
+ if len(df) < initial_count:
328
+ logger.info(
329
+ f" Removed {initial_count - len(df)} records with zero enrollment"
330
+ )
331
+
332
+ # Check for extreme outliers in enrollment
333
+ for course in df["kode_mk"].unique():
334
+ course_data = df[df["kode_mk"] == course]["enrollment"]
335
+ if len(course_data) > 1:
336
+ q75, q25 = course_data.quantile([0.75, 0.25])
337
+ iqr = q75 - q25
338
+ upper_bound = q75 + (3 * iqr) # Using 3*IQR for outliers
339
+
340
+ outliers = course_data > upper_bound
341
+ if outliers.any():
342
+ logger.debug(
343
+ f" Course {course} has {outliers.sum()} potential outliers (keeping them)"
344
+ )
345
+
346
+ # Ensure population is reasonable
347
+ if (df["jumlah_aktif"] < 50).any():
348
+ logger.warning(" Some semesters have very low student population (<50)")
349
+
350
+ return df
data_validator.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Validation Utility
3
+
4
+ Provides pre-flight checks and data quality validation for the enrollment prediction system.
5
+ This module validates data availability, quality, and completeness before processing.
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass
10
+ from typing import Dict, List, Optional, Tuple
11
+
12
+ import pandas as pd
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class ValidationResult:
    """Outcome of a single validation check."""

    passed: bool                    # True when the check succeeded
    message: str                    # human-readable description of the finding
    severity: str = "INFO"          # one of: INFO, WARNING, ERROR
    details: Optional[Dict] = None  # optional structured context for the finding
25
+
26
+
27
@dataclass
class SemesterDataStatus:
    """Status of data availability for a specific semester."""

    year: int
    semester: int
    has_offerings: bool              # tabel2 has rows for this (year, semester)
    has_enrollments: bool            # tabel4 has rows for this (year, semester)
    has_elective_enrollments: bool   # at least one enrollment in an elective
    total_enrollments: int           # all enrollment records this semester
    elective_enrollments: int        # elective-only enrollment records
    # FIX: previously annotated List[str], but producers store a dict of
    # course code -> distinct enrolled students (see .to_dict() in
    # check_semester_data_availability) and consumers iterate .items().
    elective_courses: Dict[str, int]
39
+
40
+
41
class DataValidator:
    """Validates data quality and availability for the enrollment prediction system.

    Typical usage::

        validator = DataValidator("data.xlsx")
        passed, results = validator.validate_all()
        validator.print_validation_summary()
    """

    def __init__(self, file_path: str):
        """
        Initialize the validator.

        Args:
            file_path: Path to the Excel data file
        """
        self.file_path = file_path
        self.validation_results: List[ValidationResult] = []

    def validate_all(self) -> Tuple[bool, List[ValidationResult]]:
        """
        Run all validation checks.

        Returns:
            Tuple of (all_passed, list of validation results). ``all_passed``
            is False only when at least one ERROR-severity check failed;
            WARNING/INFO results never block processing.
        """
        logger.info("Running comprehensive data validation...")

        # Load raw data; a load failure is itself a fatal validation error.
        try:
            self.raw_data = self._load_raw_data()
        except Exception as e:
            self.validation_results.append(
                ValidationResult(
                    passed=False,
                    message=f"Failed to load data: {str(e)}",
                    severity="ERROR",
                )
            )
            return False, self.validation_results

        # Each check appends its findings to self.validation_results.
        self._validate_file_structure()
        self._validate_course_catalog()
        self._validate_elective_courses()
        self._validate_enrollment_data()
        self._validate_population_data()

        # Only ERROR-severity results can fail the overall validation.
        all_passed = all(
            r.passed for r in self.validation_results if r.severity == "ERROR"
        )

        return all_passed, self.validation_results

    def check_semester_data_availability(
        self, year: int, semester: int
    ) -> SemesterDataStatus:
        """
        Check data availability for a specific semester.

        Args:
            year: Academic year
            semester: Semester (1 or 2)

        Returns:
            SemesterDataStatus object with detailed availability info
        """
        self._ensure_raw_data()

        # Check course offerings (tabel2)
        offerings = self.raw_data["offerings"]
        offered_here = offerings[
            (offerings["tahun"] == year) & (offerings["semester"] == semester)
        ]
        has_offerings = len(offered_here) > 0

        # Check enrollments (tabel4)
        students = self.raw_data["students"]
        semester_enrollments = students[
            (students["thn"] == year) & (students["smt"] == semester)
        ]
        has_enrollments = len(semester_enrollments) > 0

        # Check elective enrollments
        elective_codes = self._get_elective_codes()
        elective_enrollments = semester_enrollments[
            semester_enrollments["kode_mk"].isin(elective_codes)
        ]
        has_elective_enrollments = len(elective_enrollments) > 0

        # Per-course distinct-student counts, busiest course first.
        # BUGFIX: initialise as a dict — the populated value is a dict
        # (code -> count) and callers iterate .items(); this previously
        # started out as an empty list, giving an inconsistent type.
        elective_courses: Dict[str, int] = {}
        if has_elective_enrollments:
            elective_courses = (
                elective_enrollments.groupby("kode_mk")["kode_mhs"]
                .nunique()
                .sort_values(ascending=False)
                .to_dict()
            )

        return SemesterDataStatus(
            year=year,
            semester=semester,
            has_offerings=has_offerings,
            has_enrollments=has_enrollments,
            has_elective_enrollments=has_elective_enrollments,
            total_enrollments=len(semester_enrollments),
            elective_enrollments=len(elective_enrollments),
            elective_courses=elective_courses,
        )

    def get_available_semesters_for_backtesting(self) -> List[Tuple[int, int]]:
        """
        Get list of semesters that have elective enrollment data (suitable for backtesting).

        Returns:
            List of (year, semester) tuples, most recent first.
        """
        self._ensure_raw_data()

        students = self.raw_data["students"]
        elective_codes = self._get_elective_codes()

        # Filter to elective enrollments only
        elective_students = students[students["kode_mk"].isin(elective_codes)]

        # Get unique year-semester combinations that actually hold records
        available = (
            elective_students.groupby(["thn", "smt"]).size().reset_index(name="count")
        )
        available = available[available["count"] > 0]

        semesters = [
            (int(row["thn"]), int(row["smt"])) for _, row in available.iterrows()
        ]
        semesters.sort(reverse=True)  # Most recent first

        return semesters

    def print_validation_summary(self):
        """Print a summary of validation results grouped by severity."""
        if not self.validation_results:
            print("\nWARNING: No validation has been run yet.")
            return

        print("\n" + "=" * 80)
        print("DATA VALIDATION SUMMARY")
        print("=" * 80)

        # Bucket results by severity for readable output.
        errors = [r for r in self.validation_results if r.severity == "ERROR"]
        warnings = [r for r in self.validation_results if r.severity == "WARNING"]
        info = [r for r in self.validation_results if r.severity == "INFO"]

        if errors:
            print(f"\nERROR ({len(errors)}):")
            for result in errors:
                print(f" - {result.message}")

        if warnings:
            print(f"\nWARNING ({len(warnings)}):")
            for result in warnings:
                print(f" - {result.message}")

        if info:
            print(f"\nINFO ({len(info)}):")
            for result in info:
                print(f" - {result.message}")

        print("\n" + "=" * 80)
        if not errors:
            print("VALIDATION PASSED - Data is ready for processing")
        else:
            print("VALIDATION FAILED - Please fix errors before proceeding")
        print("=" * 80)

    def _ensure_raw_data(self):
        """Lazily load the workbook the first time it is needed (no-op after)."""
        if not hasattr(self, "raw_data"):
            self.raw_data = self._load_raw_data()

    def _load_raw_data(self) -> Dict[str, pd.DataFrame]:
        """Load all four raw sheets from the Excel workbook."""
        logger.info(f"Loading data from {self.file_path}...")

        return {
            "courses": pd.read_excel(self.file_path, sheet_name="tabel1_data_matkul"),
            "offerings": pd.read_excel(
                self.file_path, sheet_name="tabel2_data_matkul_dibuka"
            ),
            "population": pd.read_excel(
                self.file_path, sheet_name="tabel3_data_mahasiswa_per_tahun"
            ),
            "students": pd.read_excel(
                self.file_path, sheet_name="tabel4_data_individu_mahasiswa"
            ),
        }

    def _validate_file_structure(self):
        """Validate that all required sheets and columns exist."""
        required_sheets = {
            "courses": ["kode_mk", "nama_mk", "kategori_mk"],
            "offerings": ["kode_mk", "tahun", "semester"],
            "students": ["kode_mk", "kode_mhs", "thn", "smt"],
            "population": ["jumlah_aktif"],  # tahun_ajaran and semester may vary
        }

        for sheet_name, required_cols in required_sheets.items():
            df = self.raw_data.get(sheet_name)
            if df is None:
                self.validation_results.append(
                    ValidationResult(
                        passed=False,
                        message=f"Sheet '{sheet_name}' not found",
                        severity="ERROR",
                    )
                )
                continue

            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                self.validation_results.append(
                    ValidationResult(
                        passed=False,
                        message=f"Missing columns in {sheet_name}: {missing_cols}",
                        severity="ERROR",
                    )
                )
            else:
                self.validation_results.append(
                    ValidationResult(
                        passed=True,
                        message=f"Sheet '{sheet_name}' has all required columns",
                        severity="INFO",
                    )
                )

    def _validate_course_catalog(self):
        """Validate course catalog (tabel1): duplicates and category values."""
        courses = self.raw_data["courses"]

        # Duplicate course codes are tolerated (cleaned downstream) but flagged.
        total_records = len(courses)
        unique_courses = courses["kode_mk"].nunique()
        duplicate_count = total_records - unique_courses

        if duplicate_count > 0:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"Course catalog has {duplicate_count:,} duplicate records (will be cleaned)",
                    severity="WARNING",
                    details={"total": total_records, "unique": unique_courses},
                )
            )

        # Only 'W' and 'P' are expected category codes.
        categories = courses["kategori_mk"].unique()
        non_standard = [c for c in categories if c not in ["W", "P"]]
        if non_standard:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"Non-standard categories found: {non_standard} (will be normalized)",
                    severity="WARNING",
                )
            )

    def _validate_elective_courses(self):
        """Validate that at least one elective course (kategori_mk == 'P') exists."""
        courses = self.raw_data["courses"]

        # Clean and identify electives (same normalization as _get_elective_codes).
        courses_clean = courses.drop_duplicates(subset="kode_mk").copy()
        courses_clean["kategori_mk"] = (
            courses_clean["kategori_mk"].astype(str).str.upper().str.strip()
        )

        electives = courses_clean[courses_clean["kategori_mk"] == "P"]
        elective_count = len(electives)

        if elective_count == 0:
            self.validation_results.append(
                ValidationResult(
                    passed=False,
                    message="No elective courses found (kategori_mk = 'P')",
                    severity="ERROR",
                )
            )
        else:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"Found {elective_count} elective courses",
                    severity="INFO",
                    details={"electives": electives["kode_mk"].tolist()},
                )
            )

    def _validate_enrollment_data(self):
        """Validate student enrollment data (tabel4)."""
        students = self.raw_data["students"]

        # Records missing any critical field cannot be attributed to a
        # course/semester and will be dropped downstream.
        critical_fields = ["kode_mk", "kode_mhs", "thn", "smt"]
        missing_data = students[critical_fields].isnull().any(axis=1).sum()

        if missing_data > 0:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"{missing_data} enrollment records have missing data (will be cleaned)",
                    severity="WARNING",
                )
            )

        # The same student enrolled twice in the same course/semester
        # counts as a duplicate record.
        duplicate_enrollments = students.duplicated(
            subset=["kode_mhs", "kode_mk", "thn", "smt"]
        ).sum()

        if duplicate_enrollments > 0:
            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"{duplicate_enrollments:,} duplicate enrollment records (will be cleaned)",
                    severity="WARNING",
                )
            )

        # Report the covered year range for reference.
        min_year = students["thn"].min()
        max_year = students["thn"].max()

        self.validation_results.append(
            ValidationResult(
                passed=True,
                message=f"Enrollment data spans {int(min_year)} to {int(max_year)}",
                severity="INFO",
            )
        )

    def _validate_population_data(self):
        """Validate yearly population data (tabel3)."""
        population = self.raw_data["population"]

        if len(population) == 0:
            self.validation_results.append(
                ValidationResult(
                    passed=False,
                    message="No population data found",
                    severity="ERROR",
                )
            )
            return

        # 'jumlah_aktif' is the only strictly required field here
        # (note: other columns such as tahun_ajaran/semester may vary).
        if "jumlah_aktif" in population.columns:
            min_pop = population["jumlah_aktif"].min()
            max_pop = population["jumlah_aktif"].max()

            self.validation_results.append(
                ValidationResult(
                    passed=True,
                    message=f"Population data: {len(population)} records, range {int(min_pop)}-{int(max_pop)} students",
                    severity="INFO",
                )
            )
        else:
            self.validation_results.append(
                ValidationResult(
                    passed=False,
                    message="Population data missing 'jumlah_aktif' column",
                    severity="ERROR",
                )
            )

    def _get_elective_codes(self) -> set:
        """Return the set of elective course codes (kategori_mk == 'P')."""
        courses = self.raw_data["courses"]
        courses_clean = courses.drop_duplicates(subset="kode_mk").copy()
        courses_clean["kategori_mk"] = (
            courses_clean["kategori_mk"].astype(str).str.upper().str.strip()
        )
        return set(courses_clean[courses_clean["kategori_mk"] == "P"]["kode_mk"])
421
+
422
+
423
+ if __name__ == "__main__":
424
+ # Example usage
425
+ logging.basicConfig(
426
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
427
+ )
428
+
429
+ validator = DataValidator(
430
+ "data/Data Perkuliahan Mahasiswa untuk Penelitian (8 Oktober 2025).xlsx"
431
+ )
432
+
433
+ # Run validation
434
+ passed, results = validator.validate_all()
435
+ validator.print_validation_summary()
436
+
437
+ # Check specific semesters
438
+ print("\n" + "=" * 80)
439
+ print("SEMESTER DATA AVAILABILITY")
440
+ print("=" * 80)
441
+
442
+ for year, semester in [(2024, 2), (2025, 1)]:
443
+ status = validator.check_semester_data_availability(year, semester)
444
+ print(f"\n{year} Semester {semester}:")
445
+ print(f" Offerings: {'Yes' if status.has_offerings else 'No'}")
446
+ print(
447
+ f" Enrollments: {'Yes' if status.has_enrollments else 'No'} ({status.total_enrollments} records)"
448
+ )
449
+ print(
450
+ f" Elective Enrollments: {'Yes' if status.has_elective_enrollments else 'No'} ({status.elective_enrollments} records)"
451
+ )
452
+ if status.elective_courses:
453
+ print(f" Elective courses: {len(status.elective_courses)}")
454
+ for code, count in list(status.elective_courses.items())[:5]:
455
+ print(f" - {code}: {count} students")
456
+
457
+ # Show available semesters for backtesting
458
+ print("\n" + "=" * 80)
459
+ print("SEMESTERS AVAILABLE FOR BACKTESTING")
460
+ print("=" * 80)
461
+ available = validator.get_available_semesters_for_backtesting()
462
+ if available:
463
+ print(f"\nFound {len(available)} semesters with elective enrollment data:")
464
+ for year, sem in available:
465
+ print(f" • {year} Semester {sem}")
466
+ else:
467
+ print("\nERROR: No semesters with elective enrollment data found!")
evaluator.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ import pandas as pd
8
+ import seaborn as sns
9
+ from sklearn.metrics import mean_absolute_error, mean_squared_error
10
+
11
+ from config import Config
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class Evaluator:
    """Backtesting and reporting utilities for enrollment predictions."""

    def __init__(self, config: Config):
        self.config = config

    def run_backtest(self, full_data: pd.DataFrame, predictor):
        """Simulate past semesters to check accuracy."""
        logger.info("Starting Backtest...")
        records = []

        first_year: int = self.config.backtest.START_YEAR
        last_year: int = self.config.backtest.END_YEAR

        for yr in range(first_year, last_year + 1):
            for semester in (1, 2):
                # Rows belonging to the semester being simulated.
                in_target = (full_data["thn"] == yr) & (full_data["smt"] == semester)
                test_set = full_data[in_target]
                if test_set.empty:
                    continue

                # Train on everything strictly earlier than the target semester.
                is_earlier = (full_data["thn"] < yr) | (
                    (full_data["thn"] == yr) & (full_data["smt"] < semester)
                )
                train_set = full_data[is_earlier]

                # Population estimate; fall back to the observed mean if the
                # population model cannot forecast this semester.
                try:
                    pop_est = predictor.get_student_forecast(yr, semester)
                except Exception:
                    pop_est = test_set["jumlah_aktif"].mean()

                for _, row in test_set.iterrows():
                    outcome = predictor.predict_course(
                        row["kode_mk"], train_set, yr, semester, pop_est
                    )

                    records.append(
                        {
                            "year": yr,
                            "semester": semester,
                            "kode_mk": row["kode_mk"],
                            "actual": row["enrollment"],
                            "predicted": outcome["val"],
                            "strategy": outcome["strategy"],
                            "error": abs(row["enrollment"] - outcome["val"]),
                        }
                    )

        return pd.DataFrame(records)

    def generate_metrics(self, results: pd.DataFrame):
        """Calculate and log performance metrics; returns {'mae', 'rmse'}."""
        results["error"] = abs(results["predicted"] - results["actual"])

        mae = mean_absolute_error(results["actual"], results["predicted"])
        rmse = np.sqrt(mean_squared_error(results["actual"], results["predicted"]))

        divider = "=" * 40
        logger.info("\n" + divider)
        logger.info("BACKTEST METRICS")
        logger.info(divider)
        logger.info(f"Overall MAE: {mae:.2f}")
        logger.info(f"Overall RMSE: {rmse:.2f}")

        logger.info("\nPerformance by Strategy:")
        per_strategy = results.groupby("strategy")["error"].mean()
        logger.info(per_strategy.to_string())

        self._plot_results(results)

        return {"mae": mae, "rmse": rmse}

    def _plot_results(self, df):
        """Generate simple Actual vs Predicted scatter plot."""
        out_dir = self.config.output.OUTPUT_DIR
        Path(out_dir).mkdir(parents=True, exist_ok=True)

        plt.figure(figsize=(10, 6))
        sns.scatterplot(
            data=df, x="actual", y="predicted", hue="strategy", style="strategy"
        )

        # Diagonal reference: points on the line are perfect predictions.
        axis_max = max(df["actual"].max(), df["predicted"].max())
        plt.plot([0, axis_max], [0, axis_max], "r--", alpha=0.5)

        plt.title("Actual vs Predicted Enrollment")
        plt.savefig(f"{out_dir}/backtest_scatter.png")
        plt.close()
prophet_predictor.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from prophet import Prophet
7
+
8
+ from config import Config
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class ProphetPredictor:
    """Forecasts per-course enrollment with Prophet (logistic growth plus an
    active-student-population regressor), falling back to simple mean/trend
    estimates when history is too short or the model output is implausible."""

    def __init__(self, config: Config):
        self.config = config
        # Population model; trained lazily by train_student_population_model().
        self.student_model: Optional[Prophet] = None

    def train_student_population_model(self, population_data: pd.DataFrame):
        """Fit a Prophet model on semester-level active-student counts.

        Args:
            population_data: DataFrame with columns ``thn``, ``smt`` and
                ``jumlah_aktif``.
        """
        df = population_data.copy()
        # Map (year, semester) onto a representative calendar date for Prophet.
        df["ds"] = pd.to_datetime(
            df["thn"].astype(str)
            + "-"
            + df["smt"].map(self.config.prediction.SEMESTER_TO_MONTH)
        )
        df["y"] = df["jumlah_aktif"]

        self.student_model = Prophet(daily_seasonality=False, weekly_seasonality=False)  # type: ignore[arg-type]
        self.student_model.fit(df)
        logger.info("Student population model trained.")

    def get_student_forecast(self, year: int, semester: int) -> float:
        """Forecast the active-student population for one semester.

        Returns:
            Forecast population, floored at 100.

        Raises:
            RuntimeError: If train_student_population_model() has not run.
        """
        # FIX: explicit raise instead of `assert` — assertions are stripped
        # under `python -O`, which would turn this guard into an opaque
        # AttributeError on the next line.
        if self.student_model is None:
            raise RuntimeError("Student model must be trained first")
        target_date = pd.to_datetime(
            f"{year}-{self.config.prediction.SEMESTER_TO_MONTH[semester]}"
        )
        future = pd.DataFrame({"ds": [target_date]})
        forecast = self.student_model.predict(future)
        # Floor at 100 to guard against degenerate forecasts.
        return max(forecast["yhat"].values[0], 100)

    def predict_course(
        self,
        course_code: str,
        df_history: pd.DataFrame,
        target_year: int,
        target_smt: int,
        student_pop: float,
    ) -> dict:
        """Predict enrollment for one course in one target semester.

        Returns:
            dict with keys ``val`` (predicted enrollment), ``strategy``
            (which prediction path produced it) and ``confidence``.
        """
        # Restrict history to the same semester number as the target.
        hist = df_history[
            (df_history["kode_mk"] == course_code)
            & (df_history["smt"] == target_smt)
        ].sort_values(["thn", "smt"])

        if len(hist) == 0:
            # Cold start: no history at all for this course/semester.
            return {
                "val": self.config.model.FALLBACK_DEFAULT,
                "strategy": "cold_start",
                "confidence": "low",
            }

        return self._predict_prophet_logistic(
            hist, target_year, target_smt, student_pop
        )

    @staticmethod
    def _bounded_growth_factor(pop: float, hist_pop_mean: float) -> float:
        """Population growth ratio clamped to [0.8, 1.3] to damp extreme swings."""
        return min(max(pop / hist_pop_mean, 0.8), 1.3)

    def _predict_prophet_logistic(
        self, hist: pd.DataFrame, year: int, smt: int, pop: float
    ) -> dict:
        """Prophet logistic-growth forecast with a population regressor.

        Falls back to mean/trend estimates when history has fewer than two
        observations, the regressor column is missing, fitting fails, or the
        forecast is implausible.
        """
        df = hist.copy()
        df["ds"] = pd.to_datetime(
            df["thn"].astype(int).astype(str)
            + "-"
            + df["smt"].astype(int).map(self.config.prediction.SEMESTER_TO_MONTH)
        )
        df["y"] = df["enrollment"]

        # Prophet needs at least two observations to fit anything useful.
        if df["y"].notna().sum() < 2:
            return {
                "val": hist["enrollment"].mean(),
                "strategy": "fallback_mean",
                "confidence": "medium",
            }

        if "jumlah_aktif" not in df.columns:
            logger.warning(
                "jumlah_aktif column missing from historical data - cannot use regressor"
            )
            return {
                "val": hist["enrollment"].mean(),
                "strategy": "fallback_mean",
                "confidence": "low",
            }

        hist_max = df["y"].max()
        hist_mean = df["y"].mean()

        # Logistic-growth capacity: bounded multiple of the historical peak.
        cap_value = min(
            hist_max * self.config.prediction.MAX_CAPACITY_MULTIPLIER,
            self.config.prediction.ABSOLUTE_MAX_STUDENTS,
        )

        df["cap"] = cap_value
        df["floor"] = 0

        try:
            m = Prophet(
                growth=self.config.model.GROWTH_MODE,
                changepoint_prior_scale=self.config.model.CHANGEPOINT_SCALE,
                seasonality_mode=self.config.model.SEASONALITY_MODE,
                daily_seasonality=False,  # type: ignore[arg-type]
                weekly_seasonality=False,  # type: ignore[arg-type]
            )

            m.add_regressor("jumlah_aktif", mode="multiplicative")
            m.fit(df[["ds", "y", "cap", "floor", "jumlah_aktif"]])

            future_date = pd.to_datetime(
                f"{year}-{self.config.prediction.SEMESTER_TO_MONTH[smt]}"
            )

            future = pd.DataFrame(
                {
                    "ds": [future_date],
                    "cap": [cap_value],
                    "floor": [0],
                    "jumlah_aktif": [pop],
                }
            )

            forecast = m.predict(future)
            raw_pred = forecast["yhat"].values[0]

            # Sanity window: negative, non-finite, or wildly above history/cap
            # indicates the model extrapolated badly.
            if (
                raw_pred < 0
                or not np.isfinite(raw_pred)
                or raw_pred > hist_max * 5
                or raw_pred > cap_value * 2
            ):
                logger.warning(
                    f"Prophet prediction ({raw_pred:.1f}) unrealistic. "
                    f"Using trend-based fallback. (hist_max={hist_max}, cap={cap_value})"
                )
                # Trend fallback: recent (or overall) mean scaled by the
                # clamped population growth factor (deduplicated via helper).
                growth_factor = self._bounded_growth_factor(
                    pop, df["jumlah_aktif"].mean()
                )
                if len(df) >= 3:
                    pred = df["y"].tail(3).mean() * growth_factor
                else:
                    pred = hist_mean * growth_factor

                pred = min(max(pred, 0), cap_value)

                return {
                    "val": pred,
                    "strategy": "trend_fallback",
                    "confidence": "medium",
                }

            pred = min(max(0, raw_pred), cap_value)

            return {
                "val": pred,
                "strategy": "prophet_logistic",
                "confidence": "high",
            }

        except Exception as e:
            logger.warning(f"Prophet failed for course. Error: {e}. Using fallback.")
            return {
                "val": hist["enrollment"].mean(),
                "strategy": "fallback_mean",
                "confidence": "medium",
            }

    def generate_batch_predictions(
        self,
        full_data: pd.DataFrame,
        course_metadata: pd.DataFrame,
        electives: set,
        year: int,
        smt: int,
    ):
        """Generate predictions for all courses.

        Returns:
            DataFrame sorted by predicted enrollment (descending) with a
            quota and open/close ("BUKA"/"TUTUP") recommendation per course.
        """
        student_pop = self.get_student_forecast(year, smt)
        results = []

        logger.info(
            f"Predicting for {len(electives)} courses (Pop: {int(student_pop)})..."
        )

        for code in electives:
            # ROBUSTNESS FIX: previously `.iloc[0]` raised IndexError when a
            # course had no metadata row; now it is logged and skipped.
            meta_rows = course_metadata[course_metadata["kode_mk"] == code]
            if meta_rows.empty:
                logger.warning(f"No metadata found for course {code}; skipping.")
                continue
            meta = meta_rows.iloc[0]

            pred_result = self.predict_course(code, full_data, year, smt, student_pop)
            pred_val = pred_result["val"]

            # Quota = prediction + safety buffer, at least the minimum to open.
            rec_quota = int(
                np.ceil(pred_val * (1 + self.config.prediction.BUFFER_PERCENT))
            )
            rec_quota = max(rec_quota, self.config.prediction.MIN_QUOTA_OPEN)

            status = (
                "BUKA"
                if pred_val >= self.config.prediction.MIN_PREDICT_THRESHOLD
                else "TUTUP"
            )

            results.append(
                {
                    "kode_mk": code,
                    "nama_mk": meta["nama_mk"],
                    "sks": meta["sks_mk"],
                    "predicted_enrollment": round(pred_val, 1),
                    "recommended_quota": rec_quota if status == "BUKA" else 0,
                    "recommendation": status,
                    "strategy": pred_result["strategy"],
                    "confidence": pred_result["confidence"],
                    # Class-count estimate assumes a nominal 40 seats per class.
                    "classes_est": int(np.ceil(rec_quota / 40))
                    if status == "BUKA"
                    else 0,
                }
            )

        return pd.DataFrame(results).sort_values(
            "predicted_enrollment", ascending=False
        )

    def predict_course_enrollment(
        self,
        course_code: str,
        train_data: pd.DataFrame,
        test_year: int,
        test_semester: int,
        test_student_count: float,
    ) -> tuple[float, str]:
        """Backtesting adapter around predict_course().

        Returns:
            (prediction, strategy) with the prediction sanitised to a
            finite, non-negative float.
        """
        result = self.predict_course(
            course_code=course_code,
            df_history=train_data,
            target_year=test_year,
            target_smt=test_semester,
            student_pop=test_student_count,
        )

        val = float(result["val"])
        if not np.isfinite(val):
            # Non-finite output (e.g. NaN mean of an empty slice) -> safe default.
            val = self.config.model.FALLBACK_DEFAULT
            strategy = "fallback_default"
        else:
            val = max(0.0, val)
            strategy = result["strategy"]

        return val, strategy
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas>=2.0.0
2
+ numpy>=1.24.0
3
+ openpyxl>=3.1.0
4
+ prophet>=1.1.5
5
+ scikit-learn>=1.3.0
6
+ matplotlib>=3.7.0
7
+ seaborn>=0.12.0
8
+ gradio>=4.0.0
9
+ python-dateutil>=2.8.2
10
+ huggingface_hub>=0.20.0
utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+
7
+
8
def setup_logging(level: str):
    """Configure root logging and silence the noisy forecasting backends.

    Args:
        level: Root log level name, e.g. "INFO" or "DEBUG".
    """
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%H:%M:%S",
        level=level,
    )

    # prophet/cmdstanpy are verbose at INFO; cap them at WARNING.
    for noisy_logger in ("prophet", "cmdstanpy"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
+ logging.getLogger("cmdstanpy").setLevel(logging.WARNING)
17
+
18
+
19
+ def save_excel(df: pd.DataFrame, filename: str, output_dir: str):
20
+ Path(output_dir).mkdir(exist_ok=True, parents=True)
21
+ path = os.path.join(output_dir, filename)
22
+ df.to_excel(path, index=False)
23
+ logging.info(f"Results saved to: {path}")