File size: 15,226 Bytes
61b18b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3939d46
 
 
 
7b3f4e5
61b18b3
 
 
 
 
bcd37b2
 
 
 
61b18b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3939d46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b3f4e5
 
3939d46
7b3f4e5
 
3939d46
 
7b3f4e5
 
3939d46
 
7b3f4e5
3939d46
 
 
 
 
 
 
 
 
7b3f4e5
3939d46
 
 
 
 
 
7b3f4e5
 
 
 
 
 
 
 
 
 
 
 
 
 
3939d46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61b18b3
 
 
 
3939d46
 
61b18b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3939d46
 
 
61b18b3
 
 
 
 
 
 
 
 
3939d46
 
61b18b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3939d46
 
 
 
61b18b3
 
 
 
 
 
 
 
bcd37b2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import re
from typing import List, Dict, Tuple
from database_connection import DatabaseConnection
import os

class CourseRecommender:
    def __init__(self):
        """Set up the classifier, preprocessing helpers, and auto-learning state."""
        # Core ML components
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.scaler = StandardScaler()
        self.label_encoders = {}

        # External data access
        self.db_connection = DatabaseConnection()

        # Training / auto-learning bookkeeping
        self.is_trained = False
        self._available_courses = None       # lazily cached course list
        self._last_data_count = 0            # feedback count at last (re)train
        self._auto_retrain_threshold = 5     # retrain after this many new feedbacks
        self._min_samples_for_training = 10  # minimum samples before training at all
        self._local_feedback = []            # in-memory feedback log used for learning
        
    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode categorical columns ('strand', 'hobbies') to integer labels.

        Fits a LabelEncoder per column on first use. On later calls, values the
        encoder has never seen are individually mapped to a fallback known class.
        (The previous implementation replaced the ENTIRE column with the fallback
        whenever even one value was unseen, destroying all valid data in it.)

        Args:
            df: Frame containing at least 'strand' and 'hobbies' columns.

        Returns:
            A copy of *df* with the categorical columns integer-encoded.
        """
        df_processed = df.copy()

        # Normalize strand to uppercase for case-insensitive matching
        if 'strand' in df_processed.columns:
            df_processed['strand'] = df_processed['strand'].astype(str).str.upper()

        categorical_columns = ['strand', 'hobbies']

        for col in categorical_columns:
            if col not in self.label_encoders:
                # First time this column is seen: fit a fresh encoder.
                self.label_encoders[col] = LabelEncoder()
                df_processed[col] = self.label_encoders[col].fit_transform(df_processed[col].astype(str))
            else:
                encoder = self.label_encoders[col]
                known = set(encoder.classes_)
                # NOTE(review): classes_[0] is the alphabetically-first known
                # label, not the most common one — a frequency-based fallback
                # would need counts stored at fit time.
                fallback = encoder.classes_[0]
                # Map only the unseen values to the fallback; seen values encode normally.
                values = df_processed[col].astype(str).map(
                    lambda v: v if v in known else fallback
                )
                df_processed[col] = encoder.transform(values)

        return df_processed
    
    def extract_hobbies_features(self, hobbies: str) -> Dict[str, int]:
        """Count keyword hits per hobby category for a free-text hobbies string.

        Missing/NaN input is treated as an empty string; matching is
        case-insensitive substring search.
        """
        if not hobbies or pd.isna(hobbies):
            hobbies = ""

        text = str(hobbies).lower()

        # Keyword lists defining each hobby category.
        hobby_categories = {
            'technical': ['programming', 'coding', 'computer', 'technology', 'software', 'gaming', 'electronics', 'math', 'mathematics'],
            'creative': ['art', 'music', 'writing', 'design', 'photography', 'dancing', 'drawing', 'literature'],
            'academic': ['reading', 'mathematics', 'science', 'research', 'studying', 'history', 'literature', 'books'],
            'physical': ['sports', 'fitness', 'exercise', 'running', 'swimming', 'basketball', 'football', 'gym'],
            'social': ['traveling', 'cooking', 'volunteering', 'community', 'leadership', 'social']
        }

        # Booleans sum to ints, matching the original per-keyword tally.
        return {
            f'hobby_{category}': sum(keyword in text for keyword in keywords)
            for category, keywords in hobby_categories.items()
        }
    
    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepare features for the model"""
        df_features = df.copy()
        
        # Extract hobby features
        hobby_features = []
        for hobbies in df['hobbies']:
            features = self.extract_hobbies_features(hobbies)
            hobby_features.append(features)
        
        hobby_df = pd.DataFrame(hobby_features)
        df_features = pd.concat([df_features, hobby_df], axis=1)
        
        # Normalize GWA to 0-1 scale (75-100 -> 0-1)
        df_features['gwa_normalized'] = (df_features['gwa'] - 75) / 25
        
        # Create stanine bins
        df_features['stanine_high'] = (df_features['stanine'] >= 7).astype(int)
        df_features['stanine_medium'] = ((df_features['stanine'] >= 4) & (df_features['stanine'] < 7)).astype(int)
        df_features['stanine_low'] = (df_features['stanine'] < 4).astype(int)
        
        return df_features
    
    def get_available_courses(self):
        """Get available courses with caching"""
        if self._available_courses is None:
            # Try to get courses from /courses endpoint first
            courses = self.db_connection.get_available_courses()
            if not courses:
                print("No courses found in /courses endpoint. Using courses from student feedback data...")
                # Get courses from student feedback data
                df_temp = self.db_connection.get_student_feedback_counts()
                if df_temp.empty:
                    raise ValueError("No courses available in /courses endpoint and no student feedback data found.")
                courses = df_temp['course'].unique().tolist()
                print(f"Using courses from student feedback: {courses}")
            
            self._available_courses = courses
            print(f"Available courses cached: {len(courses)} courses")
        
        return self._available_courses
    
    def refresh_courses_cache(self):
        """Refresh the available courses cache"""
        self._available_courses = None
        return self.get_available_courses()
    
    def get_current_data_count(self):
        """Get current number of feedback records in database"""
        try:
            df = self.db_connection.get_student_feedback_counts()
            return len(df) if not df.empty else 0
        except:
            return 0
    
    def check_and_auto_retrain(self):
        """Check if enough new data exists and auto-retrain if needed"""
        # Use local feedback count for auto-retraining
        local_feedback_count = len(self._local_feedback)
        
        if local_feedback_count < self._min_samples_for_training:
            print(f"Not enough local feedback for training: {local_feedback_count} < {self._min_samples_for_training}")
            return False
        
        if local_feedback_count - self._last_data_count >= self._auto_retrain_threshold:
            print(f"Auto-retraining triggered: {local_feedback_count - self._last_data_count} new local feedbacks")
            try:
                accuracy = self.train_model(use_database=True)
                self._last_data_count = local_feedback_count
                print(f"Auto-retraining completed with accuracy: {accuracy:.3f}")
                return True
            except Exception as e:
                print(f"Auto-retraining failed: {e}")
                return False
        
        return False
    
    def add_feedback_with_learning(self, course: str, stanine: int, gwa: float, strand: str, 
                                 rating: str, hobbies: str) -> bool:
        """Add feedback to database and trigger auto-learning if needed"""
        # Add feedback to database
        success = self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)
        
        if success:
            print(f"Feedback added for course: {course}")
            
            # Store feedback locally for learning (since API has issues)
            feedback_record = {
                'course': course,
                'stanine': stanine,
                'gwa': gwa,
                'strand': strand,
                'rating': rating,
                'hobbies': hobbies,
                'count': 1
            }
            self._local_feedback.append(feedback_record)
            print(f"Feedback stored locally for learning: {len(self._local_feedback)} total")
            
            # Check if we should auto-retrain
            self.check_and_auto_retrain()
        
        return success
    
    def configure_auto_learning(self, retrain_threshold=5, min_samples=10):
        """Configure auto-learning parameters"""
        self._auto_retrain_threshold = retrain_threshold
        self._min_samples_for_training = min_samples
        print(f"Auto-learning configured: retrain every {retrain_threshold} new feedbacks, minimum {min_samples} samples")
    
    def get_learning_status(self):
        """Get current learning status"""
        current_count = self.get_current_data_count()
        return {
            'current_data_count': current_count,
            'last_trained_count': self._last_data_count,
            'new_feedbacks': current_count - self._last_data_count,
            'retrain_threshold': self._auto_retrain_threshold,
            'min_samples': self._min_samples_for_training,
            'ready_for_retrain': (current_count - self._last_data_count) >= self._auto_retrain_threshold
        }
    
    def train_model(self, use_database: bool = True):
        """Train the course recommender on student feedback data.

        Loads feedback, keeps only rows for courses present in the /courses
        endpoint, cleans types, engineers features, and fits the random-forest
        classifier on an 80/20 split. The split is stratified by course when
        every course has at least 2 samples; otherwise it falls back to an
        unstratified split (train_test_split raises on singleton classes, which
        is common with small feedback datasets).

        Args:
            use_database: kept for backward compatibility; training always
                reads from the database connection regardless of its value.

        Returns:
            Test-set accuracy of the fitted model.

        Raises:
            ValueError: when no usable training data is available.
        """
        print("Loading training data from student feedback...")

        available_courses = self.get_available_courses()

        df = self.db_connection.get_student_feedback_counts()
        if df.empty:
            raise ValueError("No student feedback data available for training")

        print(f"Student feedback data: {len(df)} samples")
        print(f"Feedback courses: {df['course'].unique().tolist()}")

        # Only train on courses we are actually allowed to recommend.
        df_filtered = df[df['course'].isin(available_courses)]
        if df_filtered.empty:
            raise ValueError("No training data available for courses that exist in /courses endpoint")

        print(f"Training with {len(df_filtered)} samples (filtered to available courses)")

        df_clean = df_filtered.copy()

        # Coerce numeric columns; invalid entries become NaN and are dropped below.
        df_clean['stanine'] = pd.to_numeric(df_clean['stanine'], errors='coerce')
        df_clean['gwa'] = pd.to_numeric(df_clean['gwa'], errors='coerce')
        df_clean['rating'] = df_clean['rating'].astype(str)

        df_clean = df_clean.dropna(subset=['stanine', 'gwa'])

        if df_clean.empty:
            raise ValueError("No valid training data after cleaning")

        print(f"Training with {len(df_clean)} clean samples")

        # Feature engineering, then categorical encoding.
        df_features = self.prepare_features(df_clean)
        df_processed = self.preprocess_data(df_features)

        # Must match the column set/order used in predict_course.
        feature_columns = [
            'stanine', 'gwa_normalized', 'strand', 'hobby_technical', 
            'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
            'stanine_high', 'stanine_medium', 'stanine_low'
        ]

        X = df_processed[feature_columns]
        y = df_processed['course']

        # Stratify only when every course has >= 2 samples; otherwise
        # train_test_split raises "The least populated class ... has only 1 member".
        stratify_arg = y if y.value_counts().min() >= 2 else None

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=stratify_arg
        )

        # Scaler is fit on training data only to avoid test-set leakage.
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        self.model.fit(X_train_scaled, y_train)

        y_pred = self.model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Model accuracy: {accuracy:.3f}")

        self.is_trained = True

        # Persist artifacts so predict_course can load them after a restart.
        self.save_model()

        # Remember how much data this model was trained on (for auto-retrain).
        self._last_data_count = len(df_clean)

        return accuracy
    
    def predict_course(self, stanine: int, gwa: float, strand: str, hobbies: str) -> List[Tuple[str, float]]:
        """Predict course recommendations using student feedback data and available courses"""
        if not self.is_trained:
            self.load_model()
            if not self.is_trained:
                raise ValueError("Model not trained. Please train the model first.")
        
        # Get available courses with caching
        available_courses = self.get_available_courses()
        
        # Create input data
        input_data = pd.DataFrame({
            'stanine': [stanine],
            'gwa': [gwa],
            'strand': [strand],
            'hobbies': [hobbies]
        })
        
        # Prepare features
        input_features = self.prepare_features(input_data)
        input_processed = self.preprocess_data(input_features)
        
        # Select same features as training
        feature_columns = [
            'stanine', 'gwa_normalized', 'strand', 'hobby_technical', 
            'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
            'stanine_high', 'stanine_medium', 'stanine_low'
        ]
        
        X = input_processed[feature_columns]
        X_scaled = self.scaler.transform(X)
        
        # Get predictions with probabilities
        probabilities = self.model.predict_proba(X_scaled)[0]
        classes = self.model.classes_
        
        # Filter recommendations to only include courses available in /courses endpoint
        available_recommendations = []
        for i, course in enumerate(classes):
            if course in available_courses:
                available_recommendations.append((course, probabilities[i]))
        
        # Sort by probability and get top 5
        available_recommendations.sort(key=lambda x: x[1], reverse=True)
        recommendations = available_recommendations[:5]
        
        return recommendations
    
    def save_model(self):
        """Persist the model, label encoders, and scaler under models/."""
        os.makedirs('models', exist_ok=True)
        # Map each artifact to its on-disk path and dump them in turn.
        artifacts = {
            'models/course_recommender_model.pkl': self.model,
            'models/label_encoders.pkl': self.label_encoders,
            'models/scaler.pkl': self.scaler,
        }
        for path, obj in artifacts.items():
            joblib.dump(obj, path)
        print("Model saved successfully")
    
    def load_model(self):
        """Load the trained model and encoders"""
        # NOTE(review): each artifact is assigned as soon as it loads, so if a
        # later file is missing the instance can be left partially updated
        # (e.g. new model, old scaler) with is_trained=False — the statement
        # order here is load-bearing.
        try:
            self.model = joblib.load('models/course_recommender_model.pkl')
            self.label_encoders = joblib.load('models/label_encoders.pkl')
            self.scaler = joblib.load('models/scaler.pkl')
            self.is_trained = True
            
            # Initialize data count tracking
            # (baseline so auto-retraining only counts feedback added after this load)
            self._last_data_count = self.get_current_data_count()
            
            print("Model loaded successfully")
        except FileNotFoundError:
            # Only a missing file is expected here; a corrupted pickle or other
            # joblib error would propagate to the caller.
            print("No saved model found. Please train the model first.")
            self.is_trained = False
    
    def add_feedback(self, course: str, stanine: int, gwa: float, strand: str, 
                    rating: int, hobbies: str) -> bool:
        """Add user feedback to the database"""
        return self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)