markobinario committed on
Commit
61b18b3
·
verified ·
1 Parent(s): 0e1524e

Update course_recommender.py

Browse files
Files changed (1) hide show
  1. course_recommender.py +254 -208
course_recommender.py CHANGED
@@ -1,208 +1,254 @@
1
- import pandas as pd
2
- import numpy as np
3
- from sklearn.ensemble import RandomForestClassifier
4
- from sklearn.preprocessing import LabelEncoder, StandardScaler
5
- from sklearn.model_selection import train_test_split
6
- from sklearn.metrics import accuracy_score, classification_report
7
- import joblib
8
- import re
9
- from typing import List, Dict, Tuple
10
- from database_connection import DatabaseConnection
11
- import os
12
-
13
- class CourseRecommender:
14
- def __init__(self):
15
- self.model = RandomForestClassifier(n_estimators=100, random_state=42)
16
- self.label_encoders = {}
17
- self.scaler = StandardScaler()
18
- self.db_connection = DatabaseConnection()
19
- self.is_trained = False
20
-
21
- def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
22
- """Preprocess the data for training"""
23
- df_processed = df.copy()
24
-
25
- # Encode categorical variables
26
- categorical_columns = ['strand', 'hobbies']
27
-
28
- for col in categorical_columns:
29
- if col not in self.label_encoders:
30
- self.label_encoders[col] = LabelEncoder()
31
- df_processed[col] = self.label_encoders[col].fit_transform(df_processed[col].astype(str))
32
- else:
33
- # Handle unseen labels by using a default value
34
- try:
35
- df_processed[col] = self.label_encoders[col].transform(df_processed[col].astype(str))
36
- except ValueError:
37
- # For unseen labels, use the most common label from training
38
- most_common = self.label_encoders[col].classes_[0]
39
- df_processed[col] = self.label_encoders[col].transform([most_common] * len(df_processed))
40
-
41
- return df_processed
42
-
43
- def extract_hobbies_features(self, hobbies: str) -> Dict[str, int]:
44
- """Extract features from hobbies string"""
45
- if not hobbies or pd.isna(hobbies):
46
- hobbies = ""
47
-
48
- hobbies_lower = str(hobbies).lower()
49
-
50
- # Define hobby categories
51
- hobby_categories = {
52
- 'technical': ['programming', 'coding', 'computer', 'technology', 'software', 'gaming', 'electronics', 'math', 'mathematics'],
53
- 'creative': ['art', 'music', 'writing', 'design', 'photography', 'dancing', 'drawing', 'literature'],
54
- 'academic': ['reading', 'mathematics', 'science', 'research', 'studying', 'history', 'literature', 'books'],
55
- 'physical': ['sports', 'fitness', 'exercise', 'running', 'swimming', 'basketball', 'football', 'gym'],
56
- 'social': ['traveling', 'cooking', 'volunteering', 'community', 'leadership', 'social']
57
- }
58
-
59
- features = {}
60
- for category, keywords in hobby_categories.items():
61
- features[f'hobby_{category}'] = sum(1 for keyword in keywords if keyword in hobbies_lower)
62
-
63
- return features
64
-
65
- def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
66
- """Prepare features for the model"""
67
- df_features = df.copy()
68
-
69
- # Extract hobby features
70
- hobby_features = []
71
- for hobbies in df['hobbies']:
72
- features = self.extract_hobbies_features(hobbies)
73
- hobby_features.append(features)
74
-
75
- hobby_df = pd.DataFrame(hobby_features)
76
- df_features = pd.concat([df_features, hobby_df], axis=1)
77
-
78
- # Normalize GWA to 0-1 scale (75-100 -> 0-1)
79
- df_features['gwa_normalized'] = (df_features['gwa'] - 75) / 25
80
-
81
- # Create stanine bins
82
- df_features['stanine_high'] = (df_features['stanine'] >= 7).astype(int)
83
- df_features['stanine_medium'] = ((df_features['stanine'] >= 4) & (df_features['stanine'] < 7)).astype(int)
84
- df_features['stanine_low'] = (df_features['stanine'] < 4).astype(int)
85
-
86
- return df_features
87
-
88
- def train_model(self, use_database: bool = True):
89
- """Train the recommendation model"""
90
- print("Loading training data...")
91
-
92
- if use_database:
93
- # Try to get data from database first
94
- df = self.db_connection.get_student_feedback_counts()
95
- if df.empty:
96
- print("No data from database, using basic training data...")
97
- from basic_training_data import create_basic_training_data
98
- df = create_basic_training_data()
99
- else:
100
- from basic_training_data import create_basic_training_data
101
- df = create_basic_training_data()
102
-
103
- if df.empty:
104
- raise ValueError("No training data available")
105
-
106
- print(f"Training with {len(df)} samples")
107
-
108
- # Prepare features
109
- df_features = self.prepare_features(df)
110
- df_processed = self.preprocess_data(df_features)
111
-
112
- # Select features for training
113
- feature_columns = [
114
- 'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
115
- 'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
116
- 'stanine_high', 'stanine_medium', 'stanine_low'
117
- ]
118
-
119
- X = df_processed[feature_columns]
120
- y = df_processed['course']
121
-
122
- # Split data
123
- X_train, X_test, y_train, y_test = train_test_split(
124
- X, y, test_size=0.2, random_state=42, stratify=y
125
- )
126
-
127
- # Scale features
128
- X_train_scaled = self.scaler.fit_transform(X_train)
129
- X_test_scaled = self.scaler.transform(X_test)
130
-
131
- # Train model
132
- self.model.fit(X_train_scaled, y_train)
133
-
134
- # Evaluate
135
- y_pred = self.model.predict(X_test_scaled)
136
- accuracy = accuracy_score(y_test, y_pred)
137
- print(f"Model accuracy: {accuracy:.3f}")
138
-
139
- self.is_trained = True
140
-
141
- # Save model
142
- self.save_model()
143
-
144
- return accuracy
145
-
146
- def predict_course(self, stanine: int, gwa: float, strand: str, hobbies: str) -> List[Tuple[str, float]]:
147
- """Predict course recommendations"""
148
- if not self.is_trained:
149
- self.load_model()
150
- if not self.is_trained:
151
- raise ValueError("Model not trained. Please train the model first.")
152
-
153
- # Create input data
154
- input_data = pd.DataFrame({
155
- 'stanine': [stanine],
156
- 'gwa': [gwa],
157
- 'strand': [strand],
158
- 'hobbies': [hobbies]
159
- })
160
-
161
- # Prepare features
162
- input_features = self.prepare_features(input_data)
163
- input_processed = self.preprocess_data(input_features)
164
-
165
- # Select same features as training
166
- feature_columns = [
167
- 'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
168
- 'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
169
- 'stanine_high', 'stanine_medium', 'stanine_low'
170
- ]
171
-
172
- X = input_processed[feature_columns]
173
- X_scaled = self.scaler.transform(X)
174
-
175
- # Get predictions with probabilities
176
- probabilities = self.model.predict_proba(X_scaled)[0]
177
- classes = self.model.classes_
178
-
179
- # Get top 5 recommendations
180
- top_indices = np.argsort(probabilities)[-5:][::-1]
181
- recommendations = [(classes[i], probabilities[i]) for i in top_indices]
182
-
183
- return recommendations
184
-
185
- def save_model(self):
186
- """Save the trained model and encoders"""
187
- os.makedirs('models', exist_ok=True)
188
- joblib.dump(self.model, 'models/course_recommender_model.pkl')
189
- joblib.dump(self.label_encoders, 'models/label_encoders.pkl')
190
- joblib.dump(self.scaler, 'models/scaler.pkl')
191
- print("Model saved successfully")
192
-
193
- def load_model(self):
194
- """Load the trained model and encoders"""
195
- try:
196
- self.model = joblib.load('models/course_recommender_model.pkl')
197
- self.label_encoders = joblib.load('models/label_encoders.pkl')
198
- self.scaler = joblib.load('models/scaler.pkl')
199
- self.is_trained = True
200
- print("Model loaded successfully")
201
- except FileNotFoundError:
202
- print("No saved model found. Please train the model first.")
203
- self.is_trained = False
204
-
205
- def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
206
- rating: int, hobbies: str) -> bool:
207
- """Add user feedback to the database"""
208
- return self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.metrics import accuracy_score, classification_report
7
+ import joblib
8
+ import re
9
+ from typing import List, Dict, Tuple
10
+ from database_connection import DatabaseConnection
11
+ import os
12
+
13
class CourseRecommender:
    """Recommends academic courses from stanine score, GWA, strand, and
    hobby text, using a RandomForest classifier trained on student feedback."""

    def __init__(self):
        # Persistence / feedback layer (project-local module).
        self.db_connection = DatabaseConnection()
        # Fixed random_state keeps training runs reproducible.
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        # One LabelEncoder per categorical column, created lazily in preprocess_data().
        self.label_encoders = {}
        self.scaler = StandardScaler()
        # Becomes True after train_model() or a successful load_model().
        self.is_trained = False
20
+
21
+ def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
22
+ """Preprocess the data for training"""
23
+ df_processed = df.copy()
24
+
25
+ # Encode categorical variables
26
+ categorical_columns = ['strand', 'hobbies']
27
+
28
+ for col in categorical_columns:
29
+ if col not in self.label_encoders:
30
+ self.label_encoders[col] = LabelEncoder()
31
+ df_processed[col] = self.label_encoders[col].fit_transform(df_processed[col].astype(str))
32
+ else:
33
+ # Handle unseen labels by using a default value
34
+ try:
35
+ df_processed[col] = self.label_encoders[col].transform(df_processed[col].astype(str))
36
+ except ValueError:
37
+ # For unseen labels, use the most common label from training
38
+ most_common = self.label_encoders[col].classes_[0]
39
+ df_processed[col] = self.label_encoders[col].transform([most_common] * len(df_processed))
40
+
41
+ return df_processed
42
+
43
+ def extract_hobbies_features(self, hobbies: str) -> Dict[str, int]:
44
+ """Extract features from hobbies string"""
45
+ if not hobbies or pd.isna(hobbies):
46
+ hobbies = ""
47
+
48
+ hobbies_lower = str(hobbies).lower()
49
+
50
+ # Define hobby categories
51
+ hobby_categories = {
52
+ 'technical': ['programming', 'coding', 'computer', 'technology', 'software', 'gaming', 'electronics', 'math', 'mathematics'],
53
+ 'creative': ['art', 'music', 'writing', 'design', 'photography', 'dancing', 'drawing', 'literature'],
54
+ 'academic': ['reading', 'mathematics', 'science', 'research', 'studying', 'history', 'literature', 'books'],
55
+ 'physical': ['sports', 'fitness', 'exercise', 'running', 'swimming', 'basketball', 'football', 'gym'],
56
+ 'social': ['traveling', 'cooking', 'volunteering', 'community', 'leadership', 'social']
57
+ }
58
+
59
+ features = {}
60
+ for category, keywords in hobby_categories.items():
61
+ features[f'hobby_{category}'] = sum(1 for keyword in keywords if keyword in hobbies_lower)
62
+
63
+ return features
64
+
65
+ def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
66
+ """Prepare features for the model"""
67
+ df_features = df.copy()
68
+
69
+ # Extract hobby features
70
+ hobby_features = []
71
+ for hobbies in df['hobbies']:
72
+ features = self.extract_hobbies_features(hobbies)
73
+ hobby_features.append(features)
74
+
75
+ hobby_df = pd.DataFrame(hobby_features)
76
+ df_features = pd.concat([df_features, hobby_df], axis=1)
77
+
78
+ # Normalize GWA to 0-1 scale (75-100 -> 0-1)
79
+ df_features['gwa_normalized'] = (df_features['gwa'] - 75) / 25
80
+
81
+ # Create stanine bins
82
+ df_features['stanine_high'] = (df_features['stanine'] >= 7).astype(int)
83
+ df_features['stanine_medium'] = ((df_features['stanine'] >= 4) & (df_features['stanine'] < 7)).astype(int)
84
+ df_features['stanine_low'] = (df_features['stanine'] < 4).astype(int)
85
+
86
+ return df_features
87
+
88
+ def train_model(self, use_database: bool = True):
89
+ """Train the recommendation model using student feedback data"""
90
+ print("Loading training data from student feedback...")
91
+
92
+ # Get available courses from /courses endpoint
93
+ available_courses = self.db_connection.get_available_courses()
94
+ if not available_courses:
95
+ print("No courses found in /courses endpoint. Using courses from student feedback data...")
96
+ # Get courses from student feedback data
97
+ df_temp = self.db_connection.get_student_feedback_counts()
98
+ if df_temp.empty:
99
+ raise ValueError("No courses available in /courses endpoint and no student feedback data found.")
100
+ available_courses = df_temp['course'].unique().tolist()
101
+ print(f"Using courses from student feedback: {available_courses}")
102
+
103
+ print(f"Available courses from /courses: {len(available_courses)}")
104
+ print(f"Available courses: {available_courses}")
105
+
106
+ # Get training data from student feedback
107
+ df = self.db_connection.get_student_feedback_counts()
108
+ if df.empty:
109
+ raise ValueError("No student feedback data available for training")
110
+
111
+ print(f"Student feedback data: {len(df)} samples")
112
+ print(f"Feedback courses: {df['course'].unique().tolist()}")
113
+
114
+ # Filter training data to only include courses that are available in /courses
115
+ df_filtered = df[df['course'].isin(available_courses)]
116
+ if df_filtered.empty:
117
+ raise ValueError("No training data available for courses that exist in /courses endpoint")
118
+
119
+ print(f"Training with {len(df_filtered)} samples (filtered to available courses)")
120
+
121
+ # Clean and prepare data
122
+ df_clean = df_filtered.copy()
123
+
124
+ # Convert data types
125
+ df_clean['stanine'] = pd.to_numeric(df_clean['stanine'], errors='coerce')
126
+ df_clean['gwa'] = pd.to_numeric(df_clean['gwa'], errors='coerce')
127
+ df_clean['rating'] = df_clean['rating'].astype(str)
128
+
129
+ # Remove rows with invalid data
130
+ df_clean = df_clean.dropna(subset=['stanine', 'gwa'])
131
+
132
+ if df_clean.empty:
133
+ raise ValueError("No valid training data after cleaning")
134
+
135
+ print(f"Training with {len(df_clean)} clean samples")
136
+
137
+ # Prepare features
138
+ df_features = self.prepare_features(df_clean)
139
+ df_processed = self.preprocess_data(df_features)
140
+
141
+ # Select features for training
142
+ feature_columns = [
143
+ 'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
144
+ 'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
145
+ 'stanine_high', 'stanine_medium', 'stanine_low'
146
+ ]
147
+
148
+ X = df_processed[feature_columns]
149
+ y = df_processed['course']
150
+
151
+ # Split data
152
+ X_train, X_test, y_train, y_test = train_test_split(
153
+ X, y, test_size=0.2, random_state=42, stratify=y
154
+ )
155
+
156
+ # Scale features
157
+ X_train_scaled = self.scaler.fit_transform(X_train)
158
+ X_test_scaled = self.scaler.transform(X_test)
159
+
160
+ # Train model
161
+ self.model.fit(X_train_scaled, y_train)
162
+
163
+ # Evaluate
164
+ y_pred = self.model.predict(X_test_scaled)
165
+ accuracy = accuracy_score(y_test, y_pred)
166
+ print(f"Model accuracy: {accuracy:.3f}")
167
+
168
+ self.is_trained = True
169
+
170
+ # Save model
171
+ self.save_model()
172
+
173
+ return accuracy
174
+
175
+ def predict_course(self, stanine: int, gwa: float, strand: str, hobbies: str) -> List[Tuple[str, float]]:
176
+ """Predict course recommendations using student feedback data and available courses"""
177
+ if not self.is_trained:
178
+ self.load_model()
179
+ if not self.is_trained:
180
+ raise ValueError("Model not trained. Please train the model first.")
181
+
182
+ # Get available courses from /courses endpoint
183
+ available_courses = self.db_connection.get_available_courses()
184
+ if not available_courses:
185
+ print("No courses found in /courses endpoint. Using courses from student feedback data...")
186
+ # Get courses from student feedback data
187
+ df_temp = self.db_connection.get_student_feedback_counts()
188
+ if df_temp.empty:
189
+ raise ValueError("No courses available in /courses endpoint and no student feedback data found.")
190
+ available_courses = df_temp['course'].unique().tolist()
191
+ print(f"Using courses from student feedback: {available_courses}")
192
+
193
+ # Create input data
194
+ input_data = pd.DataFrame({
195
+ 'stanine': [stanine],
196
+ 'gwa': [gwa],
197
+ 'strand': [strand],
198
+ 'hobbies': [hobbies]
199
+ })
200
+
201
+ # Prepare features
202
+ input_features = self.prepare_features(input_data)
203
+ input_processed = self.preprocess_data(input_features)
204
+
205
+ # Select same features as training
206
+ feature_columns = [
207
+ 'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
208
+ 'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
209
+ 'stanine_high', 'stanine_medium', 'stanine_low'
210
+ ]
211
+
212
+ X = input_processed[feature_columns]
213
+ X_scaled = self.scaler.transform(X)
214
+
215
+ # Get predictions with probabilities
216
+ probabilities = self.model.predict_proba(X_scaled)[0]
217
+ classes = self.model.classes_
218
+
219
+ # Filter recommendations to only include courses available in /courses endpoint
220
+ available_recommendations = []
221
+ for i, course in enumerate(classes):
222
+ if course in available_courses:
223
+ available_recommendations.append((course, probabilities[i]))
224
+
225
+ # Sort by probability and get top 5
226
+ available_recommendations.sort(key=lambda x: x[1], reverse=True)
227
+ recommendations = available_recommendations[:5]
228
+
229
+ return recommendations
230
+
231
+ def save_model(self):
232
+ """Save the trained model and encoders"""
233
+ os.makedirs('models', exist_ok=True)
234
+ joblib.dump(self.model, 'models/course_recommender_model.pkl')
235
+ joblib.dump(self.label_encoders, 'models/label_encoders.pkl')
236
+ joblib.dump(self.scaler, 'models/scaler.pkl')
237
+ print("Model saved successfully")
238
+
239
+ def load_model(self):
240
+ """Load the trained model and encoders"""
241
+ try:
242
+ self.model = joblib.load('models/course_recommender_model.pkl')
243
+ self.label_encoders = joblib.load('models/label_encoders.pkl')
244
+ self.scaler = joblib.load('models/scaler.pkl')
245
+ self.is_trained = True
246
+ print("Model loaded successfully")
247
+ except FileNotFoundError:
248
+ print("No saved model found. Please train the model first.")
249
+ self.is_trained = False
250
+
251
+ def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
252
+ rating: int, hobbies: str) -> bool:
253
+ """Add user feedback to the database"""
254
+ return self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)