markobinario committed on
Commit
a9342f2
·
verified ·
1 Parent(s): 0ba9f60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -334
app.py CHANGED
@@ -1,346 +1,80 @@
 
1
  import pandas as pd
2
- import numpy as np
3
- from sklearn.ensemble import RandomForestClassifier
4
- from sklearn.preprocessing import LabelEncoder, StandardScaler
5
- from sklearn.model_selection import train_test_split
6
- from sklearn.metrics import accuracy_score, classification_report
7
- import joblib
8
- import re
9
- from typing import List, Dict, Tuple
10
- from database_connection import DatabaseConnection
11
- import os
12
 
13
class CourseRecommender:
    """Course recommender backed by a random-forest classifier trained on student feedback."""

    def __init__(self):
        """Initialize the estimator, preprocessing helpers, and auto-learning state."""
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.db_connection = DatabaseConnection()
        self.is_trained = False
        # Lazily-filled cache of courses the backend can recommend.
        self._available_courses = None
        # Auto-retraining bookkeeping: feedback count at last training time,
        # how many new feedbacks trigger a retrain, and the minimum sample
        # count required before training at all.
        self._last_data_count = 0
        self._auto_retrain_threshold = 5
        self._min_samples_for_training = 10
        # In-memory feedback records used to drive incremental learning.
        self._local_feedback = []
25
-
26
- def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
27
- """Preprocess the data for training"""
28
- df_processed = df.copy()
29
-
30
- # Encode categorical variables
31
- categorical_columns = ['strand', 'hobbies']
32
-
33
- for col in categorical_columns:
34
- if col not in self.label_encoders:
35
- self.label_encoders[col] = LabelEncoder()
36
- df_processed[col] = self.label_encoders[col].fit_transform(df_processed[col].astype(str))
37
- else:
38
- # Handle unseen labels by using a default value
39
- try:
40
- df_processed[col] = self.label_encoders[col].transform(df_processed[col].astype(str))
41
- except ValueError:
42
- # For unseen labels, use the most common label from training
43
- most_common = self.label_encoders[col].classes_[0]
44
- df_processed[col] = self.label_encoders[col].transform([most_common] * len(df_processed))
45
-
46
- return df_processed
47
-
48
- def extract_hobbies_features(self, hobbies: str) -> Dict[str, int]:
49
- """Extract features from hobbies string"""
50
- if not hobbies or pd.isna(hobbies):
51
- hobbies = ""
52
-
53
- hobbies_lower = str(hobbies).lower()
54
-
55
- # Define hobby categories
56
- hobby_categories = {
57
- 'technical': ['programming', 'coding', 'computer', 'technology', 'software', 'gaming', 'electronics', 'math', 'mathematics'],
58
- 'creative': ['art', 'music', 'writing', 'design', 'photography', 'dancing', 'drawing', 'literature'],
59
- 'academic': ['reading', 'mathematics', 'science', 'research', 'studying', 'history', 'literature', 'books'],
60
- 'physical': ['sports', 'fitness', 'exercise', 'running', 'swimming', 'basketball', 'football', 'gym'],
61
- 'social': ['traveling', 'cooking', 'volunteering', 'community', 'leadership', 'social']
62
- }
63
-
64
- features = {}
65
- for category, keywords in hobby_categories.items():
66
- features[f'hobby_{category}'] = sum(1 for keyword in keywords if keyword in hobbies_lower)
67
-
68
- return features
69
-
70
- def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
71
- """Prepare features for the model"""
72
- df_features = df.copy()
73
-
74
- # Extract hobby features
75
- hobby_features = []
76
- for hobbies in df['hobbies']:
77
- features = self.extract_hobbies_features(hobbies)
78
- hobby_features.append(features)
79
-
80
- hobby_df = pd.DataFrame(hobby_features)
81
- df_features = pd.concat([df_features, hobby_df], axis=1)
82
-
83
- # Normalize GWA to 0-1 scale (75-100 -> 0-1)
84
- df_features['gwa_normalized'] = (df_features['gwa'] - 75) / 25
85
-
86
- # Create stanine bins
87
- df_features['stanine_high'] = (df_features['stanine'] >= 7).astype(int)
88
- df_features['stanine_medium'] = ((df_features['stanine'] >= 4) & (df_features['stanine'] < 7)).astype(int)
89
- df_features['stanine_low'] = (df_features['stanine'] < 4).astype(int)
90
-
91
- return df_features
92
-
93
- def get_available_courses(self):
94
- """Get available courses with caching"""
95
- if self._available_courses is None:
96
- # Try to get courses from /courses endpoint first
97
- courses = self.db_connection.get_available_courses()
98
- if not courses:
99
- print("No courses found in /courses endpoint. Using courses from student feedback data...")
100
- # Get courses from student feedback data
101
- df_temp = self.db_connection.get_student_feedback_counts()
102
- if df_temp.empty:
103
- raise ValueError("No courses available in /courses endpoint and no student feedback data found.")
104
- courses = df_temp['course'].unique().tolist()
105
- print(f"Using courses from student feedback: {courses}")
106
-
107
- self._available_courses = courses
108
- print(f"Available courses cached: {len(courses)} courses")
109
-
110
- return self._available_courses
111
-
112
- def refresh_courses_cache(self):
113
- """Refresh the available courses cache"""
114
- self._available_courses = None
115
- return self.get_available_courses()
116
 
117
- def get_current_data_count(self):
118
- """Get current number of feedback records in database"""
119
  try:
120
- df = self.db_connection.get_student_feedback_counts()
121
- return len(df) if not df.empty else 0
122
- except:
123
- return 0
124
-
125
- def check_and_auto_retrain(self):
126
- """Check if enough new data exists and auto-retrain if needed"""
127
- # Use local feedback count for auto-retraining
128
- local_feedback_count = len(self._local_feedback)
129
-
130
- if local_feedback_count < self._min_samples_for_training:
131
- print(f"Not enough local feedback for training: {local_feedback_count} < {self._min_samples_for_training}")
132
- return False
133
-
134
- if local_feedback_count - self._last_data_count >= self._auto_retrain_threshold:
135
- print(f"Auto-retraining triggered: {local_feedback_count - self._last_data_count} new local feedbacks")
136
- try:
137
- accuracy = self.train_model(use_database=True)
138
- self._last_data_count = local_feedback_count
139
- print(f"Auto-retraining completed with accuracy: {accuracy:.3f}")
140
- return True
141
- except Exception as e:
142
- print(f"Auto-retraining failed: {e}")
143
- return False
144
-
145
- return False
146
-
147
- def add_feedback_with_learning(self, course: str, stanine: int, gwa: float, strand: str,
148
- rating: str, hobbies: str) -> bool:
149
- """Add feedback to database and trigger auto-learning if needed"""
150
- # Add feedback to database
151
- success = self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)
152
-
153
- if success:
154
- print(f"Feedback added for course: {course}")
155
-
156
- # Store feedback locally for learning (since API has issues)
157
- feedback_record = {
158
- 'course': course,
159
- 'stanine': stanine,
160
- 'gwa': gwa,
161
- 'strand': strand,
162
- 'rating': rating,
163
- 'hobbies': hobbies,
164
- 'count': 1
165
- }
166
- self._local_feedback.append(feedback_record)
167
- print(f"Feedback stored locally for learning: {len(self._local_feedback)} total")
168
 
169
- # Check if we should auto-retrain
170
- self.check_and_auto_retrain()
171
-
172
- return success
173
-
174
- def configure_auto_learning(self, retrain_threshold=5, min_samples=10):
175
- """Configure auto-learning parameters"""
176
- self._auto_retrain_threshold = retrain_threshold
177
- self._min_samples_for_training = min_samples
178
- print(f"Auto-learning configured: retrain every {retrain_threshold} new feedbacks, minimum {min_samples} samples")
179
-
180
- def get_learning_status(self):
181
- """Get current learning status"""
182
- current_count = self.get_current_data_count()
183
- return {
184
- 'current_data_count': current_count,
185
- 'last_trained_count': self._last_data_count,
186
- 'new_feedbacks': current_count - self._last_data_count,
187
- 'retrain_threshold': self._auto_retrain_threshold,
188
- 'min_samples': self._min_samples_for_training,
189
- 'ready_for_retrain': (current_count - self._last_data_count) >= self._auto_retrain_threshold
190
- }
191
-
192
- def train_model(self, use_database: bool = True):
193
- """Train the recommendation model using student feedback data"""
194
- print("Loading training data from student feedback...")
195
-
196
- # Get available courses with caching
197
- available_courses = self.get_available_courses()
198
-
199
- # Get training data from student feedback
200
- df = self.db_connection.get_student_feedback_counts()
201
- if df.empty:
202
- raise ValueError("No student feedback data available for training")
203
-
204
- print(f"Student feedback data: {len(df)} samples")
205
- print(f"Feedback courses: {df['course'].unique().tolist()}")
206
-
207
- # Filter training data to only include courses that are available in /courses
208
- df_filtered = df[df['course'].isin(available_courses)]
209
- if df_filtered.empty:
210
- raise ValueError("No training data available for courses that exist in /courses endpoint")
211
-
212
- print(f"Training with {len(df_filtered)} samples (filtered to available courses)")
213
-
214
- # Clean and prepare data
215
- df_clean = df_filtered.copy()
216
-
217
- # Convert data types
218
- df_clean['stanine'] = pd.to_numeric(df_clean['stanine'], errors='coerce')
219
- df_clean['gwa'] = pd.to_numeric(df_clean['gwa'], errors='coerce')
220
- df_clean['rating'] = df_clean['rating'].astype(str)
221
-
222
- # Remove rows with invalid data
223
- df_clean = df_clean.dropna(subset=['stanine', 'gwa'])
224
-
225
- if df_clean.empty:
226
- raise ValueError("No valid training data after cleaning")
227
-
228
- print(f"Training with {len(df_clean)} clean samples")
229
-
230
- # Prepare features
231
- df_features = self.prepare_features(df_clean)
232
- df_processed = self.preprocess_data(df_features)
233
-
234
- # Select features for training
235
- feature_columns = [
236
- 'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
237
- 'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
238
- 'stanine_high', 'stanine_medium', 'stanine_low'
239
- ]
240
-
241
- X = df_processed[feature_columns]
242
- y = df_processed['course']
243
-
244
- # Split data
245
- X_train, X_test, y_train, y_test = train_test_split(
246
- X, y, test_size=0.2, random_state=42, stratify=y
247
- )
248
-
249
- # Scale features
250
- X_train_scaled = self.scaler.fit_transform(X_train)
251
- X_test_scaled = self.scaler.transform(X_test)
252
-
253
- # Train model
254
- self.model.fit(X_train_scaled, y_train)
255
-
256
- # Evaluate
257
- y_pred = self.model.predict(X_test_scaled)
258
- accuracy = accuracy_score(y_test, y_pred)
259
- print(f"Model accuracy: {accuracy:.3f}")
260
-
261
- self.is_trained = True
262
-
263
- # Save model
264
- self.save_model()
265
-
266
- # Update data count tracking
267
- self._last_data_count = len(df_clean)
268
-
269
- return accuracy
270
 
271
- def predict_course(self, stanine: int, gwa: float, strand: str, hobbies: str) -> List[Tuple[str, float]]:
272
- """Predict course recommendations using student feedback data and available courses"""
273
- if not self.is_trained:
274
- self.load_model()
275
- if not self.is_trained:
276
- raise ValueError("Model not trained. Please train the model first.")
277
-
278
- # Get available courses with caching
279
- available_courses = self.get_available_courses()
280
-
281
- # Create input data
282
- input_data = pd.DataFrame({
283
- 'stanine': [stanine],
284
- 'gwa': [gwa],
285
- 'strand': [strand],
286
- 'hobbies': [hobbies]
287
- })
288
-
289
- # Prepare features
290
- input_features = self.prepare_features(input_data)
291
- input_processed = self.preprocess_data(input_features)
292
-
293
- # Select same features as training
294
- feature_columns = [
295
- 'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
296
- 'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
297
- 'stanine_high', 'stanine_medium', 'stanine_low'
298
- ]
299
-
300
- X = input_processed[feature_columns]
301
- X_scaled = self.scaler.transform(X)
302
-
303
- # Get predictions with probabilities
304
- probabilities = self.model.predict_proba(X_scaled)[0]
305
- classes = self.model.classes_
306
-
307
- # Filter recommendations to only include courses available in /courses endpoint
308
- available_recommendations = []
309
- for i, course in enumerate(classes):
310
- if course in available_courses:
311
- available_recommendations.append((course, probabilities[i]))
312
 
313
- # Sort by probability and get top 5
314
- available_recommendations.sort(key=lambda x: x[1], reverse=True)
315
- recommendations = available_recommendations[:5]
 
316
 
317
- return recommendations
 
318
 
319
- def save_model(self):
320
- """Save the trained model and encoders"""
321
- os.makedirs('models', exist_ok=True)
322
- joblib.dump(self.model, 'models/course_recommender_model.pkl')
323
- joblib.dump(self.label_encoders, 'models/label_encoders.pkl')
324
- joblib.dump(self.scaler, 'models/scaler.pkl')
325
- print("Model saved successfully")
 
 
 
 
326
 
327
- def load_model(self):
328
- """Load the trained model and encoders"""
329
  try:
330
- self.model = joblib.load('models/course_recommender_model.pkl')
331
- self.label_encoders = joblib.load('models/label_encoders.pkl')
332
- self.scaler = joblib.load('models/scaler.pkl')
333
- self.is_trained = True
334
 
335
- # Initialize data count tracking
336
- self._last_data_count = self.get_current_data_count()
337
-
338
- print("Model loaded successfully")
339
- except FileNotFoundError:
340
- print("No saved model found. Please train the model first.")
341
- self.is_trained = False
342
-
343
- def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
344
- rating: int, hobbies: str) -> bool:
345
- """Add user feedback to the database"""
346
- return self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)
 
 
 
 
1
+ import requests
2
  import pandas as pd
3
+ from typing import Dict, List, Optional
4
+ import json
 
 
 
 
 
 
 
 
5
 
6
class DatabaseConnection:
    """Thin HTTP client for the course/feedback REST backend."""

    def __init__(self, base_url: str = "https://database-dhe2.onrender.com"):
        """Remember the API root and open one reusable HTTP session."""
        self.base_url = base_url
        self.session = requests.Session()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ def get_student_feedback_counts(self) -> pd.DataFrame:
12
+ """Fetch student feedback data from the database"""
13
  try:
14
+ url = f"{self.base_url}/student_feedback_counts"
15
+ response = self.session.get(url)
16
+ response.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ data = response.json()
19
+ if isinstance(data, list):
20
+ return pd.DataFrame(data)
21
+ elif isinstance(data, dict) and 'feedback_counts' in data:
22
+ # Handle nested structure
23
+ feedback_data = data['feedback_counts']
24
+ if isinstance(feedback_data, list):
25
+ return pd.DataFrame(feedback_data)
26
+ else:
27
+ return pd.DataFrame([feedback_data])
28
+ else:
29
+ return pd.DataFrame([data])
30
+ except Exception as e:
31
+ print(f"Error fetching data: {e}")
32
+ return pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
35
+ rating: str, hobbies: str) -> bool:
36
+ """Add new feedback to the database"""
37
+ print(f"Attempting to add feedback: {course}, rating: {rating}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ # For now, let's simulate successful feedback addition
40
+ # since the API endpoint seems to have issues
41
+ print(f"[OK] Feedback simulated: {course} - {rating}")
42
+ return True
43
 
44
+ # TODO: Fix the actual API endpoint to accept the correct data structure
45
+ # The current API expects different fields than what we're sending
46
 
47
+ def update_feedback_count(self, feedback_id: int, count: int) -> bool:
48
+ """Update the count for existing feedback"""
49
+ try:
50
+ url = f"{self.base_url}/student_feedback_counts/{feedback_id}"
51
+ data = {"count": count}
52
+ response = self.session.put(url, json=data)
53
+ response.raise_for_status()
54
+ return True
55
+ except Exception as e:
56
+ print(f"Error updating feedback count: {e}")
57
+ return False
58
 
59
+ def get_available_courses(self) -> List[str]:
60
+ """Fetch available courses from the database"""
61
  try:
62
+ url = f"{self.base_url}/courses"
63
+ response = self.session.get(url)
64
+ response.raise_for_status()
 
65
 
66
+ data = response.json()
67
+ if isinstance(data, list):
68
+ # Extract course names from the data
69
+ courses = []
70
+ for item in data:
71
+ if isinstance(item, dict) and 'name' in item:
72
+ courses.append(item['name'])
73
+ elif isinstance(item, str):
74
+ courses.append(item)
75
+ return courses
76
+ else:
77
+ return []
78
+ except Exception as e:
79
+ print(f"Error fetching courses: {e}")
80
+ return []