markobinario committed on
Commit
c6637a8
·
verified ·
1 Parent(s): 196d9d1

Upload 3 files

Browse files
basic_training_data.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
def create_basic_training_data(n_samples: int = 1000, seed: int = 42) -> pd.DataFrame:
    """Generate a synthetic training dataset for the course recommender.

    Each row is a fake student record whose `rating` is loosely correlated
    with academic strength (stanine + GWA), so a model trained on it learns
    a plausible signal.

    Args:
        n_samples: Number of synthetic records to generate (was hard-coded
            to 1000; the default preserves the original output).
        seed: Seed for NumPy's global RNG, so results are reproducible
            (was hard-coded to 42).

    Returns:
        DataFrame with columns: course, stanine, gwa, strand, rating,
        hobbies (comma-separated string), count.
    """
    # Pool of recommendable college courses.
    courses = [
        "Computer Science", "Information Technology", "Data Science",
        "Software Engineering", "Cybersecurity", "Computer Engineering",
        "Business Administration", "Marketing", "Finance", "Accounting",
        "Psychology", "Education", "Literature", "History", "Philosophy",
        "Nursing", "Medicine", "Engineering", "Architecture", "Design"
    ]

    # Philippine senior-high-school strands.
    strands = ["STEM", "ABM", "HUMSS", "GAS", "TVL"]

    # Hobby vocabulary sampled into each record.
    hobbies_list = [
        "Programming", "Reading", "Sports", "Music", "Art", "Gaming",
        "Photography", "Writing", "Dancing", "Cooking", "Traveling",
        "Mathematics", "Science", "History", "Literature", "Technology"
    ]

    np.random.seed(seed)  # reproducible output for a given seed

    data = []
    for _ in range(n_samples):
        stanine = np.random.randint(1, 10)        # stanine scores run 1-9
        gwa = np.random.uniform(75, 100)          # GWA between 75-100
        strand = np.random.choice(strands)
        course = np.random.choice(courses)
        # 1-3 distinct hobbies per student, joined into one display string.
        hobbies = np.random.choice(hobbies_list, size=np.random.randint(1, 4), replace=False)
        hobbies_str = ", ".join(hobbies)

        # Rating skews higher for academically stronger students.
        if stanine >= 7 and gwa >= 85:
            rating = np.random.choice([4, 5], p=[0.3, 0.7])
        elif stanine >= 5 and gwa >= 80:
            rating = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3])
        else:
            rating = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2])

        count = np.random.randint(1, 10)  # synthetic feedback tally (1-9)

        data.append({
            'course': course,
            'stanine': stanine,
            'gwa': gwa,
            'strand': strand,
            'rating': rating,
            'hobbies': hobbies_str,
            'count': count
        })

    return pd.DataFrame(data)
61
+
62
def save_basic_data(path: str = 'basic_training_data.csv'):
    """Generate the basic training dataset and persist it as CSV.

    Args:
        path: Destination CSV file path. Defaults to the original
            hard-coded filename, so existing callers are unaffected.

    Returns:
        The generated DataFrame, for inspection or further use.
    """
    df = create_basic_training_data()
    df.to_csv(path, index=False)
    print(f"Basic training data saved with {len(df)} samples")
    return df
68
+
69
+ if __name__ == "__main__":
70
+ save_basic_data()
course_recommender.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.metrics import accuracy_score, classification_report
7
+ import joblib
8
+ import re
9
+ from typing import List, Dict, Tuple
10
+ from database_connection import DatabaseConnection
11
+ import os
12
+
13
class CourseRecommender:
    """RandomForest-based college-course recommender.

    Predicts a course from a student's stanine score, GWA, senior-high
    strand, and free-text hobbies. Training data comes from the remote
    feedback database when available, otherwise from the synthetic
    `basic_training_data` generator. Fitted artifacts (model, label
    encoders, scaler) are persisted under `models/` via joblib.
    """

    def __init__(self):
        # 100-tree forest with a fixed random_state for reproducible runs.
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        # Per-column LabelEncoders; populated on the first preprocess_data() call.
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.db_connection = DatabaseConnection()
        # Set True after train_model() or a successful load_model().
        self.is_trained = False

    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Label-encode the categorical columns ('strand', 'hobbies').

        The first call per column fits a fresh encoder (training path);
        later calls reuse the fitted encoder (prediction path). Returns a
        copy; the input frame is not modified.
        """
        df_processed = df.copy()

        categorical_columns = ['strand', 'hobbies']

        for col in categorical_columns:
            if col not in self.label_encoders:
                # First sighting of this column: fit an encoder on its values.
                self.label_encoders[col] = LabelEncoder()
                df_processed[col] = self.label_encoders[col].fit_transform(df_processed[col].astype(str))
            else:
                try:
                    df_processed[col] = self.label_encoders[col].transform(df_processed[col].astype(str))
                except ValueError:
                    # Unseen label at prediction time: substitute classes_[0]
                    # for EVERY row. NOTE(review): classes_ is sorted, so this
                    # is the alphabetically-first class, not the most common
                    # one, and it overwrites all rows rather than only the
                    # offending ones — acceptable for the single-row predict
                    # path, but worth revisiting for batch input.
                    most_common = self.label_encoders[col].classes_[0]
                    df_processed[col] = self.label_encoders[col].transform([most_common] * len(df_processed))

        return df_processed

    def extract_hobbies_features(self, hobbies: str) -> Dict[str, int]:
        """Turn a free-text hobbies string into per-category keyword counts.

        Returns a dict with one 'hobby_<category>' key per category below;
        each value counts how many of that category's keywords occur as
        substrings of the lowercased input. None/NaN input yields all zeros.
        """
        if not hobbies or pd.isna(hobbies):
            hobbies = ""

        hobbies_lower = str(hobbies).lower()

        # Keyword buckets. Matching is plain substring search, so e.g.
        # "mathematics" in the input matches both 'math' and 'mathematics'
        # in 'technical', contributing 2 to that count.
        hobby_categories = {
            'technical': ['programming', 'coding', 'computer', 'technology', 'software', 'gaming', 'electronics', 'math', 'mathematics'],
            'creative': ['art', 'music', 'writing', 'design', 'photography', 'dancing', 'drawing', 'literature'],
            'academic': ['reading', 'mathematics', 'science', 'research', 'studying', 'history', 'literature', 'books'],
            'physical': ['sports', 'fitness', 'exercise', 'running', 'swimming', 'basketball', 'football', 'gym'],
            'social': ['traveling', 'cooking', 'volunteering', 'community', 'leadership', 'social']
        }

        features = {}
        for category, keywords in hobby_categories.items():
            features[f'hobby_{category}'] = sum(1 for keyword in keywords if keyword in hobbies_lower)

        return features

    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Derive model features: hobby counts, normalized GWA, stanine bins.

        Returns a copy of `df` with the added columns; the input columns
        ('stanine', 'gwa', 'strand', 'hobbies') are required.
        """
        df_features = df.copy()

        # One hobby_<category> count column per row.
        hobby_features = []
        for hobbies in df['hobbies']:
            features = self.extract_hobbies_features(hobbies)
            hobby_features.append(features)

        hobby_df = pd.DataFrame(hobby_features)
        # NOTE(review): pd.concat aligns on index — assumes df carries a
        # default RangeIndex (true for both current callers); verify if a
        # filtered/re-indexed frame is ever passed in.
        df_features = pd.concat([df_features, hobby_df], axis=1)

        # Map GWA from its 75-100 range onto 0-1.
        df_features['gwa_normalized'] = (df_features['gwa'] - 75) / 25

        # Indicator columns for stanine bands: high >= 7, medium 4-6, low < 4.
        df_features['stanine_high'] = (df_features['stanine'] >= 7).astype(int)
        df_features['stanine_medium'] = ((df_features['stanine'] >= 4) & (df_features['stanine'] < 7)).astype(int)
        df_features['stanine_low'] = (df_features['stanine'] < 4).astype(int)

        return df_features

    def train_model(self, use_database: bool = True):
        """Train the recommendation model and persist it to disk.

        Args:
            use_database: When True, try the remote feedback database first
                and fall back to synthetic data if it returns nothing;
                when False, train on synthetic data directly.

        Returns:
            Hold-out accuracy (float) on a stratified 20% test split.

        Raises:
            ValueError: If no training data is available at all.
        """
        print("Loading training data...")

        if use_database:
            # Prefer real feedback rows; fall back to synthetic data.
            df = self.db_connection.get_student_feedback_counts()
            if df.empty:
                print("No data from database, using basic training data...")
                from basic_training_data import create_basic_training_data
                df = create_basic_training_data()
        else:
            from basic_training_data import create_basic_training_data
            df = create_basic_training_data()

        if df.empty:
            raise ValueError("No training data available")

        print(f"Training with {len(df)} samples")

        # Feature engineering, then label-encoding of categoricals.
        df_features = self.prepare_features(df)
        df_processed = self.preprocess_data(df_features)

        # Must match the feature list used in predict_course().
        feature_columns = [
            'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
            'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
            'stanine_high', 'stanine_medium', 'stanine_low'
        ]

        X = df_processed[feature_columns]
        y = df_processed['course']

        # Stratify so every course keeps its proportion in the test split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Fit the scaler on training data only; reuse it for the test split.
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        self.model.fit(X_train_scaled, y_train)

        # Report hold-out accuracy.
        y_pred = self.model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Model accuracy: {accuracy:.3f}")

        self.is_trained = True

        # Persist model + encoders + scaler under models/.
        self.save_model()

        return accuracy

    def predict_course(self, stanine: int, gwa: float, strand: str, hobbies: str) -> List[Tuple[str, float]]:
        """Return the top-5 course recommendations for one student.

        Attempts to load a saved model if none is trained in memory.

        Returns:
            List of (course_name, probability) pairs, highest probability
            first.

        Raises:
            ValueError: If no trained model exists in memory or on disk.
        """
        if not self.is_trained:
            self.load_model()
            if not self.is_trained:
                raise ValueError("Model not trained. Please train the model first.")

        # Single-row frame mirroring the training data's raw columns.
        input_data = pd.DataFrame({
            'stanine': [stanine],
            'gwa': [gwa],
            'strand': [strand],
            'hobbies': [hobbies]
        })

        # Same feature pipeline as training.
        input_features = self.prepare_features(input_data)
        input_processed = self.preprocess_data(input_features)

        # Must match the feature list used in train_model().
        feature_columns = [
            'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
            'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
            'stanine_high', 'stanine_medium', 'stanine_low'
        ]

        X = input_processed[feature_columns]
        X_scaled = self.scaler.transform(X)

        # Class probabilities for the single input row.
        probabilities = self.model.predict_proba(X_scaled)[0]
        classes = self.model.classes_

        # Indices of the 5 largest probabilities, descending.
        top_indices = np.argsort(probabilities)[-5:][::-1]
        recommendations = [(classes[i], probabilities[i]) for i in top_indices]

        return recommendations

    def save_model(self):
        """Persist the model, label encoders, and scaler under models/."""
        os.makedirs('models', exist_ok=True)
        joblib.dump(self.model, 'models/course_recommender_model.pkl')
        joblib.dump(self.label_encoders, 'models/label_encoders.pkl')
        joblib.dump(self.scaler, 'models/scaler.pkl')
        print("Model saved successfully")

    def load_model(self):
        """Load previously saved artifacts; set is_trained accordingly.

        Only FileNotFoundError is handled (no saved model yet); any other
        unpickling/IO error propagates to the caller.
        """
        try:
            self.model = joblib.load('models/course_recommender_model.pkl')
            self.label_encoders = joblib.load('models/label_encoders.pkl')
            self.scaler = joblib.load('models/scaler.pkl')
            self.is_trained = True
            print("Model loaded successfully")
        except FileNotFoundError:
            print("No saved model found. Please train the model first.")
            self.is_trained = False

    def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
                     rating: int, hobbies: str) -> bool:
        """Forward one user-feedback record to the remote database.

        Returns True on success, False on any request failure (the
        DatabaseConnection layer swallows and logs errors).
        """
        return self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)
database_connection.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import pandas as pd
3
+ from typing import Dict, List, Optional
4
+ import json
5
+
6
class DatabaseConnection:
    """Thin HTTP client for the remote student-feedback REST service.

    All methods are best-effort: any request failure is printed and
    reported via an empty DataFrame / False rather than raised.
    """

    def __init__(self, base_url: str = "https://database-dhe2.onrender.com"):
        # Reuse one session so HTTP connections are pooled across calls.
        self.base_url = base_url
        self.session = requests.Session()

    def get_student_feedback_counts(self) -> pd.DataFrame:
        """Fetch all student feedback rows as a DataFrame.

        Returns an empty DataFrame when the request or parsing fails.
        """
        try:
            response = self.session.get(f"{self.base_url}/student_feedback_counts")
            response.raise_for_status()
            payload = response.json()
            # A single JSON object becomes a one-row frame.
            records = payload if isinstance(payload, list) else [payload]
            return pd.DataFrame(records)
        except Exception as e:
            print(f"Error fetching data: {e}")
            return pd.DataFrame()

    def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
                     rating: int, hobbies: str) -> bool:
        """POST one feedback record; return True on success."""
        try:
            payload = {
                "course": course,
                "stanine": stanine,
                "gwa": gwa,
                "strand": strand,
                "rating": rating,
                "hobbies": hobbies,
            }
            response = self.session.post(f"{self.base_url}/student_feedback_counts", json=payload)
            response.raise_for_status()
            return True
        except Exception as e:
            print(f"Error adding feedback: {e}")
            return False

    def update_feedback_count(self, feedback_id: int, count: int) -> bool:
        """PUT a new count for an existing feedback row; True on success."""
        try:
            response = self.session.put(
                f"{self.base_url}/student_feedback_counts/{feedback_id}",
                json={"count": count},
            )
            response.raise_for_status()
            return True
        except Exception as e:
            print(f"Error updating feedback count: {e}")
            return False