chatbot2

Sleeping

App Files Files Community

markobinario committed on Oct 22, 2025

Commit

61b18b3

verified ·

1 Parent(s): 0e1524e

Update course_recommender.py

Browse files

Files changed (1) hide show

course_recommender.py +254 -208

course_recommender.py CHANGED Viewed

@@ -1,208 +1,254 @@
-import pandas as pd
-import numpy as np
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.preprocessing import LabelEncoder, StandardScaler
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, classification_report
-import joblib
-import re
-from typing import List, Dict, Tuple
-from database_connection import DatabaseConnection
-import os
-class CourseRecommender:
-    def __init__(self):
-        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
-        self.label_encoders = {}
-        self.scaler = StandardScaler()
-        self.db_connection = DatabaseConnection()
-        self.is_trained = False
-    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Preprocess the data for training"""
-        df_processed = df.copy()
-        # Encode categorical variables
-        categorical_columns = ['strand', 'hobbies']
-        for col in categorical_columns:
-            if col not in self.label_encoders:
-                self.label_encoders[col] = LabelEncoder()
-                df_processed[col] = self.label_encoders[col].fit_transform(df_processed[col].astype(str))
-            else:
-                # Handle unseen labels by using a default value
-                try:
-                    df_processed[col] = self.label_encoders[col].transform(df_processed[col].astype(str))
-                except ValueError:
-                    # For unseen labels, use the most common label from training
-                    most_common = self.label_encoders[col].classes_[0]
-                    df_processed[col] = self.label_encoders[col].transform([most_common] * len(df_processed))
-        return df_processed
-    def extract_hobbies_features(self, hobbies: str) -> Dict[str, int]:
-        """Extract features from hobbies string"""
-        if not hobbies or pd.isna(hobbies):
-            hobbies = ""
-        hobbies_lower = str(hobbies).lower()
-        # Define hobby categories
-        hobby_categories = {
-            'technical': ['programming', 'coding', 'computer', 'technology', 'software', 'gaming', 'electronics', 'math', 'mathematics'],
-            'creative': ['art', 'music', 'writing', 'design', 'photography', 'dancing', 'drawing', 'literature'],
-            'academic': ['reading', 'mathematics', 'science', 'research', 'studying', 'history', 'literature', 'books'],
-            'physical': ['sports', 'fitness', 'exercise', 'running', 'swimming', 'basketball', 'football', 'gym'],
-            'social': ['traveling', 'cooking', 'volunteering', 'community', 'leadership', 'social']
-        }
-        features = {}
-        for category, keywords in hobby_categories.items():
-            features[f'hobby_{category}'] = sum(1 for keyword in keywords if keyword in hobbies_lower)
-        return features
-    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Prepare features for the model"""
-        df_features = df.copy()
-        # Extract hobby features
-        hobby_features = []
-        for hobbies in df['hobbies']:
-            features = self.extract_hobbies_features(hobbies)
-            hobby_features.append(features)
-        hobby_df = pd.DataFrame(hobby_features)
-        df_features = pd.concat([df_features, hobby_df], axis=1)
-        # Normalize GWA to 0-1 scale (75-100 -> 0-1)
-        df_features['gwa_normalized'] = (df_features['gwa'] - 75) / 25
-        # Create stanine bins
-        df_features['stanine_high'] = (df_features['stanine'] >= 7).astype(int)
-        df_features['stanine_medium'] = ((df_features['stanine'] >= 4) & (df_features['stanine'] < 7)).astype(int)
-        df_features['stanine_low'] = (df_features['stanine'] < 4).astype(int)
-        return df_features
-    def train_model(self, use_database: bool = True):
-        """Train the recommendation model"""
-        print("Loading training data...")
-        if use_database:
-            # Try to get data from database first
-            df = self.db_connection.get_student_feedback_counts()
-            if df.empty:
-                print("No data from database, using basic training data...")
-                from basic_training_data import create_basic_training_data
-                df = create_basic_training_data()
-        else:
-            from basic_training_data import create_basic_training_data
-            df = create_basic_training_data()
-        if df.empty:
-            raise ValueError("No training data available")
-        print(f"Training with {len(df)} samples")
-        # Prepare features
-        df_features = self.prepare_features(df)
-        df_processed = self.preprocess_data(df_features)
-        # Select features for training
-        feature_columns = [
-            'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
-            'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
-            'stanine_high', 'stanine_medium', 'stanine_low'
-        ]
-        X = df_processed[feature_columns]
-        y = df_processed['course']
-        # Split data
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.2, random_state=42, stratify=y
-        )
-        # Scale features
-        X_train_scaled = self.scaler.fit_transform(X_train)
-        X_test_scaled = self.scaler.transform(X_test)
-        # Train model
-        self.model.fit(X_train_scaled, y_train)
-        # Evaluate
-        y_pred = self.model.predict(X_test_scaled)
-        accuracy = accuracy_score(y_test, y_pred)
-        print(f"Model accuracy: {accuracy:.3f}")
-        self.is_trained = True
-        # Save model
-        self.save_model()
-        return accuracy
-    def predict_course(self, stanine: int, gwa: float, strand: str, hobbies: str) -> List[Tuple[str, float]]:
-        """Predict course recommendations"""
-        if not self.is_trained:
-            self.load_model()
-            if not self.is_trained:
-                raise ValueError("Model not trained. Please train the model first.")
-        # Create input data
-        input_data = pd.DataFrame({
-            'stanine': [stanine],
-            'gwa': [gwa],
-            'strand': [strand],
-            'hobbies': [hobbies]
-        })
-        # Prepare features
-        input_features = self.prepare_features(input_data)
-        input_processed = self.preprocess_data(input_features)
-        # Select same features as training
-        feature_columns = [
-            'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
-            'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
-            'stanine_high', 'stanine_medium', 'stanine_low'
-        ]
-        X = input_processed[feature_columns]
-        X_scaled = self.scaler.transform(X)
-        # Get predictions with probabilities
-        probabilities = self.model.predict_proba(X_scaled)[0]
-        classes = self.model.classes_
-        # Get top 5 recommendations
-        top_indices = np.argsort(probabilities)[-5:][::-1]
-        recommendations = [(classes[i], probabilities[i]) for i in top_indices]
-        return recommendations
-    def save_model(self):
-        """Save the trained model and encoders"""
-        os.makedirs('models', exist_ok=True)
-        joblib.dump(self.model, 'models/course_recommender_model.pkl')
-        joblib.dump(self.label_encoders, 'models/label_encoders.pkl')
-        joblib.dump(self.scaler, 'models/scaler.pkl')
-        print("Model saved successfully")
-    def load_model(self):
-        """Load the trained model and encoders"""
-        try:
-            self.model = joblib.load('models/course_recommender_model.pkl')
-            self.label_encoders = joblib.load('models/label_encoders.pkl')
-            self.scaler = joblib.load('models/scaler.pkl')
-            self.is_trained = True
-            print("Model loaded successfully")
-        except FileNotFoundError:
-            print("No saved model found. Please train the model first.")
-            self.is_trained = False
-    def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
-                    rating: int, hobbies: str) -> bool:
-        """Add user feedback to the database"""
-        return self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)

+import pandas as pd
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, classification_report
+import joblib
+import re
+from typing import List, Dict, Tuple
+from database_connection import DatabaseConnection
+import os
+class CourseRecommender:
+    def __init__(self):
+        """Create an untrained recommender with a fresh classifier, scaler and DB handle."""
+        # Nothing is fitted yet; predict_course() attempts load_model() while this is False.
+        self.is_trained = False
+        # One encoder per categorical column, populated by preprocess_data().
+        self.label_encoders = {}
+        self.scaler = StandardScaler()
+        self.model = RandomForestClassifier(random_state=42, n_estimators=100)
+        self.db_connection = DatabaseConnection()
+    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Label-encode the categorical columns of ``df`` and return a copy.
+
+        The first time a column is seen, a new ``LabelEncoder`` is fitted on it
+        and cached in ``self.label_encoders``. On later calls the cached
+        encoder's classes are reused, and any value that was not present when
+        the encoder was fitted is mapped to code 0 (the encoder's first class)
+        instead of raising.
+
+        The fallback is applied per value, so a single unknown value does not
+        overwrite the codes of the known values in the same column.
+
+        Args:
+            df: Frame containing at least the ``strand`` and ``hobbies`` columns.
+
+        Returns:
+            A copy of ``df`` with ``strand`` and ``hobbies`` replaced by integer codes.
+        """
+        df_processed = df.copy()
+        categorical_columns = ['strand', 'hobbies']
+        for col in categorical_columns:
+            values = df_processed[col].astype(str)
+            encoder = self.label_encoders.get(col)
+            if encoder is None:
+                encoder = LabelEncoder()
+                self.label_encoders[col] = encoder
+                df_processed[col] = encoder.fit_transform(values)
+                continue
+            # LabelEncoder stores its classes sorted, so a class's position is its code.
+            codes = {label: code for code, label in enumerate(encoder.classes_)}
+            # Unseen labels fall back to code 0, i.e. classes_[0]. That is the first
+            # class in sort order, not necessarily the most frequent one.
+            df_processed[col] = [codes.get(value, 0) for value in values]
+        return df_processed
+    def extract_hobbies_features(self, hobbies: str) -> Dict[str, int]:
+        """Count, per hobby category, how many of its keywords occur in ``hobbies``.
+
+        Matching is a case-insensitive substring test, so a keyword also counts
+        when it appears inside a longer word (for example ``math`` inside
+        ``mathematics``). Empty, ``None`` or NaN input yields all-zero counts.
+
+        Args:
+            hobbies: Free-text description of the student's hobbies.
+
+        Returns:
+            Mapping ``hobby_<category>`` -> number of that category's keywords found.
+        """
+        keyword_groups = {
+            'technical': ['programming', 'coding', 'computer', 'technology', 'software', 'gaming', 'electronics', 'math', 'mathematics'],
+            'creative': ['art', 'music', 'writing', 'design', 'photography', 'dancing', 'drawing', 'literature'],
+            'academic': ['reading', 'mathematics', 'science', 'research', 'studying', 'history', 'literature', 'books'],
+            'physical': ['sports', 'fitness', 'exercise', 'running', 'swimming', 'basketball', 'football', 'gym'],
+            'social': ['traveling', 'cooking', 'volunteering', 'community', 'leadership', 'social']
+        }
+        # Treat missing input as an empty description so every count comes out as 0.
+        text = "" if (not hobbies or pd.isna(hobbies)) else str(hobbies).lower()
+        return {
+            f'hobby_{group}': sum(word in text for word in words)
+            for group, words in keyword_groups.items()
+        }
+    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return a copy of ``df`` extended with the engineered model features.
+
+        Added columns:
+          * ``hobby_<category>`` keyword counts from ``extract_hobbies_features``
+          * ``gwa_normalized``: ``(gwa - 75) / 25``, which maps 75-100 onto 0-1
+          * ``stanine_high`` / ``stanine_medium`` / ``stanine_low``: 0/1 flags
+            for stanine >= 7, 4-6 and < 4 respectively
+
+        Args:
+            df: Frame with ``hobbies``, ``gwa`` and ``stanine`` columns. Any
+                index is accepted; it is preserved in the result.
+
+        Returns:
+            New frame with the same rows and index as ``df`` plus the columns above.
+        """
+        df_features = df.copy()
+        # Build the hobby columns on the caller's index. concat(axis=1) aligns rows
+        # by index label, so a default RangeIndex here would misalign rows and add
+        # NaN rows whenever ``df`` has been filtered and its index has gaps.
+        hobby_df = pd.DataFrame(
+            [self.extract_hobbies_features(hobbies) for hobbies in df['hobbies']],
+            index=df.index,
+        )
+        df_features = pd.concat([df_features, hobby_df], axis=1)
+        # Normalize GWA to 0-1 scale (75-100 -> 0-1)
+        df_features['gwa_normalized'] = (df_features['gwa'] - 75) / 25
+        # Create stanine bins
+        df_features['stanine_high'] = (df_features['stanine'] >= 7).astype(int)
+        df_features['stanine_medium'] = ((df_features['stanine'] >= 4) & (df_features['stanine'] < 7)).astype(int)
+        df_features['stanine_low'] = (df_features['stanine'] < 4).astype(int)
+        return df_features
+    def train_model(self, use_database: bool = True):
+        """Train the recommendation model on student feedback and save it.
+
+        Steps: resolve the list of offered courses, keep only feedback rows for
+        those courses, coerce ``stanine``/``gwa`` to numbers and drop rows where
+        that fails, build features, fit the scaler and classifier on an 80/20
+        split, print the hold-out accuracy and persist everything through
+        ``save_model()``.
+
+        Args:
+            use_database: Accepted for backward compatibility only; this method
+                does not read it and always trains from ``self.db_connection``.
+
+        Returns:
+            Accuracy of the fitted model on the 20% hold-out split.
+
+        Raises:
+            ValueError: If there is no feedback at all, no feedback for any
+                available course, or no row left after cleaning.
+        """
+        print("Loading training data from student feedback...")
+        available_courses = self.db_connection.get_available_courses()
+        # Fetched once: the feedback is both the training set and, when the course
+        # list is empty, the fallback source of course names.
+        df = self.db_connection.get_student_feedback_counts()
+        if not available_courses:
+            print("No courses found in /courses endpoint. Using courses from student feedback data...")
+            if df.empty:
+                raise ValueError("No courses available in /courses endpoint and no student feedback data found.")
+            available_courses = df['course'].unique().tolist()
+            print(f"Using courses from student feedback: {available_courses}")
+        print(f"Available courses from /courses: {len(available_courses)}")
+        print(f"Available courses: {available_courses}")
+        if df.empty:
+            raise ValueError("No student feedback data available for training")
+        print(f"Student feedback data: {len(df)} samples")
+        print(f"Feedback courses: {df['course'].unique().tolist()}")
+        # Filter training data to only include courses that are available in /courses
+        df_filtered = df[df['course'].isin(available_courses)]
+        if df_filtered.empty:
+            raise ValueError("No training data available for courses that exist in /courses endpoint")
+        print(f"Training with {len(df_filtered)} samples (filtered to available courses)")
+        df_clean = df_filtered.copy()
+        # Non-numeric stanine/gwa values become NaN and are dropped below.
+        df_clean['stanine'] = pd.to_numeric(df_clean['stanine'], errors='coerce')
+        df_clean['gwa'] = pd.to_numeric(df_clean['gwa'], errors='coerce')
+        df_clean['rating'] = df_clean['rating'].astype(str)
+        # Filtering and dropna leave gaps in the index; renumber so that feature
+        # frames built row-by-row line up with this frame.
+        df_clean = df_clean.dropna(subset=['stanine', 'gwa']).reset_index(drop=True)
+        if df_clean.empty:
+            raise ValueError("No valid training data after cleaning")
+        print(f"Training with {len(df_clean)} clean samples")
+        df_features = self.prepare_features(df_clean)
+        # Refit the encoders from scratch: encoders cached from an earlier training
+        # run or from load_model() do not know labels that are new in this data.
+        previous_encoders = self.label_encoders
+        self.label_encoders = {}
+        try:
+            accuracy = self._fit_and_evaluate(df_features)
+        except Exception:
+            # Keep the previously usable encoders if this training attempt fails.
+            self.label_encoders = previous_encoders
+            raise
+        print(f"Model accuracy: {accuracy:.3f}")
+        self.is_trained = True
+        self.save_model()
+        return accuracy
+    def _fit_and_evaluate(self, df_features: pd.DataFrame) -> float:
+        """Encode, split, scale and fit on ``df_features``; return hold-out accuracy."""
+        df_processed = self.preprocess_data(df_features)
+        feature_columns = [
+            'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
+            'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
+            'stanine_high', 'stanine_medium', 'stanine_low'
+        ]
+        X = df_processed[feature_columns]
+        y = df_processed['course']
+        try:
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=0.2, random_state=42, stratify=y
+            )
+        except ValueError:
+            # A stratified split is rejected when some course has too few rows
+            # (e.g. a single feedback entry); fall back to a plain random split.
+            print("Stratified split not possible; falling back to a random split")
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=0.2, random_state=42
+            )
+        # The scaler is fitted on the training part only, then applied to the hold-out.
+        X_train_scaled = self.scaler.fit_transform(X_train)
+        X_test_scaled = self.scaler.transform(X_test)
+        self.model.fit(X_train_scaled, y_train)
+        y_pred = self.model.predict(X_test_scaled)
+        return accuracy_score(y_test, y_pred)
+    def predict_course(self, stanine: int, gwa: float, strand: str, hobbies: str) -> List[Tuple[str, float]]:
+        """Rank the available courses for one student profile.
+
+        Loads the saved model if none is trained yet, scores the profile with
+        the classifier, drops every class that is not in the available-course
+        list and returns the five most probable of the rest.
+
+        Args:
+            stanine: Student's stanine score.
+            gwa: Student's general weighted average.
+            strand: Student's strand.
+            hobbies: Free-text hobbies description.
+
+        Returns:
+            Up to five ``(course, probability)`` pairs, highest probability first.
+
+        Raises:
+            ValueError: If no trained model can be loaded, or if neither the
+                course list nor the feedback data provides any course.
+        """
+        if not self.is_trained:
+            self.load_model()
+        if not self.is_trained:
+            raise ValueError("Model not trained. Please train the model first.")
+        available_courses = self.db_connection.get_available_courses()
+        if not available_courses:
+            print("No courses found in /courses endpoint. Using courses from student feedback data...")
+            # Fall back to whatever courses appear in the feedback data.
+            feedback = self.db_connection.get_student_feedback_counts()
+            if feedback.empty:
+                raise ValueError("No courses available in /courses endpoint and no student feedback data found.")
+            available_courses = feedback['course'].unique().tolist()
+            print(f"Using courses from student feedback: {available_courses}")
+        # Single-row frame shaped like the training data.
+        profile = pd.DataFrame({
+            'stanine': [stanine],
+            'gwa': [gwa],
+            'strand': [strand],
+            'hobbies': [hobbies]
+        })
+        encoded = self.preprocess_data(self.prepare_features(profile))
+        # Must list the same columns, in the same order, as used for training.
+        model_columns = [
+            'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
+            'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
+            'stanine_high', 'stanine_medium', 'stanine_low'
+        ]
+        scaled = self.scaler.transform(encoded[model_columns])
+        scores = self.model.predict_proba(scaled)[0]
+        # Keep only classes that are currently offered, best score first.
+        offered = [
+            (course, score)
+            for course, score in zip(self.model.classes_, scores)
+            if course in available_courses
+        ]
+        ranked = sorted(offered, key=lambda pair: pair[1], reverse=True)
+        return ranked[:5]
+    def save_model(self):
+        """Write the classifier, label encoders and scaler to the ``models`` directory."""
+        os.makedirs('models', exist_ok=True)
+        # Written in this order: model, encoders, scaler.
+        artifacts = (
+            (self.model, 'models/course_recommender_model.pkl'),
+            (self.label_encoders, 'models/label_encoders.pkl'),
+            (self.scaler, 'models/scaler.pkl'),
+        )
+        for artifact, path in artifacts:
+            joblib.dump(artifact, path)
+        print("Model saved successfully")
+    def load_model(self):
+        """Load the classifier, label encoders and scaler saved by ``save_model()``.
+
+        All three files are read before any attribute is replaced, so a
+        missing file never leaves the instance with a mix of old and newly
+        loaded parts. On success ``is_trained`` becomes True; if a file is
+        missing, a message is printed and ``is_trained`` is set to False.
+        """
+        # NOTE(review): joblib files are pickle-based; only load files from a trusted location.
+        try:
+            model = joblib.load('models/course_recommender_model.pkl')
+            label_encoders = joblib.load('models/label_encoders.pkl')
+            scaler = joblib.load('models/scaler.pkl')
+        except FileNotFoundError:
+            print("No saved model found. Please train the model first.")
+            self.is_trained = False
+            return
+        # Commit only after every artifact was read successfully.
+        self.model = model
+        self.label_encoders = label_encoders
+        self.scaler = scaler
+        self.is_trained = True
+        print("Model loaded successfully")
+    def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
+                    rating: int, hobbies: str) -> bool:
+        """Pass one feedback record to the database layer and return its result."""
+        # Argument order is forwarded unchanged to DatabaseConnection.add_feedback.
+        record = (course, stanine, gwa, strand, rating, hobbies)
+        return self.db_connection.add_feedback(*record)