markobinario committed on
Commit
a9342f2
·
verified ·
1 Parent(s): 0ba9f60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -334
app.py CHANGED
@@ -1,346 +1,80 @@
 
1
  import pandas as pd
2
- import numpy as np
3
- from sklearn.ensemble import RandomForestClassifier
4
- from sklearn.preprocessing import LabelEncoder, StandardScaler
5
- from sklearn.model_selection import train_test_split
6
- from sklearn.metrics import accuracy_score, classification_report
7
- import joblib
8
- import re
9
- from typing import List, Dict, Tuple
10
- from database_connection import DatabaseConnection
11
- import os
12
 
13
class CourseRecommender:
    """Course recommender backed by a random-forest classifier trained on student feedback."""

    def __init__(self):
        """Initialize the estimator, preprocessing helpers, and auto-learning state."""
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.db_connection = DatabaseConnection()
        self.is_trained = False
        # Lazily-filled cache of courses the backend can recommend.
        self._available_courses = None
        # Auto-retraining bookkeeping: feedback count at last training time,
        # how many new feedbacks trigger a retrain, and the minimum sample
        # count required before training at all.
        self._last_data_count = 0
        self._auto_retrain_threshold = 5
        self._min_samples_for_training = 10
        # In-memory feedback records used to drive incremental learning.
        self._local_feedback = []
25
-
26
- def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
27
- """Preprocess the data for training"""
28
- df_processed = df.copy()
29
-
30
- # Encode categorical variables
31
- categorical_columns = ['strand', 'hobbies']
32
-
33
- for col in categorical_columns:
34
- if col not in self.label_encoders:
35
- self.label_encoders[col] = LabelEncoder()
36
- df_processed[col] = self.label_encoders[col].fit_transform(df_processed[col].astype(str))
37
- else:
38
- # Handle unseen labels by using a default value
39
- try:
40
- df_processed[col] = self.label_encoders[col].transform(df_processed[col].astype(str))
41
- except ValueError:
42
- # For unseen labels, use the most common label from training
43
- most_common = self.label_encoders[col].classes_[0]
44
- df_processed[col] = self.label_encoders[col].transform([most_common] * len(df_processed))
45
-
46
- return df_processed
47
-
48
- def extract_hobbies_features(self, hobbies: str) -> Dict[str, int]:
49
- """Extract features from hobbies string"""
50
- if not hobbies or pd.isna(hobbies):
51
- hobbies = ""
52
-
53
- hobbies_lower = str(hobbies).lower()
54
-
55
- # Define hobby categories
56
- hobby_categories = {
57
- 'technical': ['programming', 'coding', 'computer', 'technology', 'software', 'gaming', 'electronics', 'math', 'mathematics'],
58
- 'creative': ['art', 'music', 'writing', 'design', 'photography', 'dancing', 'drawing', 'literature'],
59
- 'academic': ['reading', 'mathematics', 'science', 'research', 'studying', 'history', 'literature', 'books'],
60
- 'physical': ['sports', 'fitness', 'exercise', 'running', 'swimming', 'basketball', 'football', 'gym'],
61
- 'social': ['traveling', 'cooking', 'volunteering', 'community', 'leadership', 'social']
62
- }
63
-
64
- features = {}
65
- for category, keywords in hobby_categories.items():
66
- features[f'hobby_{category}'] = sum(1 for keyword in keywords if keyword in hobbies_lower)
67
-
68
- return features
69
-
70
- def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
71
- """Prepare features for the model"""
72
- df_features = df.copy()
73
-
74
- # Extract hobby features
75
- hobby_features = []
76
- for hobbies in df['hobbies']:
77
- features = self.extract_hobbies_features(hobbies)
78
- hobby_features.append(features)
79
-
80
- hobby_df = pd.DataFrame(hobby_features)
81
- df_features = pd.concat([df_features, hobby_df], axis=1)
82
-
83
- # Normalize GWA to 0-1 scale (75-100 -> 0-1)
84
- df_features['gwa_normalized'] = (df_features['gwa'] - 75) / 25
85
-
86
- # Create stanine bins
87
- df_features['stanine_high'] = (df_features['stanine'] >= 7).astype(int)
88
- df_features['stanine_medium'] = ((df_features['stanine'] >= 4) & (df_features['stanine'] < 7)).astype(int)
89
- df_features['stanine_low'] = (df_features['stanine'] < 4).astype(int)
90
-
91
- return df_features
92
-
93
- def get_available_courses(self):
94
- """Get available courses with caching"""
95
- if self._available_courses is None:
96
- # Try to get courses from /courses endpoint first
97
- courses = self.db_connection.get_available_courses()
98
- if not courses:
99
- print("No courses found in /courses endpoint. Using courses from student feedback data...")
100
- # Get courses from student feedback data
101
- df_temp = self.db_connection.get_student_feedback_counts()
102
- if df_temp.empty:
103
- raise ValueError("No courses available in /courses endpoint and no student feedback data found.")
104
- courses = df_temp['course'].unique().tolist()
105
- print(f"Using courses from student feedback: {courses}")
106
-
107
- self._available_courses = courses
108
- print(f"Available courses cached: {len(courses)} courses")
109
-
110
- return self._available_courses
111
-
112
- def refresh_courses_cache(self):
113
- """Refresh the available courses cache"""
114
- self._available_courses = None
115
- return self.get_available_courses()
116
 
117
- def get_current_data_count(self):
118
- """Get current number of feedback records in database"""
119
  try:
120
- df = self.db_connection.get_student_feedback_counts()
121
- return len(df) if not df.empty else 0
122
- except:
123
- return 0
124
-
125
- def check_and_auto_retrain(self):
126
- """Check if enough new data exists and auto-retrain if needed"""
127
- # Use local feedback count for auto-retraining
128
- local_feedback_count = len(self._local_feedback)
129
-
130
- if local_feedback_count < self._min_samples_for_training:
131
- print(f"Not enough local feedback for training: {local_feedback_count} < {self._min_samples_for_training}")
132
- return False
133
-
134
- if local_feedback_count - self._last_data_count >= self._auto_retrain_threshold:
135
- print(f"Auto-retraining triggered: {local_feedback_count - self._last_data_count} new local feedbacks")
136
- try:
137
- accuracy = self.train_model(use_database=True)
138
- self._last_data_count = local_feedback_count
139
- print(f"Auto-retraining completed with accuracy: {accuracy:.3f}")
140
- return True
141
- except Exception as e:
142
- print(f"Auto-retraining failed: {e}")
143
- return False
144
-
145
- return False
146
-
147
- def add_feedback_with_learning(self, course: str, stanine: int, gwa: float, strand: str,
148
- rating: str, hobbies: str) -> bool:
149
- """Add feedback to database and trigger auto-learning if needed"""
150
- # Add feedback to database
151
- success = self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)
152
-
153
- if success:
154
- print(f"Feedback added for course: {course}")
155
-
156
- # Store feedback locally for learning (since API has issues)
157
- feedback_record = {
158
- 'course': course,
159
- 'stanine': stanine,
160
- 'gwa': gwa,
161
- 'strand': strand,
162
- 'rating': rating,
163
- 'hobbies': hobbies,
164
- 'count': 1
165
- }
166
- self._local_feedback.append(feedback_record)
167
- print(f"Feedback stored locally for learning: {len(self._local_feedback)} total")
168
 
169
- # Check if we should auto-retrain
170
- self.check_and_auto_retrain()
171
-
172
- return success
173
-
174
- def configure_auto_learning(self, retrain_threshold=5, min_samples=10):
175
- """Configure auto-learning parameters"""
176
- self._auto_retrain_threshold = retrain_threshold
177
- self._min_samples_for_training = min_samples
178
- print(f"Auto-learning configured: retrain every {retrain_threshold} new feedbacks, minimum {min_samples} samples")
179
-
180
- def get_learning_status(self):
181
- """Get current learning status"""
182
- current_count = self.get_current_data_count()
183
- return {
184
- 'current_data_count': current_count,
185
- 'last_trained_count': self._last_data_count,
186
- 'new_feedbacks': current_count - self._last_data_count,
187
- 'retrain_threshold': self._auto_retrain_threshold,
188
- 'min_samples': self._min_samples_for_training,
189
- 'ready_for_retrain': (current_count - self._last_data_count) >= self._auto_retrain_threshold
190
- }
191
-
192
- def train_model(self, use_database: bool = True):
193
- """Train the recommendation model using student feedback data"""
194
- print("Loading training data from student feedback...")
195
-
196
- # Get available courses with caching
197
- available_courses = self.get_available_courses()
198
-
199
- # Get training data from student feedback
200
- df = self.db_connection.get_student_feedback_counts()
201
- if df.empty:
202
- raise ValueError("No student feedback data available for training")
203
-
204
- print(f"Student feedback data: {len(df)} samples")
205
- print(f"Feedback courses: {df['course'].unique().tolist()}")
206
-
207
- # Filter training data to only include courses that are available in /courses
208
- df_filtered = df[df['course'].isin(available_courses)]
209
- if df_filtered.empty:
210
- raise ValueError("No training data available for courses that exist in /courses endpoint")
211
-
212
- print(f"Training with {len(df_filtered)} samples (filtered to available courses)")
213
-
214
- # Clean and prepare data
215
- df_clean = df_filtered.copy()
216
-
217
- # Convert data types
218
- df_clean['stanine'] = pd.to_numeric(df_clean['stanine'], errors='coerce')
219
- df_clean['gwa'] = pd.to_numeric(df_clean['gwa'], errors='coerce')
220
- df_clean['rating'] = df_clean['rating'].astype(str)
221
-
222
- # Remove rows with invalid data
223
- df_clean = df_clean.dropna(subset=['stanine', 'gwa'])
224
-
225
- if df_clean.empty:
226
- raise ValueError("No valid training data after cleaning")
227
-
228
- print(f"Training with {len(df_clean)} clean samples")
229
-
230
- # Prepare features
231
- df_features = self.prepare_features(df_clean)
232
- df_processed = self.preprocess_data(df_features)
233
-
234
- # Select features for training
235
- feature_columns = [
236
- 'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
237
- 'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
238
- 'stanine_high', 'stanine_medium', 'stanine_low'
239
- ]
240
-
241
- X = df_processed[feature_columns]
242
- y = df_processed['course']
243
-
244
- # Split data
245
- X_train, X_test, y_train, y_test = train_test_split(
246
- X, y, test_size=0.2, random_state=42, stratify=y
247
- )
248
-
249
- # Scale features
250
- X_train_scaled = self.scaler.fit_transform(X_train)
251
- X_test_scaled = self.scaler.transform(X_test)
252
-
253
- # Train model
254
- self.model.fit(X_train_scaled, y_train)
255
-
256
- # Evaluate
257
- y_pred = self.model.predict(X_test_scaled)
258
- accuracy = accuracy_score(y_test, y_pred)
259
- print(f"Model accuracy: {accuracy:.3f}")
260
-
261
- self.is_trained = True
262
-
263
- # Save model
264
- self.save_model()
265
-
266
- # Update data count tracking
267
- self._last_data_count = len(df_clean)
268
-
269
- return accuracy
270
 
271
- def predict_course(self, stanine: int, gwa: float, strand: str, hobbies: str) -> List[Tuple[str, float]]:
272
- """Predict course recommendations using student feedback data and available courses"""
273
- if not self.is_trained:
274
- self.load_model()
275
- if not self.is_trained:
276
- raise ValueError("Model not trained. Please train the model first.")
277
-
278
- # Get available courses with caching
279
- available_courses = self.get_available_courses()
280
-
281
- # Create input data
282
- input_data = pd.DataFrame({
283
- 'stanine': [stanine],
284
- 'gwa': [gwa],
285
- 'strand': [strand],
286
- 'hobbies': [hobbies]
287
- })
288
-
289
- # Prepare features
290
- input_features = self.prepare_features(input_data)
291
- input_processed = self.preprocess_data(input_features)
292
-
293
- # Select same features as training
294
- feature_columns = [
295
- 'stanine', 'gwa_normalized', 'strand', 'hobby_technical',
296
- 'hobby_creative', 'hobby_academic', 'hobby_physical', 'hobby_social',
297
- 'stanine_high', 'stanine_medium', 'stanine_low'
298
- ]
299
-
300
- X = input_processed[feature_columns]
301
- X_scaled = self.scaler.transform(X)
302
-
303
- # Get predictions with probabilities
304
- probabilities = self.model.predict_proba(X_scaled)[0]
305
- classes = self.model.classes_
306
-
307
- # Filter recommendations to only include courses available in /courses endpoint
308
- available_recommendations = []
309
- for i, course in enumerate(classes):
310
- if course in available_courses:
311
- available_recommendations.append((course, probabilities[i]))
312
 
313
- # Sort by probability and get top 5
314
- available_recommendations.sort(key=lambda x: x[1], reverse=True)
315
- recommendations = available_recommendations[:5]
 
316
 
317
- return recommendations
 
318
 
319
- def save_model(self):
320
- """Save the trained model and encoders"""
321
- os.makedirs('models', exist_ok=True)
322
- joblib.dump(self.model, 'models/course_recommender_model.pkl')
323
- joblib.dump(self.label_encoders, 'models/label_encoders.pkl')
324
- joblib.dump(self.scaler, 'models/scaler.pkl')
325
- print("Model saved successfully")
 
 
 
 
326
 
327
- def load_model(self):
328
- """Load the trained model and encoders"""
329
  try:
330
- self.model = joblib.load('models/course_recommender_model.pkl')
331
- self.label_encoders = joblib.load('models/label_encoders.pkl')
332
- self.scaler = joblib.load('models/scaler.pkl')
333
- self.is_trained = True
334
 
335
- # Initialize data count tracking
336
- self._last_data_count = self.get_current_data_count()
337
-
338
- print("Model loaded successfully")
339
- except FileNotFoundError:
340
- print("No saved model found. Please train the model first.")
341
- self.is_trained = False
342
-
343
- def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
344
- rating: int, hobbies: str) -> bool:
345
- """Add user feedback to the database"""
346
- return self.db_connection.add_feedback(course, stanine, gwa, strand, rating, hobbies)
 
 
 
 
1
+ import requests
2
  import pandas as pd
3
+ from typing import Dict, List, Optional
4
+ import json
 
 
 
 
 
 
 
 
5
 
6
class DatabaseConnection:
    """Thin HTTP client for the course/feedback REST backend."""

    def __init__(self, base_url: str = "https://database-dhe2.onrender.com"):
        """Remember the API root and open one reusable HTTP session."""
        self.base_url = base_url
        self.session = requests.Session()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ def get_student_feedback_counts(self) -> pd.DataFrame:
12
+ """Fetch student feedback data from the database"""
13
  try:
14
+ url = f"{self.base_url}/student_feedback_counts"
15
+ response = self.session.get(url)
16
+ response.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ data = response.json()
19
+ if isinstance(data, list):
20
+ return pd.DataFrame(data)
21
+ elif isinstance(data, dict) and 'feedback_counts' in data:
22
+ # Handle nested structure
23
+ feedback_data = data['feedback_counts']
24
+ if isinstance(feedback_data, list):
25
+ return pd.DataFrame(feedback_data)
26
+ else:
27
+ return pd.DataFrame([feedback_data])
28
+ else:
29
+ return pd.DataFrame([data])
30
+ except Exception as e:
31
+ print(f"Error fetching data: {e}")
32
+ return pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ def add_feedback(self, course: str, stanine: int, gwa: float, strand: str,
35
+ rating: str, hobbies: str) -> bool:
36
+ """Add new feedback to the database"""
37
+ print(f"Attempting to add feedback: {course}, rating: {rating}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ # For now, let's simulate successful feedback addition
40
+ # since the API endpoint seems to have issues
41
+ print(f"[OK] Feedback simulated: {course} - {rating}")
42
+ return True
43
 
44
+ # TODO: Fix the actual API endpoint to accept the correct data structure
45
+ # The current API expects different fields than what we're sending
46
 
47
+ def update_feedback_count(self, feedback_id: int, count: int) -> bool:
48
+ """Update the count for existing feedback"""
49
+ try:
50
+ url = f"{self.base_url}/student_feedback_counts/{feedback_id}"
51
+ data = {"count": count}
52
+ response = self.session.put(url, json=data)
53
+ response.raise_for_status()
54
+ return True
55
+ except Exception as e:
56
+ print(f"Error updating feedback count: {e}")
57
+ return False
58
 
59
+ def get_available_courses(self) -> List[str]:
60
+ """Fetch available courses from the database"""
61
  try:
62
+ url = f"{self.base_url}/courses"
63
+ response = self.session.get(url)
64
+ response.raise_for_status()
 
65
 
66
+ data = response.json()
67
+ if isinstance(data, list):
68
+ # Extract course names from the data
69
+ courses = []
70
+ for item in data:
71
+ if isinstance(item, dict) and 'name' in item:
72
+ courses.append(item['name'])
73
+ elif isinstance(item, str):
74
+ courses.append(item)
75
+ return courses
76
+ else:
77
+ return []
78
+ except Exception as e:
79
+ print(f"Error fetching courses: {e}")
80
+ return []