Spaces:

LvMAC
/

course-recommendation-system

Sleeping

App Files Files Community

LvMAC committed on Jul 25, 2025

Commit

5a630d5

verified ·

1 Parent(s): a166a35

Delete main_model.py

Browse files

Files changed (1) hide show

main_model.py +0 -540

main_model.py DELETED Viewed

@@ -1,540 +0,0 @@
-"""
-Simplified Course Recommendation System for Hugging Face Spaces
-Optimized for deployment with reduced dependencies and faster loading
-"""
-import pandas as pd
-import numpy as np
-import re
-import json
-import warnings
-from sentence_transformers import SentenceTransformer
-import faiss
-import requests
-from datetime import datetime
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import nltk
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-from nltk.stem import WordNetLemmatizer
-warnings.filterwarnings('ignore')
-# Download required NLTK data. Best effort: each resource is attempted
-# independently so one failure neither skips the rest nor blocks startup.
-for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
-    try:
-        nltk.download(_nltk_resource, quiet=True)
-    except Exception as e:
-        # Non-fatal on purpose, but report it instead of hiding it.
-        print(f"⚠️ Could not download NLTK resource '{_nltk_resource}': {e}")
-class ProductionCourseRecommendationSystem:
-    def __init__(self, device='cpu'):
-        """Initialize the simplified system for HF Spaces.
-        Args:
-            device: Device string passed to SentenceTransformer (default 'cpu').
-        The Mistral API key is read from the MISTRAL_API_KEY environment
-        variable; ``self.mistral_api_key`` is None when it is unset.
-        """
-        import os  # local import so this block does not depend on the file header
-        self.device = device
-        print(f"🚀 Initializing Course Recommendation System on {device}")
-        # Initialize embedding model (lighter version for HF Spaces)
-        try:
-            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=self.device)
-            print("✅ Embedding model loaded successfully")
-        except Exception as e:
-            # Keep the object usable; methods check for a None model.
-            print(f"⚠️ Error loading embedding model: {e}")
-            self.embedding_model = None
-        # Initialize NLP components
-        self.lemmatizer = WordNetLemmatizer()
-        self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
-        # Data components
-        self.course_data = None
-        self.course_embeddings = None
-        self.faiss_index = None
-        self.student_profile = {}
-        # Mistral API configuration
-        # SECURITY: credentials must never be committed to source. The key that
-        # was previously embedded here is exposed in history and should be revoked.
-        self.mistral_api_key = os.environ.get('MISTRAL_API_KEY')
-        if not self.mistral_api_key:
-            print("⚠️ MISTRAL_API_KEY is not set; Mistral API calls will not authenticate")
-        print("✅ System initialized successfully!")
-    def _clean_text(self, text):
-        """Return text lower-cased with whitespace collapsed; missing values give ''."""
-        if pd.isna(text):
-            return ""
-        # Newline runs first, then any whitespace run, each become one space.
-        flattened = re.sub(r'\n+', ' ', str(text))
-        collapsed = re.sub(r'\s+', ' ', flattened)
-        return collapsed.strip().lower()
-    def _tokenize_text(self, text):
-        """Tokenize text, drop stopwords and noise tokens, and de-duplicate.
-        Args:
-            text: Input string; a falsy value yields an empty list.
-        Returns:
-            Lower-cased alphabetic tokens longer than two characters that are
-            not in ``self.stop_words``, unique, in first-seen order.
-        """
-        if not text:
-            return []
-        lowered = text.lower()
-        try:
-            raw_tokens = word_tokenize(lowered)
-        except Exception:
-            # Tokenizer unavailable (e.g. its data is not installed): split on
-            # punctuation/whitespace instead.
-            raw_tokens = re.sub(r'[^\w\s]', ' ', lowered).split()
-        # Same filter for both paths so results do not depend on which tokenizer ran.
-        kept = [
-            token for token in raw_tokens
-            if token.isalpha() and len(token) > 2 and token not in self.stop_words
-        ]
-        return list(dict.fromkeys(kept))  # Remove duplicates, keep order
-    def _create_enhanced_embeddings_and_faiss_index(self):
-        """Embed every course row and load the vectors into a FAISS inner-product index."""
-        missing_prerequisite = self.embedding_model is None or self.course_data is None
-        if missing_prerequisite:
-            print("⚠️ Cannot create embeddings: missing model or data")
-            return
-        print("🎯 Creating course embeddings...")
-        # One descriptive text per course row, in row order.
-        combined_texts = [
-            f"Course: {row['Course Name']}. Description: {row['Description']}. Type: {row['Type']}. Skills: {row['Skill Required']}. Field: {row['Field Interest']}. Career: {row.get('Career Paths', '')}. Industry: {row.get('Industry Sectors', '')}."
-            for _, row in self.course_data.iterrows()
-        ]
-        try:
-            vectors = self.embedding_model.encode(
-                combined_texts,
-                batch_size=16,
-                show_progress_bar=True,
-                convert_to_numpy=True,
-                normalize_embeddings=True
-            )
-            self.course_embeddings = vectors
-            # Vectors are requested normalized, so inner product acts as cosine similarity.
-            dimension = vectors.shape[1]
-            self.faiss_index = faiss.IndexFlatIP(dimension)
-            self.faiss_index.add(vectors.astype('float32'))
-            print(f"✅ FAISS index created with {self.faiss_index.ntotal} courses")
-            print(f"📏 Embedding dimension: {dimension}")
-        except Exception as e:
-            print(f"❌ Error creating embeddings: {e}")
-    def create_enhanced_student_profile(self):
-        """Turn the survey answers into one profile text and embed it.
-        Returns:
-            (embedding, [profile_text]) on success, or (None, []) when the
-            profile or model is missing or encoding fails.
-        """
-        if self.embedding_model is None or not self.student_profile:
-            print("⚠️ Cannot create profile: missing data or model")
-            return None, []
-        # (label, survey key) pairs in the order they appear in the text.
-        sections = (
-            ('Study commitment', 'Q1'),
-            ('Previous experience', 'Q2'),
-            ('Project interests', 'Q3'),
-            ('Problem solving', 'Q4'),
-            ('Career goals', 'Q5'),
-            ('Strengths', 'Q6'),
-            ('Areas for improvement', 'Q7'),
-            ('Research interests', 'Q8'),
-            ('Learning preferences', 'Q9'),
-            ('Stress management', 'Q10'),
-        )
-        answers = self.student_profile
-        sentences = [f"{label}: {answers.get(key, '')}" for label, key in sections]
-        profile_text = ". ".join(sentences) + "."
-        try:
-            encoded = self.embedding_model.encode([profile_text], normalize_embeddings=True)
-            return encoded[0], [profile_text]
-        except Exception as e:
-            print(f"❌ Error creating profile embedding: {e}")
-            return None, []
-    def advanced_similarity_search(self, student_embedding, k=None):
-        """Perform similarity search using FAISS.
-        Args:
-            student_embedding: Query vector; reshaped to a single row.
-            k: Number of neighbours wanted. Defaults to min(len(course_data), 10)
-                and is never allowed to exceed the number of indexed vectors.
-        Returns:
-            (similarity_scores, indices): scores scaled to 0-100 and the matching
-            row positions; ([], []) when the search cannot be performed.
-        """
-        if self.faiss_index is None or student_embedding is None:
-            print("⚠️ Cannot perform search: missing index or embedding")
-            return [], []
-        if k is None:
-            k = min(len(self.course_data), 10)
-        try:
-            # FAISS pads the result with index -1 when asked for more neighbours
-            # than it holds; those are not real courses, so cap k up front.
-            k = min(int(k), self.faiss_index.ntotal)
-            if k <= 0:
-                return [], []
-            # Perform FAISS search
-            similarities, indices = self.faiss_index.search(
-                student_embedding.reshape(1, -1).astype('float32'), k
-            )
-            # Drop any remaining padding entries before scoring.
-            valid = indices[0] >= 0
-            # Convert similarities to percentages
-            similarity_scores = (similarities[0][valid] * 100).clip(0, 100)
-            return similarity_scores, indices[0][valid]
-        except Exception as e:
-            print(f"❌ Error in similarity search: {e}")
-            return [], []
-    def calculate_advanced_behavioral_metrics(self):
-        """Score every course against the survey answers on five compatibility axes.
-        Returns:
-            Dict with one list per axis (one entry per course row), or {} when
-            the profile or the course data is missing.
-        """
-        if self.course_data is None or not self.student_profile:
-            return {}
-        answers = self.student_profile
-        favorite_course = answers.get('Q2', '')
-        project_topic = answers.get('Q3', '')
-        career_goals = answers.get('Q5', '')
-        strengths = answers.get('Q6', '')
-        weaknesses = answers.get('Q7', '')
-        research_interests = answers.get('Q8', '')
-        course_preference = answers.get('Q9', '')
-        # Stress tolerance depends only on the student, so derive it once.
-        stress_tolerance = self._assess_stress_tolerance(answers.get('Q10', ''))
-        stress_scores, type_scores, desc_scores, skill_scores, field_scores = [], [], [], [], []
-        for _, course in self.course_data.iterrows():
-            stress_scores.append(self._calculate_stress_compatibility(stress_tolerance, course))
-            type_scores.append(self._calculate_type_compatibility(course_preference, course))
-            desc_scores.append(
-                self._calculate_description_compatibility(favorite_course, project_topic, career_goals, course)
-            )
-            skill_scores.append(self._calculate_skill_compatibility(strengths, weaknesses, course))
-            field_scores.append(self._calculate_field_compatibility(research_interests, career_goals, course))
-        return {
-            'stress_matching': stress_scores,
-            'type_matching': type_scores,
-            'description_matching': desc_scores,
-            'skill_matching': skill_scores,
-            'field_matching': field_scores,
-        }
-    def _assess_stress_tolerance(self, stress_response):
-        """Classify the student's stress tolerance from a free-text answer.
-        Args:
-            stress_response: Survey answer; None or non-string values are
-                treated as text instead of raising.
-        Returns:
-            'high' when at least two coping keywords occur, otherwise 'low' when
-            at least two distress keywords occur, otherwise 'medium'.
-        """
-        # A missing answer must not crash scoring; treat it as an empty reply.
-        response_lower = str(stress_response or '').lower()
-        high_indicators = ['calm', 'organized', 'handle', 'manage', 'control', 'systematic']
-        low_indicators = ['overwhelmed', 'panic', 'stressed', 'anxious', 'difficult', 'struggle']
-        high_score = sum(1 for indicator in high_indicators if indicator in response_lower)
-        low_score = sum(1 for indicator in low_indicators if indicator in response_lower)
-        # 'high' is checked first, so it wins when both thresholds are met.
-        if high_score >= 2:
-            return 'high'
-        if low_score >= 2:
-            return 'low'
-        return 'medium'
-    def _calculate_stress_compatibility(self, stress_tolerance, course):
-        """Look up how well a course's stress level suits the student's tolerance."""
-        # tolerance -> {course stress level -> score}; anything unknown scores 50.
-        scores_by_tolerance = {
-            'high': {3: 95, 2: 85, 1: 70},
-            'medium': {3: 60, 2: 90, 1: 85},
-            'low': {3: 25, 2: 70, 1: 95},
-        }
-        course_stress = course.get('stress_numeric', 2)
-        return scores_by_tolerance.get(stress_tolerance, {}).get(course_stress, 50)
-    def _calculate_type_compatibility(self, course_preference, course):
-        """Score how well the course's Type matches the stated learning preference."""
-        course_type = str(course.get('Type', '')).lower()
-        preference_lower = course_preference.lower()
-        similarity = self._calculate_text_similarity(preference_lower, course_type)
-        # (keywords looked for in the preference, label required in the course type)
-        keyword_groups = (
-            (('hands-on', 'practical', 'applied', 'project'), 'practical'),
-            (('theory', 'theoretical', 'concept', 'academic'), 'theoretical'),
-        )
-        # Each group adds 20 when both sides mention it.
-        bonus = sum(
-            20
-            for keywords, label in keyword_groups
-            if label in course_type and any(keyword in preference_lower for keyword in keywords)
-        )
-        return min(100, similarity + bonus)
-    def _calculate_description_compatibility(self, favorite_course, project_topic, career_goals, course):
-        """Blend three text similarities into one interest-alignment score (max 100)."""
-        description = str(course.get('Description', ''))
-        field = str(course.get('Field Interest', ''))
-        fav_similarity = self._calculate_text_similarity(favorite_course, description)
-        project_similarity = self._calculate_text_similarity(project_topic, description)
-        career_similarity = self._calculate_text_similarity(career_goals, field)
-        # Weights: favourite course 0.3, project topic 0.4, career goals 0.3.
-        blended = fav_similarity * 0.3 + project_similarity * 0.4 + career_similarity * 0.3
-        return min(100, blended)
-    def _calculate_skill_compatibility(self, strengths, weaknesses, course):
-        """Score strengths against required skills, minus penalties for clashing weaknesses."""
-        skills_required = str(course.get('Skill Required', ''))
-        strength_match = self._calculate_text_similarity(strengths, skills_required)
-        weakness_lower = weaknesses.lower()
-        skills_lower = skills_required.lower()
-        # (keyword in the weakness text, skill keywords it conflicts with)
-        conflict_rules = (
-            ('math', ('math', 'statistical')),
-            ('programming', ('programming', 'coding')),
-        )
-        # Each conflicting pair costs 10 points.
-        weakness_penalty = sum(
-            10
-            for weak_keyword, skill_keywords in conflict_rules
-            if weak_keyword in weakness_lower and any(skill in skills_lower for skill in skill_keywords)
-        )
-        return max(0, min(100, strength_match - weakness_penalty))
-    def _calculate_field_compatibility(self, research_interests, career_goals, course):
-        """Average the research/field and career/career-path similarities (max 100)."""
-        research_similarity = self._calculate_text_similarity(
-            research_interests, str(course.get('Field Interest', ''))
-        )
-        career_similarity = self._calculate_text_similarity(
-            career_goals, str(course.get('Career Paths', ''))
-        )
-        mean_similarity = (research_similarity + career_similarity) / 2
-        return min(100, mean_similarity)
-    def _calculate_text_similarity(self, text1, text2):
-        """Calculate similarity between two texts on a 0-100 scale.
-        Tries, in order: the embedding model, TF-IDF cosine similarity, and a
-        word-level Jaccard overlap. A failing strategy falls through to the next.
-        Args:
-            text1, text2: Texts to compare; converted with str() and lower-cased.
-        Returns:
-            30 when either text is empty, 100 when the normalized texts are
-            equal, otherwise a float between 0 and 100.
-        """
-        if not text1 or not text2:
-            return 30
-        text1 = str(text1).lower().strip()
-        text2 = str(text2).lower().strip()
-        if text1 == text2:
-            return 100
-        try:
-            # Use embedding model if available
-            if self.embedding_model:
-                embeddings = self.embedding_model.encode([text1, text2])
-                similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
-                return float(max(0, min(100, similarity * 100)))
-        except Exception:
-            # Deliberate fall-through. Only Exception is caught so that
-            # KeyboardInterrupt/SystemExit still propagate.
-            pass
-        # Fallback to TF-IDF similarity
-        try:
-            vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
-            tfidf_matrix = vectorizer.fit_transform([text1, text2])
-            similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
-            return float(max(0, min(100, similarity * 100)))
-        except Exception:
-            # E.g. the texts contain only stop words; use the keyword fallback.
-            pass
-        # Simple keyword matching as final fallback
-        words1 = set(text1.split())
-        words2 = set(text2.split())
-        union = words1 | words2
-        if not union:
-            return 0
-        jaccard_similarity = len(words1 & words2) / len(union)
-        return float(max(0, min(100, jaccard_similarity * 100)))
-    def _generate_fallback_recommendations(self, top_course_indices, similarity_scores, behavioral_metrics):
-        """Generate recommendations without external API.
-        Args:
-            top_course_indices: Candidate row positions; only the first five are used.
-            similarity_scores: Scores aligned by position with top_course_indices.
-            behavioral_metrics: Mapping of metric name to per-course score list.
-        Returns:
-            List of dicts (course, confidence, index, avg_bhvr_score,
-            base_confidence), sorted by avg_bhvr_score, highest first.
-        """
-        recommendations = []
-        course_count = len(self.course_data)
-        for i, course_idx in enumerate(top_course_indices[:5]):
-            # Reject negative positions too: a -1 "no result" marker would
-            # otherwise wrap around to the last course through iloc.
-            if not 0 <= course_idx < course_count:
-                continue
-            course = self.course_data.iloc[course_idx]
-            base_confidence = similarity_scores[i] if i < len(similarity_scores) else 0
-            # Calculate behavioral scores
-            behavior_scores = [
-                values[course_idx]
-                for values in behavioral_metrics.values()
-                if course_idx < len(values)
-            ]
-            avg_behavior_score = np.mean(behavior_scores) if behavior_scores else 50
-            # Enhanced confidence combining similarity and behavioral scores
-            enhanced_confidence = (base_confidence * 0.4 + avg_behavior_score * 0.6)
-            recommendations.append({
-                'course': course,
-                'confidence': enhanced_confidence,
-                'index': course_idx,
-                'avg_bhvr_score': avg_behavior_score,
-                'base_confidence': base_confidence
-            })
-        # Sort by average behavioral score
-        recommendations.sort(key=lambda rec: rec['avg_bhvr_score'], reverse=True)
-        return recommendations
-    def generate_recommendations_with_mistral(self, top_course_indices, similarity_scores, behavioral_metrics):
-        """Ask the Mistral API to choose among the top three courses; fall back to local ranking."""
-        try:
-            profile = self.student_profile
-            # Context fields handed to the prompt builder, keyed by survey question.
-            context_fields = (
-                ('study_hours', 'Q1'),
-                ('favorite_course', 'Q2'),
-                ('project_interests', 'Q3'),
-                ('career_goals', 'Q5'),
-                ('strengths', 'Q6'),
-                ('course_preferences', 'Q9'),
-            )
-            student_context = {name: profile.get(question, '') for name, question in context_fields}
-            top_courses = []
-            for rank, idx in enumerate(top_course_indices[:3]):
-                if idx >= len(self.course_data):
-                    continue
-                course = self.course_data.iloc[idx]
-                confidence = similarity_scores[rank] if rank < len(similarity_scores) else 0
-                top_courses.append({
-                    'name': course['Course Name'],
-                    'description': course['Description'],
-                    'type': course['Type'],
-                    'confidence': confidence
-                })
-            prompt = self._create_mistral_prompt(student_context, top_courses)
-            response = self._call_mistral_api(prompt)
-            if response:
-                return self._parse_mistral_response(response, top_course_indices, similarity_scores, behavioral_metrics)
-        except Exception as e:
-            print(f"⚠️ Mistral API error: {e}")
-        # No usable API answer: rank locally instead.
-        return self._generate_fallback_recommendations(top_course_indices, similarity_scores, behavioral_metrics)
-    def _create_mistral_prompt(self, student_context, top_courses):
-        """Build the instruction prompt: student profile, numbered courses, JSON reply format."""
-        header_lines = [
-            "<s>[INST] You are an expert educational counselor. Analyze this student profile and recommend the best course from the options.",
-            "Student Profile:",
-            f"- Study Commitment: {student_context['study_hours']}",
-            f"- Previous Experience: {student_context['favorite_course']}",
-            f"- Project Interests: {student_context['project_interests']}",
-            f"- Career Goals: {student_context['career_goals']}",
-            f"- Strengths: {student_context['strengths']}",
-            f"- Learning Preferences: {student_context['course_preferences']}",
-            "Available Courses:",
-            "",  # makes the header end with a newline
-        ]
-        pieces = ["\n".join(header_lines)]
-        for number, course in enumerate(top_courses, 1):
-            pieces.append(
-                f"\n{number}. {course['name']}\n   Description: {course['description']}\n   Type: {course['type']}\n   AI Confidence: {course['confidence']:.1f}%\n"
-            )
-        footer_lines = [
-            "",  # makes the footer start with a newline
-            "Provide your recommendation in this exact JSON format:",
-            "{",
-            '    "recommended_course": "[exact course name]",',
-            '    "confidence": [number between 0-100],',
-            '    "reasoning": "[brief explanation]"',
-            "}[/INST]",
-        ]
-        pieces.append("\n".join(footer_lines))
-        return "".join(pieces)
-    def _call_mistral_api(self, prompt):
-        """Call Mistral API for course recommendation.
-        Args:
-            prompt: Text sent as the single user message.
-        Returns:
-            The content of the first returned choice, or None when no key is
-            configured, the request fails, the status is not 200, or the
-            response body does not have the expected structure.
-        """
-        if not self.mistral_api_key:
-            # Without a key the request cannot authenticate; skip the round trip.
-            print("⚠️ Mistral API key is not configured; skipping API call")
-            return None
-        try:
-            headers = {
-                'Authorization': f'Bearer {self.mistral_api_key}',
-                'Content-Type': 'application/json',
-            }
-            data = {
-                'model': 'mistral-large-latest',
-                'messages': [{'role': 'user', 'content': prompt}],
-                'max_tokens': 500,
-                'temperature': 0.7,
-            }
-            response = requests.post(
-                'https://api.mistral.ai/v1/chat/completions',
-                headers=headers,
-                json=data,
-                timeout=30
-            )
-            if response.status_code != 200:
-                # Report the failure so auth and quota problems are visible.
-                print(f"⚠️ Mistral API returned HTTP {response.status_code}")
-                return None
-            result = response.json()
-            return result['choices'][0]['message']['content']
-        except Exception as e:
-            # Boundary handler: network and parsing errors make the caller fall back.
-            print(f"❌ Mistral API call failed: {e}")
-        return None
-    def _parse_mistral_response(self, response, top_course_indices, similarity_scores, behavioral_metrics):
-        """Parse Mistral API response.
-        Extracts the outermost {...} span of the reply, reads the recommended
-        course name, and matches it (case-insensitive substring) against the
-        first three candidates.
-        Args:
-            response: Raw text returned by the API.
-            top_course_indices: Candidate row positions.
-            similarity_scores: Scores aligned by position with top_course_indices.
-            behavioral_metrics: Mapping of metric name to per-course score list.
-        Returns:
-            A one-element list describing the matched course (including
-            'ai_reasoning'), or the local fallback ranking when the reply
-            cannot be parsed or names no candidate.
-        """
-        try:
-            # Extract JSON from response
-            json_start = response.find('{')
-            json_end = response.rfind('}') + 1
-            if json_start != -1 and json_end > json_start:
-                parsed = json.loads(response[json_start:json_end])
-                recommended_course = str(parsed.get('recommended_course') or '').strip().lower()
-                reasoning = parsed.get('reasoning', 'AI-generated recommendation')
-                # The model may return the confidence as a string or out of range.
-                try:
-                    ai_confidence = max(0.0, min(100.0, float(parsed.get('confidence', 75))))
-                except (TypeError, ValueError):
-                    ai_confidence = 75
-                # An empty name is a substring of every course name and would
-                # "match" the first candidate, so only search for a real name.
-                candidates = top_course_indices[:3] if recommended_course else []
-                for i, idx in enumerate(candidates):
-                    # Negative positions would wrap around through iloc.
-                    if not 0 <= idx < len(self.course_data):
-                        continue
-                    course = self.course_data.iloc[idx]
-                    if recommended_course not in str(course['Course Name']).lower():
-                        continue
-                    # Calculate behavioral score
-                    behavior_scores = [
-                        behavioral_metrics[name][idx]
-                        for name in (
-                            'stress_matching',
-                            'type_matching',
-                            'description_matching',
-                            'skill_matching',
-                            'field_matching',
-                        )
-                    ]
-                    avg_behavior_score = np.mean(behavior_scores)
-                    return [{
-                        'course': course,
-                        'confidence': ai_confidence,
-                        'index': idx,
-                        'avg_bhvr_score': avg_behavior_score,
-                        'base_confidence': similarity_scores[i] if i < len(similarity_scores) else 0,
-                        'ai_reasoning': reasoning
-                    }]
-        except Exception as e:
-            print(f"❌ Error parsing Mistral response: {e}")
-        # Fallback
-        return self._generate_fallback_recommendations(top_course_indices, similarity_scores, behavioral_metrics)