LvMAC committed on
Commit
5a630d5
·
verified ·
1 Parent(s): a166a35

Delete main_model.py

Browse files
Files changed (1) hide show
  1. main_model.py +0 -540
main_model.py DELETED
@@ -1,540 +0,0 @@
1
- """
2
- Simplified Course Recommendation System for Hugging Face Spaces
3
- Optimized for deployment with reduced dependencies and faster loading
4
- """
5
-
6
- import pandas as pd
7
- import numpy as np
8
- import re
9
- import json
10
- import warnings
11
- from sentence_transformers import SentenceTransformer
12
- import faiss
13
- import requests
14
- from datetime import datetime
15
- from sklearn.feature_extraction.text import TfidfVectorizer
16
- from sklearn.metrics.pairwise import cosine_similarity
17
- import nltk
18
- from nltk.corpus import stopwords
19
- from nltk.tokenize import word_tokenize
20
- from nltk.stem import WordNetLemmatizer
21
-
22
- warnings.filterwarnings('ignore')
23
-
24
# Download required NLTK data.
# This is best-effort: each resource is attempted independently so one failed
# download does not skip the others, and a failure is reported instead of
# being silently swallowed. `except Exception` (rather than a bare `except`)
# lets KeyboardInterrupt / SystemExit propagate.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    try:
        nltk.download(_nltk_resource, quiet=True)
    except Exception as exc:
        print(f"⚠️ Could not download NLTK resource '{_nltk_resource}': {exc}")
32
-
33
- class ProductionCourseRecommendationSystem:
34
def __init__(self, device='cpu'):
    """Initialize the simplified system for HF Spaces.

    Args:
        device: Device string passed to ``SentenceTransformer``
            (default ``'cpu'``).

    The embedding model is loaded best-effort: if loading raises,
    ``self.embedding_model`` is set to ``None`` and the error is printed.

    The Mistral API key is read from the ``MISTRAL_API_KEY`` environment
    variable rather than being embedded in the source file; it is an empty
    string when the variable is unset.
    """
    # Local import so this change does not depend on the module's import block.
    import os

    self.device = device
    print(f"🚀 Initializing Course Recommendation System on {device}")

    # Initialize embedding model (lighter version for HF Spaces)
    try:
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=self.device)
        print("✅ Embedding model loaded successfully")
    except Exception as e:
        print(f"⚠️ Error loading embedding model: {e}")
        self.embedding_model = None

    # Initialize NLP components
    self.lemmatizer = WordNetLemmatizer()
    self.stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on',
        'at', 'to', 'for', 'of', 'with', 'by',
    }

    # Data components (populated later by other methods / callers)
    self.course_data = None
    self.course_embeddings = None
    self.faiss_index = None
    self.student_profile = {}

    # Mistral API configuration.
    # SECURITY: a credential must never be committed to source control; the
    # key that used to be hard-coded here should be considered leaked and
    # be revoked.
    self.mistral_api_key = os.environ.get('MISTRAL_API_KEY', '')
    if not self.mistral_api_key:
        print("⚠️ MISTRAL_API_KEY is not set; Mistral API calls will not be authorized")

    print("✅ System initialized successfully!")
61
-
62
- def _clean_text(self, text):
63
- """Clean and normalize text data"""
64
- if pd.isna(text):
65
- return ""
66
-
67
- text = str(text)
68
- # Remove newlines and normalize spaces
69
- text = re.sub(r'\n+', ' ', text)
70
- text = re.sub(r'\s+', ' ', text)
71
- text = text.strip().lower()
72
-
73
- return text
74
-
75
- def _tokenize_text(self, text):
76
- """Tokenize text and remove stopwords"""
77
- if not text:
78
- return []
79
-
80
- try:
81
- tokens = word_tokenize(text.lower())
82
- tokens = [token for token in tokens if token.isalpha() and len(token) > 2]
83
- tokens = [token for token in tokens if token not in self.stop_words]
84
- return list(dict.fromkeys(tokens)) # Remove duplicates
85
- except:
86
- # Fallback tokenization
87
- tokens = re.sub(r'[^\w\s]', ' ', text.lower()).split()
88
- return [token for token in tokens if len(token) > 2 and token not in self.stop_words]
89
-
90
- def _create_enhanced_embeddings_and_faiss_index(self):
91
- """Create optimized embeddings for the course dataset"""
92
- if self.embedding_model is None or self.course_data is None:
93
- print("⚠️ Cannot create embeddings: missing model or data")
94
- return
95
-
96
- print("🎯 Creating course embeddings...")
97
-
98
- combined_texts = []
99
- for _, row in self.course_data.iterrows():
100
- # Create comprehensive course description
101
- text = f"Course: {row['Course Name']}. Description: {row['Description']}. Type: {row['Type']}. Skills: {row['Skill Required']}. Field: {row['Field Interest']}. Career: {row.get('Career Paths', '')}. Industry: {row.get('Industry Sectors', '')}."
102
- combined_texts.append(text)
103
-
104
- # Generate embeddings
105
- try:
106
- self.course_embeddings = self.embedding_model.encode(
107
- combined_texts,
108
- batch_size=16,
109
- show_progress_bar=True,
110
- convert_to_numpy=True,
111
- normalize_embeddings=True
112
- )
113
-
114
- # Build FAISS index
115
- dimension = self.course_embeddings.shape[1]
116
- self.faiss_index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity
117
- self.faiss_index.add(self.course_embeddings.astype('float32'))
118
-
119
- print(f"✅ FAISS index created with {self.faiss_index.ntotal} courses")
120
- print(f"📏 Embedding dimension: {dimension}")
121
-
122
- except Exception as e:
123
- print(f"❌ Error creating embeddings: {e}")
124
-
125
- def create_enhanced_student_profile(self):
126
- """Create student profile embedding from survey responses"""
127
- if not self.student_profile or self.embedding_model is None:
128
- print("⚠️ Cannot create profile: missing data or model")
129
- return None, []
130
-
131
- # Extract key information
132
- study_hours = self.student_profile.get('Q1', '')
133
- favorite_course = self.student_profile.get('Q2', '')
134
- project_topic = self.student_profile.get('Q3', '')
135
- problem_solving = self.student_profile.get('Q4', '')
136
- career_goals = self.student_profile.get('Q5', '')
137
- strengths = self.student_profile.get('Q6', '')
138
- weaknesses = self.student_profile.get('Q7', '')
139
- research_interests = self.student_profile.get('Q8', '')
140
- course_preference = self.student_profile.get('Q9', '')
141
- stress_response = self.student_profile.get('Q10', '')
142
-
143
- # Create comprehensive profile text
144
- profile_text = f"Study commitment: {study_hours}. Previous experience: {favorite_course}. Project interests: {project_topic}. Problem solving: {problem_solving}. Career goals: {career_goals}. Strengths: {strengths}. Areas for improvement: {weaknesses}. Research interests: {research_interests}. Learning preferences: {course_preference}. Stress management: {stress_response}."
145
-
146
- # Generate embedding
147
- try:
148
- profile_embedding = self.embedding_model.encode([profile_text], normalize_embeddings=True)
149
- return profile_embedding[0], [profile_text]
150
- except Exception as e:
151
- print(f"❌ Error creating profile embedding: {e}")
152
- return None, []
153
-
154
- def advanced_similarity_search(self, student_embedding, k=None):
155
- """Perform similarity search using FAISS"""
156
- if self.faiss_index is None or student_embedding is None:
157
- print("⚠️ Cannot perform search: missing index or embedding")
158
- return [], []
159
-
160
- if k is None:
161
- k = min(len(self.course_data), 10)
162
-
163
- try:
164
- # Perform FAISS search
165
- similarities, indices = self.faiss_index.search(
166
- student_embedding.reshape(1, -1).astype('float32'), k
167
- )
168
-
169
- # Convert similarities to percentages
170
- similarity_scores = (similarities[0] * 100).clip(0, 100)
171
-
172
- return similarity_scores, indices[0]
173
-
174
- except Exception as e:
175
- print(f"❌ Error in similarity search: {e}")
176
- return [], []
177
-
178
- def calculate_advanced_behavioral_metrics(self):
179
- """Calculate behavioral compatibility metrics"""
180
- if not self.student_profile or self.course_data is None:
181
- return {}
182
-
183
- metrics = {
184
- 'stress_matching': [],
185
- 'type_matching': [],
186
- 'description_matching': [],
187
- 'skill_matching': [],
188
- 'field_matching': []
189
- }
190
-
191
- # Extract student information
192
- study_hours = self.student_profile.get('Q1', '')
193
- favorite_course = self.student_profile.get('Q2', '')
194
- project_topic = self.student_profile.get('Q3', '')
195
- career_goals = self.student_profile.get('Q5', '')
196
- strengths = self.student_profile.get('Q6', '')
197
- weaknesses = self.student_profile.get('Q7', '')
198
- research_interests = self.student_profile.get('Q8', '')
199
- course_preference = self.student_profile.get('Q9', '')
200
- stress_response = self.student_profile.get('Q10', '')
201
-
202
- # Assess stress tolerance
203
- stress_tolerance = self._assess_stress_tolerance(stress_response)
204
-
205
- # Calculate metrics for each course
206
- for _, course in self.course_data.iterrows():
207
- # Stress compatibility
208
- stress_score = self._calculate_stress_compatibility(stress_tolerance, course)
209
- metrics['stress_matching'].append(stress_score)
210
-
211
- # Learning type compatibility
212
- type_score = self._calculate_type_compatibility(course_preference, course)
213
- metrics['type_matching'].append(type_score)
214
-
215
- # Interest alignment
216
- desc_score = self._calculate_description_compatibility(
217
- favorite_course, project_topic, career_goals, course
218
- )
219
- metrics['description_matching'].append(desc_score)
220
-
221
- # Skill compatibility
222
- skill_score = self._calculate_skill_compatibility(strengths, weaknesses, course)
223
- metrics['skill_matching'].append(skill_score)
224
-
225
- # Field compatibility
226
- field_score = self._calculate_field_compatibility(research_interests, career_goals, course)
227
- metrics['field_matching'].append(field_score)
228
-
229
- return metrics
230
-
231
- def _assess_stress_tolerance(self, stress_response):
232
- """Assess student's stress tolerance level"""
233
- response_lower = stress_response.lower()
234
-
235
- high_indicators = ['calm', 'organized', 'handle', 'manage', 'control', 'systematic']
236
- low_indicators = ['overwhelmed', 'panic', 'stressed', 'anxious', 'difficult', 'struggle']
237
-
238
- high_score = sum(1 for indicator in high_indicators if indicator in response_lower)
239
- low_score = sum(1 for indicator in low_indicators if indicator in response_lower)
240
-
241
- if high_score >= 2:
242
- return 'high'
243
- elif low_score >= 2:
244
- return 'low'
245
- else:
246
- return 'medium'
247
-
248
- def _calculate_stress_compatibility(self, stress_tolerance, course):
249
- """Calculate stress level compatibility"""
250
- course_stress = course.get('stress_numeric', 2)
251
-
252
- compatibility_matrix = {
253
- ('high', 3): 95, ('high', 2): 85, ('high', 1): 70,
254
- ('medium', 3): 60, ('medium', 2): 90, ('medium', 1): 85,
255
- ('low', 3): 25, ('low', 2): 70, ('low', 1): 95
256
- }
257
-
258
- return compatibility_matrix.get((stress_tolerance, course_stress), 50)
259
-
260
- def _calculate_type_compatibility(self, course_preference, course):
261
- """Calculate learning type compatibility"""
262
- course_type = str(course.get('Type', '')).lower()
263
- preference_lower = course_preference.lower()
264
-
265
- # Calculate semantic similarity
266
- similarity = self._calculate_text_similarity(preference_lower, course_type)
267
-
268
- # Add keyword bonuses
269
- practical_keywords = ['hands-on', 'practical', 'applied', 'project']
270
- theoretical_keywords = ['theory', 'theoretical', 'concept', 'academic']
271
-
272
- bonus = 0
273
- if any(keyword in preference_lower for keyword in practical_keywords) and 'practical' in course_type:
274
- bonus += 20
275
- if any(keyword in preference_lower for keyword in theoretical_keywords) and 'theoretical' in course_type:
276
- bonus += 20
277
-
278
- return min(100, similarity + bonus)
279
-
280
- def _calculate_description_compatibility(self, favorite_course, project_topic, career_goals, course):
281
- """Calculate compatibility based on course description and interests"""
282
- course_desc = str(course.get('Description', ''))
283
- course_field = str(course.get('Field Interest', ''))
284
-
285
- # Calculate similarities
286
- fav_similarity = self._calculate_text_similarity(favorite_course, course_desc)
287
- project_similarity = self._calculate_text_similarity(project_topic, course_desc)
288
- career_similarity = self._calculate_text_similarity(career_goals, course_field)
289
-
290
- # Weighted average
291
- weighted_score = (fav_similarity * 0.3 + project_similarity * 0.4 + career_similarity * 0.3)
292
- return min(100, weighted_score)
293
-
294
- def _calculate_skill_compatibility(self, strengths, weaknesses, course):
295
- """Calculate skill compatibility"""
296
- skills_required = str(course.get('Skill Required', ''))
297
-
298
- # Calculate strength match
299
- strength_match = self._calculate_text_similarity(strengths, skills_required)
300
-
301
- # Check for weakness conflicts
302
- weakness_penalty = 0
303
- weakness_lower = weaknesses.lower()
304
- skills_lower = skills_required.lower()
305
-
306
- # Simple conflict detection
307
- if 'math' in weakness_lower and ('math' in skills_lower or 'statistical' in skills_lower):
308
- weakness_penalty += 10
309
- if 'programming' in weakness_lower and ('programming' in skills_lower or 'coding' in skills_lower):
310
- weakness_penalty += 10
311
-
312
- return max(0, min(100, strength_match - weakness_penalty))
313
-
314
- def _calculate_field_compatibility(self, research_interests, career_goals, course):
315
- """Calculate field compatibility"""
316
- field_interest = str(course.get('Field Interest', ''))
317
- career_paths = str(course.get('Career Paths', ''))
318
-
319
- research_similarity = self._calculate_text_similarity(research_interests, field_interest)
320
- career_similarity = self._calculate_text_similarity(career_goals, career_paths)
321
-
322
- return min(100, (research_similarity + career_similarity) / 2)
323
-
324
- def _calculate_text_similarity(self, text1, text2):
325
- """Calculate semantic similarity between two texts"""
326
- if not text1 or not text2:
327
- return 30
328
-
329
- text1 = str(text1).lower().strip()
330
- text2 = str(text2).lower().strip()
331
-
332
- if text1 == text2:
333
- return 100
334
-
335
- try:
336
- # Use embedding model if available
337
- if self.embedding_model:
338
- embeddings = self.embedding_model.encode([text1, text2])
339
- similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
340
- return max(0, min(100, similarity * 100))
341
- except:
342
- pass
343
-
344
- # Fallback to TF-IDF similarity
345
- try:
346
- vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
347
- tfidf_matrix = vectorizer.fit_transform([text1, text2])
348
- similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
349
- return max(0, min(100, similarity * 100))
350
- except:
351
- pass
352
-
353
- # Simple keyword matching as final fallback
354
- words1 = set(text1.split())
355
- words2 = set(text2.split())
356
- intersection = words1 & words2
357
- union = words1 | words2
358
-
359
- if len(union) == 0:
360
- return 0
361
-
362
- jaccard_similarity = len(intersection) / len(union)
363
- return max(0, min(100, jaccard_similarity * 100))
364
-
365
- def _generate_fallback_recommendations(self, top_course_indices, similarity_scores, behavioral_metrics):
366
- """Generate recommendations without external API"""
367
- recommendations = []
368
-
369
- for i, course_idx in enumerate(top_course_indices[:5]):
370
- if course_idx >= len(self.course_data):
371
- continue
372
-
373
- course = self.course_data.iloc[course_idx]
374
- base_confidence = similarity_scores[i] if i < len(similarity_scores) else 0
375
-
376
- # Calculate behavioral scores
377
- behavior_scores = []
378
- for metric_name, values in behavioral_metrics.items():
379
- if course_idx < len(values):
380
- behavior_scores.append(values[course_idx])
381
-
382
- avg_behavior_score = np.mean(behavior_scores) if behavior_scores else 50
383
-
384
- # Enhanced confidence combining similarity and behavioral scores
385
- enhanced_confidence = (base_confidence * 0.4 + avg_behavior_score * 0.6)
386
-
387
- recommendations.append({
388
- 'course': course,
389
- 'confidence': enhanced_confidence,
390
- 'index': course_idx,
391
- 'avg_bhvr_score': avg_behavior_score,
392
- 'base_confidence': base_confidence
393
- })
394
-
395
- # Sort by average behavioral score
396
- recommendations.sort(key=lambda x: x['avg_bhvr_score'], reverse=True)
397
- return recommendations
398
-
399
- def generate_recommendations_with_mistral(self, top_course_indices, similarity_scores, behavioral_metrics):
400
- """Generate recommendations using Mistral API"""
401
- try:
402
- # Prepare context
403
- student_context = {
404
- 'study_hours': self.student_profile.get('Q1', ''),
405
- 'favorite_course': self.student_profile.get('Q2', ''),
406
- 'project_interests': self.student_profile.get('Q3', ''),
407
- 'career_goals': self.student_profile.get('Q5', ''),
408
- 'strengths': self.student_profile.get('Q6', ''),
409
- 'course_preferences': self.student_profile.get('Q9', '')
410
- }
411
-
412
- # Get top courses
413
- top_courses = []
414
- for i, idx in enumerate(top_course_indices[:3]):
415
- if idx < len(self.course_data):
416
- course = self.course_data.iloc[idx]
417
- top_courses.append({
418
- 'name': course['Course Name'],
419
- 'description': course['Description'],
420
- 'type': course['Type'],
421
- 'confidence': similarity_scores[i] if i < len(similarity_scores) else 0
422
- })
423
-
424
- # Create prompt
425
- prompt = self._create_mistral_prompt(student_context, top_courses)
426
-
427
- # Call Mistral API
428
- response = self._call_mistral_api(prompt)
429
-
430
- if response:
431
- return self._parse_mistral_response(response, top_course_indices, similarity_scores, behavioral_metrics)
432
-
433
- except Exception as e:
434
- print(f"⚠️ Mistral API error: {e}")
435
-
436
- # Fallback to non-API recommendations
437
- return self._generate_fallback_recommendations(top_course_indices, similarity_scores, behavioral_metrics)
438
-
439
- def _create_mistral_prompt(self, student_context, top_courses):
440
- """Create prompt for Mistral API"""
441
- prompt = f"""<s>[INST] You are an expert educational counselor. Analyze this student profile and recommend the best course from the options.
442
-
443
- Student Profile:
444
- - Study Commitment: {student_context['study_hours']}
445
- - Previous Experience: {student_context['favorite_course']}
446
- - Project Interests: {student_context['project_interests']}
447
- - Career Goals: {student_context['career_goals']}
448
- - Strengths: {student_context['strengths']}
449
- - Learning Preferences: {student_context['course_preferences']}
450
-
451
- Available Courses:
452
- """
453
-
454
- for i, course in enumerate(top_courses, 1):
455
- prompt += f"\n{i}. {course['name']}\n Description: {course['description']}\n Type: {course['type']}\n AI Confidence: {course['confidence']:.1f}%\n"
456
-
457
- prompt += """\nProvide your recommendation in this exact JSON format:
458
- {
459
- "recommended_course": "[exact course name]",
460
- "confidence": [number between 0-100],
461
- "reasoning": "[brief explanation]"
462
- }[/INST]"""
463
-
464
- return prompt
465
-
466
- def _call_mistral_api(self, prompt):
467
- """Call Mistral API for course recommendation"""
468
- try:
469
- headers = {
470
- 'Authorization': f'Bearer {self.mistral_api_key}',
471
- 'Content-Type': 'application/json',
472
- }
473
-
474
- data = {
475
- 'model': 'mistral-large-latest',
476
- 'messages': [{'role': 'user', 'content': prompt}],
477
- 'max_tokens': 500,
478
- 'temperature': 0.7,
479
- }
480
-
481
- response = requests.post(
482
- 'https://api.mistral.ai/v1/chat/completions',
483
- headers=headers,
484
- json=data,
485
- timeout=30
486
- )
487
-
488
- if response.status_code == 200:
489
- result = response.json()
490
- return result['choices'][0]['message']['content']
491
-
492
- except Exception as e:
493
- print(f"❌ Mistral API call failed: {e}")
494
-
495
- return None
496
-
497
- def _parse_mistral_response(self, response, top_course_indices, similarity_scores, behavioral_metrics):
498
- """Parse Mistral API response"""
499
- try:
500
- # Extract JSON from response
501
- json_start = response.find('{')
502
- json_end = response.rfind('}') + 1
503
-
504
- if json_start != -1 and json_end > json_start:
505
- json_text = response[json_start:json_end]
506
- parsed = json.loads(json_text)
507
-
508
- recommended_course = parsed.get('recommended_course', '')
509
- ai_confidence = parsed.get('confidence', 75)
510
- reasoning = parsed.get('reasoning', 'AI-generated recommendation')
511
-
512
- # Find the course in our data
513
- for i, idx in enumerate(top_course_indices[:3]):
514
- if idx < len(self.course_data):
515
- course = self.course_data.iloc[idx]
516
- if recommended_course.lower() in course['Course Name'].lower():
517
- # Calculate behavioral score
518
- behavior_scores = [
519
- behavioral_metrics['stress_matching'][idx],
520
- behavioral_metrics['type_matching'][idx],
521
- behavioral_metrics['description_matching'][idx],
522
- behavioral_metrics['skill_matching'][idx],
523
- behavioral_metrics['field_matching'][idx]
524
- ]
525
- avg_behavior_score = np.mean(behavior_scores)
526
-
527
- return [{
528
- 'course': course,
529
- 'confidence': ai_confidence,
530
- 'index': idx,
531
- 'avg_bhvr_score': avg_behavior_score,
532
- 'base_confidence': similarity_scores[i] if i < len(similarity_scores) else 0,
533
- 'ai_reasoning': reasoning
534
- }]
535
-
536
- except Exception as e:
537
- print(f"❌ Error parsing Mistral response: {e}")
538
-
539
- # Fallback
540
- return self._generate_fallback_recommendations(top_course_indices, similarity_scores, behavioral_metrics)