"""Universal Module - Academic & Experience Scoring""" import numpy as np import re from typing import Dict, Tuple class UniversalModule: """Scores based on academic performance and experience""" def __init__(self): self.feature_weights = { 'cgpa_norm': 0.30, 'sgpa_trend': 0.15, 'sgpa_consistency': 0.10, 'marks_consistency': 0.10, 'academic_improvement': 0.10, 'internship_exposure': 0.10, 'ec_quality': 0.08, 'cert_quality': 0.07 } def score(self, student_data: Dict) -> Tuple[float, float, Dict]: """ Calculate universal score Returns: (score, confidence, features_dict) """ features = {} # CGPA normalization (0-10 scale) cgpa = student_data.get('cgpa', 0) features['cgpa_norm'] = min(cgpa / 10.0, 1.0) # SGPA trend (improvement across semesters) - filter out null values sgpa_values = [] for sem_num in range(1, 9): sem_val = student_data.get(f'sgpa_sem{sem_num}') if sem_val is not None and sem_val > 0: # Ignore null/zero values sgpa_values.append(sem_val) if len(sgpa_values) >= 2: # Calculate trend from first to last available semester trend = (sgpa_values[-1] - sgpa_values[0]) / 10.0 # Normalize features['sgpa_trend'] = max(0, min(trend + 0.5, 1.0)) # Center at 0.5 else: features['sgpa_trend'] = 0.5 # Neutral if insufficient data # SGPA consistency (lower std = more consistent = better) if len(sgpa_values) >= 3: std_dev = np.std(sgpa_values) features['sgpa_consistency'] = max(0, 1 - (std_dev / 3.0)) # Inverse relationship else: features['sgpa_consistency'] = 0.5 # Marks consistency across 10th, 12th, CGPA tenth = student_data.get('tenth_pct') twelfth = student_data.get('twelfth_pct') if tenth and twelfth and cgpa: cgpa_pct = (cgpa / 10.0) * 100 marks_std = np.std([tenth, twelfth, cgpa_pct]) features['marks_consistency'] = max(0, 1 - (marks_std / 30.0)) else: features['marks_consistency'] = 0.5 # Academic improvement flag if tenth and twelfth and cgpa: cgpa_pct = (cgpa / 10.0) * 100 if cgpa_pct > twelfth and twelfth > tenth: features['academic_improvement'] = 1.0 elif cgpa_pct > twelfth or twelfth > tenth: features['academic_improvement'] = 0.7 else: features['academic_improvement'] = 0.3 else: features['academic_improvement'] = 0.5 # Extract features from text responses (handle None values) internship_text = student_data.get('internship_text') or '' ec_text = student_data.get('extracurricular_text') or '' cert_text = student_data.get('certifications_text') or '' # Internship exposure - extract from text features['internship_exposure'] = self._assess_internship_quality(internship_text) # Extracurricular quality - extract from text features['ec_quality'] = self._assess_extracurricular_quality(ec_text) # Certification quality - extract from text features['cert_quality'] = self._assess_certification_quality(cert_text) # Calculate weighted score score = sum(features[k] * self.feature_weights[k] for k in features.keys()) # Calculate confidence based on data completeness total_fields = 8 filled_fields = sum([ 1 if cgpa > 0 else 0, 1 if len(sgpa_values) >= 2 else 0, 1 if len(sgpa_values) >= 3 else 0, 1 if tenth and twelfth else 0, 1 if tenth and twelfth and cgpa else 0, 1 if len(internship_text) > 20 else 0, 1 if len(ec_text) > 20 else 0, 1 if len(cert_text) > 20 else 0 ]) confidence = filled_fields / total_fields return score, confidence, features def explain(self, features: Dict) -> Dict: """Generate explanation for scores""" explanations = { 'top_positive_features': [], 'top_negative_features': [] } # Sort features by value sorted_features = sorted(features.items(), key=lambda x: x[1], reverse=True) # Top 3 positive for feat, val in sorted_features[:3]: if val > 0.6: explanations['top_positive_features'].append({ 'feature': feat, 'value': round(val, 2), 'description': self._get_feature_description(feat, val) }) # Top 3 negative for feat, val in sorted_features[-3:]: if val < 0.4: explanations['top_negative_features'].append({ 'feature': feat, 'value': round(val, 2), 'description': self._get_feature_description(feat, val) }) return explanations def _assess_internship_quality(self, text: str) -> float: """Extract internship quality from text""" if not text or len(text) < 20: return 0.0 score = 0.0 text_lower = text.lower() # Duration indicators duration_patterns = [ (r'\b(\d+)\s*months?\b', 1.0), (r'\b(\d+)\s*weeks?\b', 0.25), (r'summer\s+internship', 0.5), (r'year\s+long|full\s+year|annual', 1.0), ] max_duration_score = 0.0 for pattern, multiplier in duration_patterns: matches = re.findall(pattern, text_lower) if matches: if pattern.startswith(r'\b(\d+)'): duration = max([int(m) for m in matches]) * multiplier max_duration_score = max(max_duration_score, min(duration / 6.0, 1.0)) else: max_duration_score = max(max_duration_score, multiplier) score += max_duration_score * 0.4 # Quality indicators quality_keywords = ['company', 'startup', 'corporation', 'project', 'developed', 'implemented', 'built', 'deployed', 'managed', 'led'] quality_count = sum(1 for kw in quality_keywords if kw in text_lower) score += min(quality_count / len(quality_keywords), 1.0) * 0.4 # Length indicates detail score += min(len(text) / 500, 1.0) * 0.2 return min(score, 1.0) def _assess_extracurricular_quality(self, text: str) -> float: """Extract extracurricular quality from text""" if not text or len(text) < 20: return 0.0 score = 0.0 text_lower = text.lower() # Leadership indicators leadership_keywords = ['led', 'organized', 'president', 'captain', 'head', 'coordinator', 'managed', 'founded'] leadership_count = sum(1 for kw in leadership_keywords if kw in text_lower) score += min(leadership_count / 3, 1.0) * 0.4 # Activity types activity_keywords = ['club', 'society', 'competition', 'hackathon', 'event', 'volunteer', 'sports', 'cultural', 'technical'] activity_count = sum(1 for kw in activity_keywords if kw in text_lower) score += min(activity_count / 4, 1.0) * 0.4 # Detail level score += min(len(text) / 400, 1.0) * 0.2 return min(score, 1.0) def _assess_certification_quality(self, text: str) -> float: """Extract certification quality from text""" if not text or len(text) < 20: return 0.0 score = 0.0 text_lower = text.lower() # Platform indicators (reputable sources) platform_keywords = ['coursera', 'udemy', 'edx', 'linkedin', 'google', 'microsoft', 'aws', 'azure', 'ibm', 'oracle'] platform_count = sum(1 for kw in platform_keywords if kw in text_lower) score += min(platform_count / 3, 1.0) * 0.4 # Technical skills tech_keywords = ['python', 'java', 'machine learning', 'data science', 'cloud', 'programming', 'development', 'database', 'web', 'mobile'] tech_count = sum(1 for kw in tech_keywords if kw in text_lower) score += min(tech_count / 4, 1.0) * 0.4 # Detail level score += min(len(text) / 400, 1.0) * 0.2 return min(score, 1.0) def _get_feature_description(self, feature: str, value: float) -> str: """Get human-readable description of feature""" descriptions = { 'cgpa_norm': f"CGPA performance: {value*10:.1f}/10", 'sgpa_trend': "Strong upward trend in semester grades" if value > 0.6 else "Declining semester grades", 'sgpa_consistency': "Very consistent semester performance" if value > 0.7 else "Inconsistent semester performance", 'marks_consistency': "Consistent performance across academics" if value > 0.7 else "Variable academic performance", 'academic_improvement': "Clear improvement over time" if value > 0.7 else "Limited academic growth", 'internship_exposure': "Strong internship experience" if value > 0.6 else "Limited internship exposure", 'ec_quality': "Excellent extracurricular involvement" if value > 0.6 else "Limited extracurricular activities", 'cert_quality': "Strong certification portfolio" if value > 0.6 else "Few professional certifications" } return descriptions.get(feature, feature)