Spaces:

parthnuwal7
/

FCT

Sleeping

File size: 10,073 Bytes

3d015cd

"""Universal Module - Academic & Experience Scoring"""
import numpy as np
import re
from typing import Dict, Tuple

class UniversalModule:
    """Score a student on academic performance and experience.

    ``score`` derives eight features in the range [0, 1] from a student
    record, combines them with ``feature_weights`` into a single weighted
    score, and reports a confidence value based on how many inputs were
    actually supplied.  ``explain`` turns a feature dict into lists of
    notable strengths and weaknesses.
    """

    # Features above/below these cut-offs are reported by ``explain``; the
    # positive cut-off also selects the wording in feature descriptions so a
    # feature listed as a strength is never described as a weakness.
    _POSITIVE_THRESHOLD = 0.6
    _NEGATIVE_THRESHOLD = 0.4

    # Free-text answers shorter than this are treated as "not provided".
    _MIN_TEXT_LENGTH = 20

    def __init__(self):
        # Relative contribution of each feature to the final score (sums to 1).
        self.feature_weights = {
            'cgpa_norm': 0.30,
            'sgpa_trend': 0.15,
            'sgpa_consistency': 0.10,
            'marks_consistency': 0.10,
            'academic_improvement': 0.10,
            'internship_exposure': 0.10,
            'ec_quality': 0.08,
            'cert_quality': 0.07
        }

    def score(self, student_data: Dict) -> Tuple[float, float, Dict]:
        """Calculate the universal score for one student record.

        Args:
            student_data: Mapping read with the keys ``cgpa``,
                ``sgpa_sem1`` .. ``sgpa_sem8``, ``tenth_pct``,
                ``twelfth_pct``, ``internship_text``,
                ``extracurricular_text`` and ``certifications_text``.
                Any key may be missing or hold ``None``.

        Returns:
            ``(score, confidence, features)`` where ``score`` is the weighted
            sum of the features, ``confidence`` is the fraction (0-1) of the
            eight data-completeness checks that passed, and ``features`` maps
            each feature name to its value.
        """
        features = {}

        # CGPA normalization (0-10 scale).  ``or 0`` also covers a key that is
        # present but explicitly None, which would otherwise raise TypeError.
        cgpa = student_data.get('cgpa') or 0
        features['cgpa_norm'] = max(0.0, min(cgpa / 10.0, 1.0))

        # Collect the semesters that actually have a grade (null/zero ignored).
        semester_values = (
            student_data.get(f'sgpa_sem{sem_num}') for sem_num in range(1, 9)
        )
        sgpa_values = [v for v in semester_values if v is not None and v > 0]

        # SGPA trend: change from first to last available semester, shifted so
        # that "no change" sits at 0.5.
        if len(sgpa_values) >= 2:
            trend = (sgpa_values[-1] - sgpa_values[0]) / 10.0
            features['sgpa_trend'] = max(0, min(trend + 0.5, 1.0))
        else:
            features['sgpa_trend'] = 0.5  # Neutral if insufficient data

        # SGPA consistency (lower std = more consistent = better).
        if len(sgpa_values) >= 3:
            std_dev = float(np.std(sgpa_values))
            features['sgpa_consistency'] = max(0, 1 - (std_dev / 3.0))
        else:
            features['sgpa_consistency'] = 0.5

        # Marks consistency and improvement across 10th, 12th and CGPA.
        tenth = student_data.get('tenth_pct')
        twelfth = student_data.get('twelfth_pct')
        has_all_marks = bool(tenth and twelfth and cgpa)

        if has_all_marks:
            cgpa_pct = (cgpa / 10.0) * 100  # put CGPA on the percentage scale
            marks_std = float(np.std([tenth, twelfth, cgpa_pct]))
            features['marks_consistency'] = max(0, 1 - (marks_std / 30.0))

            improved_in_college = cgpa_pct > twelfth
            improved_in_school = twelfth > tenth
            if improved_in_college and improved_in_school:
                features['academic_improvement'] = 1.0
            elif improved_in_college or improved_in_school:
                features['academic_improvement'] = 0.7
            else:
                features['academic_improvement'] = 0.3
        else:
            features['marks_consistency'] = 0.5
            features['academic_improvement'] = 0.5

        # Free-text answers (None is treated as empty).
        internship_text = student_data.get('internship_text') or ''
        ec_text = student_data.get('extracurricular_text') or ''
        cert_text = student_data.get('certifications_text') or ''

        features['internship_exposure'] = self._assess_internship_quality(internship_text)
        features['ec_quality'] = self._assess_extracurricular_quality(ec_text)
        features['cert_quality'] = self._assess_certification_quality(cert_text)

        # Weighted score over every computed feature.
        score = float(sum(
            value * self.feature_weights[name]
            for name, value in features.items()
        ))

        # Confidence: share of the eight completeness checks that pass.
        completeness_checks = (
            cgpa > 0,
            len(sgpa_values) >= 2,
            len(sgpa_values) >= 3,
            bool(tenth and twelfth),
            has_all_marks,
            len(internship_text) > self._MIN_TEXT_LENGTH,
            len(ec_text) > self._MIN_TEXT_LENGTH,
            len(cert_text) > self._MIN_TEXT_LENGTH,
        )
        confidence = sum(completeness_checks) / len(completeness_checks)

        return score, confidence, features

    def explain(self, features: Dict) -> Dict:
        """Generate an explanation for a feature dict.

        Args:
            features: Mapping of feature name to value, as returned by
                ``score``.

        Returns:
            Dict with ``top_positive_features`` (up to three of the highest
            features whose value exceeds 0.6) and ``top_negative_features``
            (up to three of the lowest features whose value is below 0.4).
            Each entry holds ``feature``, ``value`` (rounded to 2 decimals)
            and ``description``.
        """
        # Highest value first.
        sorted_features = sorted(features.items(), key=lambda x: x[1], reverse=True)

        def entry(feat, val):
            return {
                'feature': feat,
                'value': round(val, 2),
                'description': self._get_feature_description(feat, val)
            }

        return {
            'top_positive_features': [
                entry(feat, val)
                for feat, val in sorted_features[:3]
                if val > self._POSITIVE_THRESHOLD
            ],
            'top_negative_features': [
                entry(feat, val)
                for feat, val in sorted_features[-3:]
                if val < self._NEGATIVE_THRESHOLD
            ],
        }

    @staticmethod
    def _count_keywords(text_lower: str, keywords) -> int:
        """Count how many keywords occur in ``text_lower`` at a word start.

        Anchoring at a word boundary avoids false hits such as 'led' inside
        "knowledge" or 'aws' inside "laws", while still accepting inflected
        forms like "clubs" or "volunteering".
        """
        return sum(
            1 for kw in keywords
            if re.search(r'\b' + re.escape(kw), text_lower)
        )

    def _assess_internship_quality(self, text: str) -> float:
        """Estimate internship quality (0-1) from a free-text description.

        The result combines duration (40%), presence of quality keywords
        (40%) and the amount of detail, measured by text length (20%).
        Text that is empty or shorter than 20 characters scores 0.0.
        """
        if not text or len(text) < self._MIN_TEXT_LENGTH:
            return 0.0

        text_lower = text.lower()

        # (pattern, multiplier, captures_a_number).  Numeric patterns convert
        # the captured count to months; the others map straight to a score.
        # The separator class also accepts hyphens ("6-month", "year-long").
        duration_patterns = [
            (r'\b(\d+)[\s-]*months?\b', 1.0, True),
            (r'\b(\d+)[\s-]*weeks?\b', 0.25, True),
            (r'summer\s+internship', 0.5, False),
            (r'year[\s-]+long|full\s+year|annual', 1.0, False),
        ]

        max_duration_score = 0.0
        for pattern, multiplier, is_numeric in duration_patterns:
            matches = re.findall(pattern, text_lower)
            if not matches:
                continue
            if is_numeric:
                months = max(int(m) for m in matches) * multiplier
                candidate = min(months / 6.0, 1.0)  # six months earns full marks
            else:
                candidate = multiplier
            max_duration_score = max(max_duration_score, candidate)

        score = max_duration_score * 0.4

        # Quality indicators
        quality_keywords = ['company', 'startup', 'corporation', 'project', 'developed',
                            'implemented', 'built', 'deployed', 'managed', 'led']
        quality_count = self._count_keywords(text_lower, quality_keywords)
        score += min(quality_count / len(quality_keywords), 1.0) * 0.4

        # Length indicates detail
        score += min(len(text) / 500, 1.0) * 0.2

        return min(score, 1.0)

    def _assess_extracurricular_quality(self, text: str) -> float:
        """Estimate extracurricular quality (0-1) from a free-text description.

        The result combines leadership keywords (40%, saturating at three),
        activity-type keywords (40%, saturating at four) and text length
        (20%).  Text that is empty or shorter than 20 characters scores 0.0.
        """
        if not text or len(text) < self._MIN_TEXT_LENGTH:
            return 0.0

        text_lower = text.lower()

        # Leadership indicators
        leadership_keywords = ['led', 'organized', 'president', 'captain', 'head',
                               'coordinator', 'managed', 'founded']
        leadership_count = self._count_keywords(text_lower, leadership_keywords)
        score = min(leadership_count / 3, 1.0) * 0.4

        # Activity types
        activity_keywords = ['club', 'society', 'competition', 'hackathon', 'event',
                             'volunteer', 'sports', 'cultural', 'technical']
        activity_count = self._count_keywords(text_lower, activity_keywords)
        score += min(activity_count / 4, 1.0) * 0.4

        # Detail level
        score += min(len(text) / 400, 1.0) * 0.2

        return min(score, 1.0)

    def _assess_certification_quality(self, text: str) -> float:
        """Estimate certification quality (0-1) from a free-text description.

        The result combines platform keywords (40%, saturating at three),
        technical-skill keywords (40%, saturating at four) and text length
        (20%).  Text that is empty or shorter than 20 characters scores 0.0.
        """
        if not text or len(text) < self._MIN_TEXT_LENGTH:
            return 0.0

        text_lower = text.lower()

        # Platform indicators (reputable sources)
        platform_keywords = ['coursera', 'udemy', 'edx', 'linkedin', 'google',
                             'microsoft', 'aws', 'azure', 'ibm', 'oracle']
        platform_count = self._count_keywords(text_lower, platform_keywords)
        score = min(platform_count / 3, 1.0) * 0.4

        # Technical skills
        tech_keywords = ['python', 'java', 'machine learning', 'data science', 'cloud',
                         'programming', 'development', 'database', 'web', 'mobile']
        tech_count = self._count_keywords(text_lower, tech_keywords)
        score += min(tech_count / 4, 1.0) * 0.4

        # Detail level
        score += min(len(text) / 400, 1.0) * 0.2

        return min(score, 1.0)

    def _get_feature_description(self, feature: str, value: float) -> str:
        """Return a human-readable description of a feature value.

        The positive wording is used when ``value`` exceeds the same 0.6
        cut-off ``explain`` uses for strengths; otherwise the negative
        wording is used.  Unknown feature names are returned unchanged.
        """
        is_positive = value > self._POSITIVE_THRESHOLD

        def pick(positive: str, negative: str) -> str:
            return positive if is_positive else negative

        descriptions = {
            'cgpa_norm': f"CGPA performance: {value*10:.1f}/10",
            'sgpa_trend': pick("Strong upward trend in semester grades",
                               "Declining semester grades"),
            'sgpa_consistency': pick("Very consistent semester performance",
                                     "Inconsistent semester performance"),
            'marks_consistency': pick("Consistent performance across academics",
                                      "Variable academic performance"),
            'academic_improvement': pick("Clear improvement over time",
                                         "Limited academic growth"),
            'internship_exposure': pick("Strong internship experience",
                                        "Limited internship exposure"),
            'ec_quality': pick("Excellent extracurricular involvement",
                               "Limited extracurricular activities"),
            'cert_quality': pick("Strong certification portfolio",
                                 "Few professional certifications")
        }
        return descriptions.get(feature, feature)