File size: 10,073 Bytes
3d015cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
"""Universal Module - Academic & Experience Scoring"""
import numpy as np
import re
from typing import Dict, Tuple

class UniversalModule:
    """Scores based on academic performance and experience"""
    
    def __init__(self):
        self.feature_weights = {
            'cgpa_norm': 0.30,
            'sgpa_trend': 0.15,
            'sgpa_consistency': 0.10,
            'marks_consistency': 0.10,
            'academic_improvement': 0.10,
            'internship_exposure': 0.10,
            'ec_quality': 0.08,
            'cert_quality': 0.07
        }
    
    def score(self, student_data: Dict) -> Tuple[float, float, Dict]:
        """
        Calculate universal score
        Returns: (score, confidence, features_dict)
        """
        features = {}
        
        # CGPA normalization (0-10 scale)
        cgpa = student_data.get('cgpa', 0)
        features['cgpa_norm'] = min(cgpa / 10.0, 1.0)
        
        # SGPA trend (improvement across semesters) - filter out null values
        sgpa_values = []
        for sem_num in range(1, 9):
            sem_val = student_data.get(f'sgpa_sem{sem_num}')
            if sem_val is not None and sem_val > 0:  # Ignore null/zero values
                sgpa_values.append(sem_val)
        
        if len(sgpa_values) >= 2:
            # Calculate trend from first to last available semester
            trend = (sgpa_values[-1] - sgpa_values[0]) / 10.0  # Normalize
            features['sgpa_trend'] = max(0, min(trend + 0.5, 1.0))  # Center at 0.5
        else:
            features['sgpa_trend'] = 0.5  # Neutral if insufficient data
        
        # SGPA consistency (lower std = more consistent = better)
        if len(sgpa_values) >= 3:
            std_dev = np.std(sgpa_values)
            features['sgpa_consistency'] = max(0, 1 - (std_dev / 3.0))  # Inverse relationship
        else:
            features['sgpa_consistency'] = 0.5
        
        # Marks consistency across 10th, 12th, CGPA
        tenth = student_data.get('tenth_pct')
        twelfth = student_data.get('twelfth_pct')
        
        if tenth and twelfth and cgpa:
            cgpa_pct = (cgpa / 10.0) * 100
            marks_std = np.std([tenth, twelfth, cgpa_pct])
            features['marks_consistency'] = max(0, 1 - (marks_std / 30.0))
        else:
            features['marks_consistency'] = 0.5
        
        # Academic improvement flag
        if tenth and twelfth and cgpa:
            cgpa_pct = (cgpa / 10.0) * 100
            if cgpa_pct > twelfth and twelfth > tenth:
                features['academic_improvement'] = 1.0
            elif cgpa_pct > twelfth or twelfth > tenth:
                features['academic_improvement'] = 0.7
            else:
                features['academic_improvement'] = 0.3
        else:
            features['academic_improvement'] = 0.5
        
        # Extract features from text responses (handle None values)
        internship_text = student_data.get('internship_text') or ''
        ec_text = student_data.get('extracurricular_text') or ''
        cert_text = student_data.get('certifications_text') or ''
        
        # Internship exposure - extract from text
        features['internship_exposure'] = self._assess_internship_quality(internship_text)
        
        # Extracurricular quality - extract from text
        features['ec_quality'] = self._assess_extracurricular_quality(ec_text)
        
        # Certification quality - extract from text
        features['cert_quality'] = self._assess_certification_quality(cert_text)
        
        # Calculate weighted score
        score = sum(features[k] * self.feature_weights[k] for k in features.keys())
        
        # Calculate confidence based on data completeness
        total_fields = 8
        filled_fields = sum([
            1 if cgpa > 0 else 0,
            1 if len(sgpa_values) >= 2 else 0,
            1 if len(sgpa_values) >= 3 else 0,
            1 if tenth and twelfth else 0,
            1 if tenth and twelfth and cgpa else 0,
            1 if len(internship_text) > 20 else 0,
            1 if len(ec_text) > 20 else 0,
            1 if len(cert_text) > 20 else 0
        ])
        confidence = filled_fields / total_fields
        
        return score, confidence, features
    
    def explain(self, features: Dict) -> Dict:
        """Generate explanation for scores"""
        explanations = {
            'top_positive_features': [],
            'top_negative_features': []
        }
        
        # Sort features by value
        sorted_features = sorted(features.items(), key=lambda x: x[1], reverse=True)
        
        # Top 3 positive
        for feat, val in sorted_features[:3]:
            if val > 0.6:
                explanations['top_positive_features'].append({
                    'feature': feat,
                    'value': round(val, 2),
                    'description': self._get_feature_description(feat, val)
                })
        
        # Top 3 negative
        for feat, val in sorted_features[-3:]:
            if val < 0.4:
                explanations['top_negative_features'].append({
                    'feature': feat,
                    'value': round(val, 2),
                    'description': self._get_feature_description(feat, val)
                })
        
        return explanations
    
    def _assess_internship_quality(self, text: str) -> float:
        """Extract internship quality from text"""
        if not text or len(text) < 20:
            return 0.0
        
        score = 0.0
        text_lower = text.lower()
        
        # Duration indicators
        duration_patterns = [
            (r'\b(\d+)\s*months?\b', 1.0),
            (r'\b(\d+)\s*weeks?\b', 0.25),
            (r'summer\s+internship', 0.5),
            (r'year\s+long|full\s+year|annual', 1.0),
        ]
        
        max_duration_score = 0.0
        for pattern, multiplier in duration_patterns:
            matches = re.findall(pattern, text_lower)
            if matches:
                if pattern.startswith(r'\b(\d+)'):
                    duration = max([int(m) for m in matches]) * multiplier
                    max_duration_score = max(max_duration_score, min(duration / 6.0, 1.0))
                else:
                    max_duration_score = max(max_duration_score, multiplier)
        
        score += max_duration_score * 0.4
        
        # Quality indicators
        quality_keywords = ['company', 'startup', 'corporation', 'project', 'developed', 
                          'implemented', 'built', 'deployed', 'managed', 'led']
        quality_count = sum(1 for kw in quality_keywords if kw in text_lower)
        score += min(quality_count / len(quality_keywords), 1.0) * 0.4
        
        # Length indicates detail
        score += min(len(text) / 500, 1.0) * 0.2
        
        return min(score, 1.0)
    
    def _assess_extracurricular_quality(self, text: str) -> float:
        """Extract extracurricular quality from text"""
        if not text or len(text) < 20:
            return 0.0
        
        score = 0.0
        text_lower = text.lower()
        
        # Leadership indicators
        leadership_keywords = ['led', 'organized', 'president', 'captain', 'head', 
                             'coordinator', 'managed', 'founded']
        leadership_count = sum(1 for kw in leadership_keywords if kw in text_lower)
        score += min(leadership_count / 3, 1.0) * 0.4
        
        # Activity types
        activity_keywords = ['club', 'society', 'competition', 'hackathon', 'event', 
                           'volunteer', 'sports', 'cultural', 'technical']
        activity_count = sum(1 for kw in activity_keywords if kw in text_lower)
        score += min(activity_count / 4, 1.0) * 0.4
        
        # Detail level
        score += min(len(text) / 400, 1.0) * 0.2
        
        return min(score, 1.0)
    
    def _assess_certification_quality(self, text: str) -> float:
        """Extract certification quality from text"""
        if not text or len(text) < 20:
            return 0.0
        
        score = 0.0
        text_lower = text.lower()
        
        # Platform indicators (reputable sources)
        platform_keywords = ['coursera', 'udemy', 'edx', 'linkedin', 'google', 
                           'microsoft', 'aws', 'azure', 'ibm', 'oracle']
        platform_count = sum(1 for kw in platform_keywords if kw in text_lower)
        score += min(platform_count / 3, 1.0) * 0.4
        
        # Technical skills
        tech_keywords = ['python', 'java', 'machine learning', 'data science', 'cloud',
                        'programming', 'development', 'database', 'web', 'mobile']
        tech_count = sum(1 for kw in tech_keywords if kw in text_lower)
        score += min(tech_count / 4, 1.0) * 0.4
        
        # Detail level
        score += min(len(text) / 400, 1.0) * 0.2
        
        return min(score, 1.0)

    def _get_feature_description(self, feature: str, value: float) -> str:
        """Get human-readable description of feature"""
        descriptions = {
            'cgpa_norm': f"CGPA performance: {value*10:.1f}/10",
            'sgpa_trend': "Strong upward trend in semester grades" if value > 0.6 else "Declining semester grades",
            'sgpa_consistency': "Very consistent semester performance" if value > 0.7 else "Inconsistent semester performance",
            'marks_consistency': "Consistent performance across academics" if value > 0.7 else "Variable academic performance",
            'academic_improvement': "Clear improvement over time" if value > 0.7 else "Limited academic growth",
            'internship_exposure': "Strong internship experience" if value > 0.6 else "Limited internship exposure",
            'ec_quality': "Excellent extracurricular involvement" if value > 0.6 else "Limited extracurricular activities",
            'cert_quality': "Strong certification portfolio" if value > 0.6 else "Few professional certifications"
        }
        return descriptions.get(feature, feature)