# Spaces: parthnuwal7/FCT · Sleeping · File size: 7,812 Bytes · 3d015cd

"""Text Embeddings Module - NLP-based Scoring"""
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Dict, Tuple
import re

class TextModule:
    """Scores text responses using SBERT embeddings and heuristics.

    Three free-text answers are expected, keyed ``text_q1`` (strengths),
    ``text_q2`` (career interests) and ``text_q3`` (extracurriculars).
    Every feature and the overall score lie in ``[0, 1]``.
    """

    # Answers shorter than this many (stripped) characters are treated as
    # "too short to assess" and receive SHORT_RESPONSE_SCORE.
    MIN_RESPONSE_CHARS = 50
    SHORT_RESPONSE_SCORE = 0.2

    # Contribution of each feature to the overall text score (sums to 1.0).
    FEATURE_WEIGHTS = {
        'writing_quality': 0.25,
        'intent_coherence': 0.25,
        'leadership_score': 0.30,
        'content_depth': 0.20,
    }

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-embedding model and pre-compute references.

        Args:
            model_name: Name passed to ``SentenceTransformer``. Defaults to
                the model this module has always used.
        """
        # Load SBERT model
        self.model = SentenceTransformer(model_name)

        # Reference embeddings for ideal responses
        self.reference_embeddings = {
            'strengths': self.model.encode([
                "I have strong technical skills in programming, problem-solving, and software development",
                "My strengths include leadership, communication, and analytical thinking",
                "I excel at teamwork, project management, and innovative solutions"
            ]),
            'career': self.model.encode([
                "I am interested in software engineering and technology innovation",
                "I want to work in data science and machine learning",
                "My goal is to become a product manager and lead technical teams"
            ])
        }

        # Leadership keywords
        self.leadership_keywords = [
            'lead', 'leader', 'leadership', 'managed', 'organized', 'president',
            'head', 'coordinator', 'captain', 'founded', 'initiated', 'directed'
        ]

    def score(self, text_responses: Dict[str, str]) -> Tuple[float, float, Dict]:
        """Calculate the text score from the three textual responses.

        Args:
            text_responses: Mapping with optional keys ``text_q1``,
                ``text_q2`` and ``text_q3``. Missing keys and ``None``
                values are treated as empty answers.

        Returns:
            ``(score, confidence, features)`` where ``features`` maps each
            feature name to its individual score.
        """
        text_responses = text_responses or {}

        # `.get(key, '')` alone would still hand back None when the key is
        # present with a null value, which breaks the `.split()` calls below.
        text_q1 = text_responses.get('text_q1') or ''
        text_q2 = text_responses.get('text_q2') or ''
        text_q3 = text_responses.get('text_q3') or ''

        features = {
            # Writing quality (text_q1 - strengths)
            'writing_quality': self._assess_writing_quality(text_q1),
            # Intent coherence (text_q2 - career interests)
            'intent_coherence': self._assess_intent_coherence(text_q2),
            # Leadership flag (text_q3 - extracurriculars)
            'leadership_score': self._assess_leadership(text_q3),
            # Content depth (all responses)
            'content_depth': self._assess_content_depth(text_q1, text_q2, text_q3),
        }

        text_score = sum(
            features[name] * weight
            for name, weight in self.FEATURE_WEIGHTS.items()
        )

        # Confidence reflects how complete the answers are, not their quality.
        confidence = self._calculate_confidence(text_q1, text_q2, text_q3)

        return text_score, confidence, features

    def _assess_writing_quality(self, text: str) -> float:
        """Assess writing quality using length/structure heuristics.

        Returns ``SHORT_RESPONSE_SCORE`` for answers under
        ``MIN_RESPONSE_CHARS`` characters, otherwise a value in ``[0.6, 1.0]``.
        """
        # Strip first so padding neither inflates the length nor hides the
        # first real character from the capitalisation check.
        text = (text or '').strip()
        if len(text) < self.MIN_RESPONSE_CHARS:
            return self.SHORT_RESPONSE_SCORE

        score = 0.5  # Base score

        # Length check (150-300 words ideal)
        words = text.lower().split()
        word_count = len(words)
        if 150 <= word_count <= 300:
            score += 0.3
        elif 100 <= word_count < 150 or 300 < word_count <= 400:
            score += 0.2
        else:
            score += 0.1

        # Sentence structure (multiple sentences). re.split leaves an empty
        # fragment after the final punctuation mark; drop blank fragments so
        # four sentences are not counted as five.
        sentences = [part for part in re.split(r'[.!?]+', text) if part.strip()]
        if len(sentences) >= 5:
            score += 0.1

        # Proper capitalization
        if text[0].isupper():
            score += 0.05

        # No excessive repetition
        unique_ratio = len(set(words)) / len(words) if words else 0
        if unique_ratio > 0.6:
            score += 0.05

        return min(score, 1.0)

    def _assess_intent_coherence(self, text: str) -> float:
        """Assess career intent coherence using embeddings.

        The answer is embedded and compared (cosine similarity) with each
        reference career embedding; the best similarity is mapped from
        ``[-1, 1]`` to ``[0, 1]``.

        Returns:
            A built-in ``float`` so the value can be JSON-serialised.
        """
        text = (text or '').strip()
        if len(text) < self.MIN_RESPONSE_CHARS:
            return self.SHORT_RESPONSE_SCORE

        response = np.asarray(self.model.encode([text])[0], dtype=float)
        references = np.atleast_2d(
            np.asarray(self.reference_embeddings['career'], dtype=float)
        )

        if references.size == 0:
            best_similarity = 0.0
        else:
            dots = references @ response
            norms = np.linalg.norm(references, axis=1) * np.linalg.norm(response)
            # A zero-norm vector has no direction; treat its similarity as 0
            # instead of dividing by zero and propagating NaN.
            similarities = np.divide(
                dots, norms, out=np.zeros_like(dots), where=norms > 0
            )
            best_similarity = float(similarities.max())

        # Normalize to 0-1 (cosine similarity is -1 to 1); clamp away
        # floating-point overshoot.
        score = (best_similarity + 1) / 2
        return float(min(max(score, 0.0), 1.0))

    def _assess_leadership(self, text: str) -> float:
        """Assess leadership based on distinct leadership keywords.

        Keywords are matched as whole words (optionally followed by ``s`` or
        ``ing``), so "leadership" counts once rather than as ``lead`` +
        ``leader`` + ``leadership``, and words such as "ahead" do not match
        ``head``.
        """
        text = (text or '').strip()
        if len(text) < self.MIN_RESPONSE_CHARS:
            return self.SHORT_RESPONSE_SCORE

        text_lower = text.lower()

        # Count distinct keywords that occur as whole words.
        keywords = [kw.lower() for kw in self.leadership_keywords if kw]
        if keywords:
            pattern = re.compile(
                r'\b(' + '|'.join(re.escape(kw) for kw in keywords) + r')(?:s|ing)?\b'
            )
            keyword_count = len({m.group(1) for m in pattern.finditer(text_lower)})
        else:
            keyword_count = 0

        # Base score on keyword presence
        if keyword_count >= 3:
            score = 1.0
        elif keyword_count == 2:
            score = 0.8
        elif keyword_count == 1:
            score = 0.6
        else:
            score = 0.3

        # Bonus for specific leadership phrases
        if 'led a team' in text_lower or 'team lead' in text_lower:
            score = min(score + 0.1, 1.0)

        return score

    def _assess_content_depth(self, text_q1: str, text_q2: str, text_q3: str) -> float:
        """Assess overall content depth from the combined word count."""
        total_words = sum(
            len((text or '').split()) for text in (text_q1, text_q2, text_q3)
        )

        # (minimum total words, score), highest tier first.
        tiers = (
            (450, 1.0),  # 150+ words each
            (300, 0.8),
            (200, 0.6),
            (100, 0.4),
        )
        for minimum, tier_score in tiers:
            if total_words >= minimum:
                return tier_score
        return 0.2

    def _calculate_confidence(self, text_q1: str, text_q2: str, text_q3: str) -> float:
        """Calculate confidence based on how complete the answers are.

        Each answer contributes 0 (empty), 0.3 (< 50 chars), 0.6 (< 100
        chars) or 1.0; the mean is returned as a built-in ``float``.
        """
        scores = []

        for text in (text_q1, text_q2, text_q3):
            # Whitespace-only answers are as empty as missing ones.
            length = len((text or '').strip())
            if length == 0:
                scores.append(0.0)
            elif length < self.MIN_RESPONSE_CHARS:
                scores.append(0.3)
            elif length < 100:
                scores.append(0.6)
            else:
                scores.append(1.0)

        return float(np.mean(scores))

    def explain(self, features: Dict) -> Dict:
        """Generate an explanation for the text scores.

        Args:
            features: Feature dict as returned by :meth:`score`. A missing
                feature is treated as 0.

        Returns:
            ``{'highlights': [...], 'suggestions': [...]}``. A highlight is
            emitted for a feature above 0.7, a suggestion for one below 0.5.
        """
        highlight_rules = (
            ('writing_quality', "Strong writing quality with clear communication"),
            ('leadership_score', "Demonstrated leadership experience and initiative"),
            ('intent_coherence', "Clear and coherent career goals"),
        )
        suggestion_rules = (
            ('writing_quality', "Provide more detailed responses (aim for 150-300 words each)"),
            ('leadership_score', "Highlight specific leadership roles and their impact"),
            ('content_depth', "Include more specific examples and achievements"),
        )

        return {
            'highlights': [
                message for key, message in highlight_rules
                if features.get(key, 0) > 0.7
            ],
            'suggestions': [
                message for key, message in suggestion_rules
                if features.get(key, 0) < 0.5
            ],
        }