"""Text Embeddings Module - NLP-based Scoring"""
import re
from typing import Dict, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer


class TextModule:
    """Scores text responses using SBERT embeddings and heuristics.

    Combines embedding-based similarity (career-intent coherence) with
    simple lexical heuristics (length, sentence count, keyword presence)
    to produce a 0-1 text score, a confidence value, and per-feature
    diagnostics.
    """

    def __init__(self):
        # Load SBERT model (small/fast general-purpose sentence encoder).
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        # Reference embeddings for ideal responses.
        # NOTE(review): 'strengths' is built but never consulted by any
        # scorer below - kept for attribute compatibility; confirm whether
        # a strengths-similarity feature was intended.
        self.reference_embeddings = {
            'strengths': self.model.encode([
                "I have strong technical skills in programming, problem-solving, and software development",
                "My strengths include leadership, communication, and analytical thinking",
                "I excel at teamwork, project management, and innovative solutions"
            ]),
            'career': self.model.encode([
                "I am interested in software engineering and technology innovation",
                "I want to work in data science and machine learning",
                "My goal is to become a product manager and lead technical teams"
            ])
        }

        # Leadership keywords, matched case-insensitively as whole words.
        self.leadership_keywords = [
            'lead', 'leader', 'leadership', 'managed', 'organized',
            'president', 'head', 'coordinator', 'captain', 'founded',
            'initiated', 'directed'
        ]

    def score(self, text_responses: Dict[str, str]) -> Tuple[float, float, Dict]:
        """Calculate text score from 3 textual responses.

        Args:
            text_responses: mapping with keys 'text_q1' (strengths),
                'text_q2' (career interests), 'text_q3' (extracurriculars).
                Missing keys are treated as empty responses.

        Returns:
            (score, confidence, features) where score and confidence are in
            [0, 1] and features maps feature names to their sub-scores.
        """
        features = {}

        text_q1 = text_responses.get('text_q1', '')
        text_q2 = text_responses.get('text_q2', '')
        text_q3 = text_responses.get('text_q3', '')

        # Feature 1: Writing quality (text_q1 - strengths)
        features['writing_quality'] = self._assess_writing_quality(text_q1)

        # Feature 2: Intent coherence (text_q2 - career interests)
        features['intent_coherence'] = self._assess_intent_coherence(text_q2)

        # Feature 3: Leadership flag (text_q3 - extracurriculars)
        features['leadership_score'] = self._assess_leadership(text_q3)

        # Feature 4: Content depth (all responses)
        features['content_depth'] = self._assess_content_depth(text_q1, text_q2, text_q3)

        # Weighted combination; weights sum to 1.0 so the score stays in [0, 1].
        text_score = (
            features['writing_quality'] * 0.25 +
            features['intent_coherence'] * 0.25 +
            features['leadership_score'] * 0.30 +
            features['content_depth'] * 0.20
        )

        # Confidence reflects how complete the three responses are.
        confidence = self._calculate_confidence(text_q1, text_q2, text_q3)

        return text_score, confidence, features

    def _assess_writing_quality(self, text: str) -> float:
        """Assess writing quality using length/structure heuristics.

        Returns a score in [0.2, 1.0]; very short or empty text gets 0.2.
        """
        if not text or len(text) < 50:
            return 0.2

        score = 0.5  # Base score

        # Length check (150-300 words ideal)
        word_count = len(text.split())
        if 150 <= word_count <= 300:
            score += 0.3
        elif 100 <= word_count < 150 or 300 < word_count <= 400:
            score += 0.2
        else:
            score += 0.1

        # Sentence structure: require multiple real sentences. Blank
        # fragments are discarded so a trailing '.' does not count as an
        # extra sentence.
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        if len(sentences) >= 5:
            score += 0.1

        # Proper capitalization of the first character.
        if text[0].isupper():
            score += 0.05

        # No excessive repetition: reward vocabulary diversity.
        words = text.lower().split()
        unique_ratio = len(set(words)) / len(words) if words else 0
        if unique_ratio > 0.6:
            score += 0.05

        return min(score, 1.0)

    def _assess_intent_coherence(self, text: str) -> float:
        """Assess career-intent coherence via embedding similarity.

        Encodes the response and takes the maximum cosine similarity
        against the reference 'career' embeddings, rescaled from [-1, 1]
        to [0, 1]. Short/empty text gets 0.2.
        """
        if not text or len(text) < 50:
            return 0.2

        # Encode the response
        response_embedding = self.model.encode([text])[0]

        # Cosine similarity with each reference career embedding.
        similarities = []
        for ref_emb in self.reference_embeddings['career']:
            similarity = np.dot(response_embedding, ref_emb) / (
                np.linalg.norm(response_embedding) * np.linalg.norm(ref_emb)
            )
            similarities.append(similarity)

        # Take the best match among the references.
        max_similarity = max(similarities) if similarities else 0

        # Normalize to 0-1 (cosine similarity is -1 to 1)
        score = (max_similarity + 1) / 2

        return score

    def _assess_leadership(self, text: str) -> float:
        """Assess leadership based on whole-word keyword presence.

        Keywords are matched with word boundaries so that e.g. 'ahead'
        does not trigger 'head' and 'mislead' does not trigger 'lead'
        (substring matching previously inflated the count: the single
        word 'leadership' matched three keywords at once).
        """
        if not text or len(text) < 50:
            return 0.2

        text_lower = text.lower()

        # Count distinct leadership keywords present as whole words.
        keyword_count = sum(
            1 for keyword in self.leadership_keywords
            if re.search(r'\b' + re.escape(keyword) + r'\b', text_lower)
        )

        # Tiered score based on how many distinct keywords appear.
        if keyword_count >= 3:
            score = 1.0
        elif keyword_count == 2:
            score = 0.8
        elif keyword_count == 1:
            score = 0.6
        else:
            score = 0.3

        # Bonus for specific leadership phrases
        if 'led a team' in text_lower or 'team lead' in text_lower:
            score = min(score + 0.1, 1.0)

        return score

    def _assess_content_depth(self, text_q1: str, text_q2: str, text_q3: str) -> float:
        """Assess overall content depth from the combined word count."""
        total_words = len(text_q1.split()) + len(text_q2.split()) + len(text_q3.split())

        if total_words >= 450:  # 150+ words each
            return 1.0
        elif total_words >= 300:
            return 0.8
        elif total_words >= 200:
            return 0.6
        elif total_words >= 100:
            return 0.4
        else:
            return 0.2

    def _calculate_confidence(self, text_q1: str, text_q2: str, text_q3: str) -> float:
        """Calculate confidence based on per-response completeness.

        Each response contributes 0 (empty), 0.3 (<50 chars),
        0.6 (<100 chars) or 1.0; the result is their mean as a plain float.
        """
        scores = []
        for text in [text_q1, text_q2, text_q3]:
            if not text:
                scores.append(0)
            elif len(text) < 50:
                scores.append(0.3)
            elif len(text) < 100:
                scores.append(0.6)
            else:
                scores.append(1.0)

        # float(...) so callers get a builtin float, not np.float64.
        return float(np.mean(scores))

    def explain(self, features: Dict) -> Dict:
        """Generate human-readable highlights/suggestions for text scores.

        Args:
            features: the feature dict produced by score().

        Returns:
            dict with 'highlights' and 'suggestions' lists of strings.
        """
        explanations = {
            'highlights': [],
            'suggestions': []
        }

        # Highlights for strong (> 0.7) features.
        if features.get('writing_quality', 0) > 0.7:
            explanations['highlights'].append("Strong writing quality with clear communication")
        if features.get('leadership_score', 0) > 0.7:
            explanations['highlights'].append("Demonstrated leadership experience and initiative")
        if features.get('intent_coherence', 0) > 0.7:
            explanations['highlights'].append("Clear and coherent career goals")

        # Suggestions for weak (< 0.5) features.
        if features.get('writing_quality', 0) < 0.5:
            explanations['suggestions'].append("Provide more detailed responses (aim for 150-300 words each)")
        if features.get('leadership_score', 0) < 0.5:
            explanations['suggestions'].append("Highlight specific leadership roles and their impact")
        if features.get('content_depth', 0) < 0.5:
            explanations['suggestions'].append("Include more specific examples and achievements")

        return explanations