"""
Text Feature Extractor - IMPROVED VERSION

Extracts text features (keys ``t0`` .. ``t9``) from conversation transcripts
to detect busy/distracted states. ``t0``-``t8`` carry information; ``t9`` is a
constant placeholder kept so the output schema stays stable.

KEY IMPROVEMENTS:
1. Uses an NLI model for intent classification (understands "not busy" properly)
2. Handles negation and context, both in the model path and in the regex fallback
3. t9_latency is always 0.0 because only one side of the audio is available
"""

import re
from functools import lru_cache
from typing import Dict, List, Optional, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


@lru_cache(maxsize=32)
def _compile_keyword_pattern(keywords: Tuple[str, ...]) -> "re.Pattern":
    """Compile a whole-word alternation matching any of *keywords*.

    Longer phrases are tried first so a multi-word phrase is preferred over a
    shorter keyword that happens to be its prefix.
    """
    alternatives = sorted(keywords, key=len, reverse=True)
    return re.compile(r"\b(?:" + "|".join(map(re.escape, alternatives)) + r")\b")


class TextFeatureExtractor:
    """Extract the t0-t9 text features used for busy detection."""

    # Internal intent labels shared by the model path and the pattern path.
    _BUSY = "busy"
    _FREE = "free"

    # Candidate labels handed to the zero-shot classifier.
    _INTENT_LABELS = (
        "person is busy or occupied",
        "person is free and available",
        "unclear or neutral",
    )

    # Minimum top-label score required before the model's verdict is trusted.
    _INTENT_CONFIDENCE = 0.6

    # How many words before a pattern match are inspected for a negator.
    _NEGATION_WINDOW = 2

    # A single token that negates the phrase following it. "no" is deliberately
    # absent: in "no I have time" it is an interjection, not a negation.
    _NEGATOR_TOKEN = re.compile(r"(?:not|never|cannot|dont|cant|wont|\w+n't)")

    def __init__(self, use_intent_model: bool = True):
        """
        Initialize NLP models

        Args:
            use_intent_model: If True, use BART-MNLI for intent classification
                              If False, fall back to pattern matching
        """
        self.use_intent_model = use_intent_model
        self.intent_classifier = None
        self._intent_cache = None  # (transcript, label) of the last model call

        print("Loading NLP models...")

        # Sentiment model
        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model=model_name,
            device=-1,
        )
        print("[OK] Sentiment model loaded")

        # Coherence model
        self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("[OK] Coherence model loaded")

        # Always setup patterns — busy_keywords is needed by
        # extract_marker_counts() and the regexes are the intent fallback.
        self._setup_patterns()

        # Intent classification model (understands context)
        if self.use_intent_model:
            try:
                self.intent_classifier = pipeline(
                    "zero-shot-classification",
                    model="facebook/bart-large-mnli",
                    device=-1,
                )
                print("[OK] Intent classifier loaded (BART-MNLI)")
            except Exception as e:
                # Best effort: the extractor still works with regex patterns.
                print(f"[WARN] Intent classifier failed to load: {e}")
                print("       Falling back to pattern matching")
                self.use_intent_model = False

    def _setup_patterns(self):
        """Setup pattern-based matching as fallback"""
        # Negation directly attached to a busy/free word, optionally with one
        # word in between ("not really busy"). `\w+n't` covers every
        # contraction (don't, can't, isn't, ...); a bare `\bn't` could never
        # match because there is no word boundary inside "isn't".
        self.negation_pattern = re.compile(
            r"\b(not|no|never|neither|\w+n't|dont|cant|wont|cannot)"
            r"\s+\w*\s*(busy|free|available|talk|rush)",
            re.IGNORECASE,
        )

        # Busy patterns (positive assertions)
        self.busy_patterns = [
            re.compile(source, re.IGNORECASE)
            for source in (
                r"\b(i'm|i am|im)\s+(busy|driving|working|cooking|rushing)\b",
                r"\bin a (meeting|call|hurry)\b",
                r"\bcan't talk\b",
                r"\bcall (you|me) back\b",
                r"\bnot a good time\b",
                r"\bbad time\b",
            )
        ]

        # Free patterns (positive assertions)
        self.free_patterns = [
            re.compile(source, re.IGNORECASE)
            for source in (
                r"\b(i'm|i am|im)\s+(free|available)\b",
                r"\bcan talk\b",
                r"\bhave time\b",
                r"\bnot busy\b",
                r"\bgood time\b",
                r"\bnow works\b",
            )
        ]

        # Legacy keywords for other features
        self.busy_keywords = {
            'cognitive_load': [
                'um', 'uh', 'like', 'you know', 'i mean', 'kind of',
                'sort of', 'basically', 'actually',
            ],
            'time_pressure': [
                'quickly', 'hurry', 'fast', 'urgent', 'asap', 'right now',
                'immediately', 'short', 'brief',
            ],
            'deflection': [
                'later', 'another time', 'not now', 'maybe', 'i don\'t know',
                'whatever', 'sure sure', 'yeah yeah',
            ],
        }

    # ------------------------------------------------------------------
    # Intent detection (shared by T0 and T1)
    # ------------------------------------------------------------------

    def _classify_with_model(self, transcript: str) -> Optional[str]:
        """Classify *transcript* with the zero-shot model.

        Returns ``"busy"``, ``"free"``, or ``None`` when the top label is
        neutral or its score does not exceed the confidence threshold.
        Exceptions raised by the classifier propagate to the caller.

        T0 and T1 ask the same question about the same text, so the most
        recent answer is memoized and the second extractor reuses it instead
        of running another inference.
        """
        cached = getattr(self, "_intent_cache", None)
        if cached is not None and cached[0] == transcript:
            return cached[1]

        result = self.intent_classifier(
            transcript,
            candidate_labels=list(self._INTENT_LABELS),
            hypothesis_template="This {}.",
        )
        top_label = result['labels'][0]
        top_score = result['scores'][0]

        label = None
        # Require high confidence (>0.6) to avoid false positives
        if top_score > self._INTENT_CONFIDENCE:
            if "busy" in top_label:
                label = self._BUSY
            elif "free" in top_label:
                label = self._FREE

        self._intent_cache = (transcript, label)
        return label

    def _is_negated(self, text: str, start: int) -> bool:
        """Tell whether the phrase beginning at *start* is negated.

        Only the words of the same clause (text after the last ``. , ; ! ?``)
        are considered, and only the last ``_NEGATION_WINDOW`` of them, so
        "no, I have time" is not treated as a negation of "have time".
        """
        clause = re.split(r"[.,;!?]", text[:start])[-1]
        preceding = clause.split()[-self._NEGATION_WINDOW:]
        return any(self._NEGATOR_TOKEN.fullmatch(token) for token in preceding)

    def _classify_by_patterns(self, transcript: str) -> Optional[str]:
        """Regex fallback: return ``"busy"``, ``"free"``, or ``None``.

        Order of checks:
        1. A negator attached to a busy/free word ("not busy", "can't talk").
        2. Free phrases, then busy phrases. A phrase preceded by a negator
           flips to the opposite label, so "not a good time" and
           "don't have time" count as busy and "not in a meeting" as free.
        """
        text = transcript.lower()

        negation = self.negation_pattern.search(text)
        if negation:
            phrase = negation.group(0)
            if any(word in phrase for word in ('busy', 'rush')):
                return self._FREE  # "not busy" = available
            if any(word in phrase for word in ('free', 'available', 'talk')):
                return self._BUSY  # "can't talk" or "not free" = busy

        ordered = (
            (self._FREE, self._BUSY, self.free_patterns),
            (self._BUSY, self._FREE, self.busy_patterns),
        )
        for label, opposite, patterns in ordered:
            for pattern in patterns:
                match = pattern.search(text)
                if match:
                    if self._is_negated(text, match.start()):
                        return opposite
                    return label
        return None

    def _detect_intent(self, transcript: str) -> Optional[str]:
        """Resolve the speaker's stated availability.

        Uses the NLI model when enabled; if the model call raises, the error
        is printed and the regex fallback is used instead. Transcripts with
        fewer than three non-blank characters yield ``None``.
        """
        if not transcript or len(transcript.strip()) < 3:
            return None

        # Method 1: Use intent classification model (best)
        if self.use_intent_model:
            try:
                return self._classify_with_model(transcript)
            except Exception as e:
                print(f"Intent classification failed: {e}")
                # Fall through to pattern matching

        # Method 2: Pattern-based with negation handling (fallback)
        return self._classify_by_patterns(transcript)

    def extract_explicit_busy(self, transcript: str) -> float:
        """
        T1: Explicit Busy Indicators (binary: 0 or 1)

        Uses the NLI model (or the regex fallback) to understand context and
        negation:
        - "I'm busy" → 1.0
        - "I'm not busy" → 0.0
        - "Can't talk right now" → 1.0
        - "I can talk" → 0.0
        """
        return 1.0 if self._detect_intent(transcript) == self._BUSY else 0.0

    def _extract_busy_patterns(self, transcript: str) -> float:
        """Pattern-based busy detection with negation handling.

        Kept for callers of the previous private API; returns 1.0 when the
        regex fallback classifies the transcript as busy, else 0.0.
        """
        return 1.0 if self._classify_by_patterns(transcript) == self._BUSY else 0.0

    def extract_explicit_free(self, transcript: str) -> float:
        """
        T0: Explicit Free Indicators (binary: 0 or 1)

        Uses the same context-aware classification as busy detection, so the
        two features can never both be 1.0 for the same transcript.
        """
        return 1.0 if self._detect_intent(transcript) == self._FREE else 0.0

    # ------------------------------------------------------------------
    # Remaining features
    # ------------------------------------------------------------------

    def extract_response_patterns(self, transcript_list: List[str]) -> Tuple[float, float]:
        """
        T2-T3: Average Response Length and Short Response Ratio

        Returns:
            - avg_response_len: Average words per response
            - short_ratio: Fraction of responses with ≤3 words

        Both values are 0.0 when *transcript_list* is empty.
        """
        if not transcript_list:
            return 0.0, 0.0

        word_counts = [len(response.split()) for response in transcript_list]
        avg_response_len = np.mean(word_counts)
        short_count = sum(1 for wc in word_counts if wc <= 3)
        short_ratio = short_count / len(word_counts)

        return float(avg_response_len), float(short_ratio)

    @staticmethod
    def _count_markers(text: str, keywords: List[str]) -> int:
        """Count whole-word occurrences of any of *keywords* in *text*."""
        if not keywords:
            return 0
        return len(_compile_keyword_pattern(tuple(keywords)).findall(text))

    def extract_marker_counts(self, transcript: str) -> Tuple[float, float, float]:
        """
        T4-T6: Cognitive Load, Time Pressure, Deflection markers

        Keywords are matched as whole words/phrases (so "um" does not fire on
        "summer") and every occurrence is counted.

        Returns:
            - cognitive_load: Count of filler words / total words
            - time_pressure: Count of urgency markers / total words
            - deflection: Count of deflection phrases / total words

        All three are 0.0 for an empty transcript.
        """
        if not transcript:
            return 0.0, 0.0, 0.0

        total_words = len(transcript.split())
        if total_words == 0:
            return 0.0, 0.0, 0.0

        transcript_lower = transcript.lower()
        cognitive_load, time_pressure, deflection = (
            self._count_markers(transcript_lower, self.busy_keywords[category]) / total_words
            for category in ('cognitive_load', 'time_pressure', 'deflection')
        )

        return float(cognitive_load), float(time_pressure), float(deflection)

    def extract_sentiment(self, transcript: str) -> float:
        """
        T7: Sentiment Polarity (-1 to +1)

        Negative sentiment often indicates stress/frustration. Returns the
        model score signed by its label, 0.0 for a neutral label, an empty
        transcript, or when the model call fails.
        """
        if not transcript or len(transcript.strip()) == 0:
            return 0.0

        try:
            # Only the first 512 characters are sent to the model.
            result = self.sentiment_model(transcript[:512])[0]
            label = result['label'].lower()
            score = result['score']

            if 'positive' in label:
                return float(score)
            elif 'negative' in label:
                return float(-score)
            else:
                return 0.0
        except Exception as e:
            print(f"Sentiment extraction error: {e}")
            return 0.0

    def extract_coherence(self, question: str, responses: List[str]) -> float:
        """
        T8: Coherence Score (0 to 1)

        Measures how relevant responses are to the question as the mean cosine
        similarity between their embeddings, clamped to [0, 1].
        Low coherence = distracted/not paying attention.

        Returns 0.5 (neutral) when there is no question, no responses, or the
        embedding model fails.
        """
        if not question or not responses:
            return 0.5  # Neutral if no data

        try:
            # Encode question and responses
            question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
            response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)

            # Cosine similarity of the question against every response
            similarities = util.cos_sim(question_embedding, response_embeddings)[0]

            # Average similarity as coherence score
            coherence = float(np.mean(similarities.cpu().numpy()))
            return max(0.0, min(1.0, coherence))  # Clamp to [0, 1]
        except Exception as e:
            print(f"Coherence extraction error: {e}")
            return 0.5

    def extract_latency(self, events: List[Dict]) -> float:
        """
        T9: Average Response Latency (seconds)

        ⚠️ WARNING: This feature is USELESS for single-side audio!
        Always returns 0.0 since we don't have agent questions.
        Kept for compatibility with existing models.

        events: List of dicts with 'timestamp' and 'speaker' keys (ignored)
        """
        # Always return 0 for single-side audio
        return 0.0

    def extract_all(
        self,
        transcript_list: List[str],
        full_transcript: str = "",
        question: str = "",
        events: Optional[List[Dict]] = None,
    ) -> Dict[str, float]:
        """
        Extract all text features (t0-t9)

        Args:
            transcript_list: List of individual responses (can be single item
                for one-turn). ``None`` is treated as an empty list and a bare
                string as a single response.
            full_transcript: Complete conversation text; built by joining
                *transcript_list* with spaces when omitted
            question: The question/prompt from agent (for coherence)
            events: List of timestamped events (unused for single-side audio)

        Returns:
            Dict with keys: t0_explicit_free, t1_explicit_busy,
            t2_avg_resp_len, t3_short_ratio, t4_cognitive_load,
            t5_time_pressure, t6_deflection, t7_sentiment, t8_coherence,
            t9_latency
        """
        # Normalize the response list so " ".join() below neither crashes on
        # None nor splits a lone string into characters.
        if transcript_list is None:
            transcript_list = []
        elif isinstance(transcript_list, str):
            transcript_list = [transcript_list]

        features = {}

        # Use full transcript if not provided separately
        if not full_transcript:
            full_transcript = " ".join(transcript_list)

        # T0-T1: Explicit indicators (NLI with regex fallback)
        features['t0_explicit_free'] = self.extract_explicit_free(full_transcript)
        features['t1_explicit_busy'] = self.extract_explicit_busy(full_transcript)

        # T2-T3: Response patterns
        avg_len, short_ratio = self.extract_response_patterns(transcript_list)
        features['t2_avg_resp_len'] = avg_len
        features['t3_short_ratio'] = short_ratio

        # T4-T6: Markers
        cog_load, time_press, deflect = self.extract_marker_counts(full_transcript)
        features['t4_cognitive_load'] = cog_load
        features['t5_time_pressure'] = time_press
        features['t6_deflection'] = deflect

        # T7: Sentiment
        features['t7_sentiment'] = self.extract_sentiment(full_transcript)

        # T8: Coherence (default to 0.5 if no question provided)
        if question:
            features['t8_coherence'] = self.extract_coherence(question, transcript_list)
        else:
            features['t8_coherence'] = 0.5  # Neutral

        # T9: Latency (ALWAYS 0 for single-side audio)
        features['t9_latency'] = self.extract_latency(events if events is not None else [])

        return features


def main() -> None:
    """Smoke-test the extractor on a handful of example utterances."""
    print("Initializing Text Feature Extractor...")
    extractor = TextFeatureExtractor(use_intent_model=True)

    # Test cases for intent classification
    test_cases = [
        "I'm driving right now",
        "I'm not busy at all",
        "Can't talk, in a meeting",
        "I can talk now",
        "Not a good time",
        "I have time to chat",
    ]

    print("\nTesting intent classification:")
    for test in test_cases:
        busy_score = extractor.extract_explicit_busy(test)
        free_score = extractor.extract_explicit_free(test)
        print(f"  '{test}'")
        print(f"    → Busy: {busy_score:.1f}, Free: {free_score:.1f}")

    # Full feature extraction
    print("\nFull feature extraction:")
    features = extractor.extract_all(
        transcript_list=["I'm not busy", "I can talk now"],
        full_transcript="I'm not busy. I can talk now.",
        question="How are you doing today?",
    )

    print("\nExtracted features:")
    for key, value in features.items():
        print(f"  {key}: {value:.3f}")


if __name__ == "__main__":
    main()