# NOTE: the "Spaces: Sleeping" lines that appeared here were Hugging Face
# Spaces page text captured during export — not part of the source code.
| """ | |
| Text Feature Extractor - IMPROVED VERSION | |
| Extracts 9 text features from conversation transcripts to detect busy/distracted states. | |
| KEY IMPROVEMENTS: | |
| 1. Uses NLI model for intent classification (understands "not busy" properly) | |
| 2. Handles negation, context, and sarcasm | |
| 3. Removes useless t9_latency for single-side audio | |
| """ | |
import re
from typing import Dict, List, Optional, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
class TextFeatureExtractor:
    """Extract the text features (t0-t9) used for busy/distracted detection.

    Features produced by :meth:`extract_all` (ten values, t0 through t9):
        t0  explicit "free" indicator       (binary 0/1)
        t1  explicit "busy" indicator       (binary 0/1)
        t2  average response length         (words per response)
        t3  short-response ratio            (fraction of responses <= 3 words)
        t4  cognitive-load marker rate      (filler words per word)
        t5  time-pressure marker rate       (urgency markers per word)
        t6  deflection marker rate          (deflection phrases per word)
        t7  sentiment polarity              (-1 .. +1)
        t8  question/response coherence     (0 .. 1)
        t9  response latency                (always 0.0 for single-side audio)
    """

    def __init__(self, use_intent_model: bool = True):
        """Load the NLP models and compile the fallback patterns.

        Args:
            use_intent_model: If True, use BART-MNLI zero-shot classification
                for intent detection (understands negation/context).
                If False — or if the model fails to load — fall back to
                regex pattern matching.
        """
        self.use_intent_model = use_intent_model
        print("Loading NLP models...")

        # Sentiment model (positive / neutral / negative labels).
        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model=model_name,
            device=-1  # CPU
        )
        print("[OK] Sentiment model loaded")

        # Sentence-embedding model for question/response coherence (t8).
        self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("[OK] Coherence model loaded")

        # Always set up patterns — busy_keywords / the compiled marker
        # patterns are needed by extract_marker_counts() even when the
        # intent model is in use.
        self._setup_patterns()

        # Zero-shot intent classifier; on any load failure we degrade
        # gracefully to pattern matching instead of crashing.
        if self.use_intent_model:
            try:
                self.intent_classifier = pipeline(
                    "zero-shot-classification",
                    model="facebook/bart-large-mnli",
                    device=-1
                )
                print("[OK] Intent classifier loaded (BART-MNLI)")
            except Exception as e:
                print(f"[WARN] Intent classifier failed to load: {e}")
                print(" Falling back to pattern matching")
                self.use_intent_model = False

    def _setup_patterns(self):
        """Compile the regexes used by the fallback detector and the marker counters."""
        # Negation followed (within one optional word) by a busy/free term,
        # e.g. "not busy", "can't talk", "not really free".
        self.negation_pattern = re.compile(
            r'\b(not|no|never|neither|n\'t|dont|don\'t|cannot|can\'t|wont|won\'t)\s+\w*\s*(busy|free|available|talk|rush)',
            re.IGNORECASE
        )
        # Positive assertions of being busy.
        self.busy_patterns = [
            r'\b(i\'m|i am|im)\s+(busy|driving|working|cooking|rushing)\b',
            r'\bin a (meeting|call|hurry)\b',
            r'\bcan\'t talk\b',
            r'\bcall (you|me) back\b',
            r'\bnot a good time\b',
            r'\bbad time\b'
        ]
        # Positive assertions of being free.
        self.free_patterns = [
            r'\b(i\'m|i am|im)\s+(free|available)\b',
            r'\bcan talk\b',
            r'\bhave time\b',
            r'\bnot busy\b',
            r'\bgood time\b',
            r'\bnow works\b'
        ]
        self.busy_patterns = [re.compile(p, re.IGNORECASE) for p in self.busy_patterns]
        self.free_patterns = [re.compile(p, re.IGNORECASE) for p in self.free_patterns]

        # Keyword lists for the marker-rate features (t4-t6). Kept as a
        # public attribute for backward compatibility.
        self.busy_keywords = {
            'cognitive_load': [
                'um', 'uh', 'like', 'you know', 'i mean', 'kind of',
                'sort of', 'basically', 'actually'
            ],
            'time_pressure': [
                'quickly', 'hurry', 'fast', 'urgent', 'asap', 'right now',
                'immediately', 'short', 'brief'
            ],
            'deflection': [
                'later', 'another time', 'not now', 'maybe', 'i don\'t know',
                'whatever', 'sure sure', 'yeah yeah'
            ]
        }
        # FIX: compile one word-boundary alternation per category so that
        # keywords only match whole words/phrases ('um' must no longer match
        # inside "number") and every occurrence can be counted via findall().
        # NOTE: rebuilt only here — mutating busy_keywords afterwards will
        # not refresh these patterns.
        self._marker_patterns = {
            category: re.compile(
                r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')\b'
            )
            for category, keywords in self.busy_keywords.items()
        }

    def extract_explicit_busy(self, transcript: str) -> float:
        """T1: explicit busy indicator (binary 0.0 or 1.0).

        Uses the NLI model when available so negation is understood:
        - "I'm busy"              -> 1.0
        - "I'm not busy"          -> 0.0
        - "Can't talk right now"  -> 1.0
        - "I can talk"            -> 0.0
        """
        # Too short to carry intent.
        if not transcript or len(transcript.strip()) < 3:
            return 0.0

        # Method 1: zero-shot intent classification (best).
        if self.use_intent_model:
            try:
                result = self.intent_classifier(
                    transcript,
                    candidate_labels=["person is busy or occupied",
                                      "person is free and available",
                                      "unclear or neutral"],
                    hypothesis_template="This {}."
                )
                top_label = result['labels'][0]
                top_score = result['scores'][0]
                # Require high confidence (>0.6) to avoid false positives.
                if top_score > 0.6:
                    if "busy" in top_label:
                        return 1.0
                    elif "free" in top_label:
                        return 0.0
                return 0.0  # Neutral or low confidence
            except Exception as e:
                print(f"Intent classification failed: {e}")
                # Fall through to pattern matching.

        # Method 2: pattern matching with negation handling (fallback).
        return self._extract_busy_patterns(transcript)

    def _extract_busy_patterns(self, transcript: str) -> float:
        """Pattern-based busy detection with negation handling.

        Returns 1.0 (busy) or 0.0 (not busy / unknown).
        """
        transcript_lower = transcript.lower()

        # Negated statements first: "not busy" flips to free, "can't talk"
        # / "not free" flips to busy.
        negation_match = self.negation_pattern.search(transcript_lower)
        if negation_match:
            matched_text = negation_match.group(0)
            if any(word in matched_text for word in ['busy', 'rush']):
                return 0.0  # e.g. "not busy" = available
            elif any(word in matched_text for word in ['free', 'available', 'talk']):
                return 1.0  # e.g. "can't talk" or "not free" = busy

        # Free patterns take priority over busy patterns.
        for pattern in self.free_patterns:
            if pattern.search(transcript_lower):
                return 0.0
        for pattern in self.busy_patterns:
            if pattern.search(transcript_lower):
                return 1.0
        return 0.0

    def extract_explicit_free(self, transcript: str) -> float:
        """T0: explicit free indicator (binary 0.0 or 1.0).

        Same context-aware approach as :meth:`extract_explicit_busy`.
        """
        if not transcript or len(transcript.strip()) < 3:
            return 0.0

        if self.use_intent_model:
            try:
                result = self.intent_classifier(
                    transcript,
                    candidate_labels=["person is free and available",
                                      "person is busy or occupied",
                                      "unclear or neutral"],
                    hypothesis_template="This {}."
                )
                top_label = result['labels'][0]
                top_score = result['scores'][0]
                if top_score > 0.6 and "free" in top_label:
                    return 1.0
                return 0.0
            except Exception as e:
                print(f"Intent classification failed: {e}")
                # Fall through to pattern matching.

        transcript_lower = transcript.lower()
        for pattern in self.free_patterns:
            if pattern.search(transcript_lower):
                return 1.0
        return 0.0

    def extract_response_patterns(self, transcript_list: List[str]) -> Tuple[float, float]:
        """T2-T3: average response length and short-response ratio.

        Args:
            transcript_list: Individual responses (may be a single item).

        Returns:
            (avg_response_len, short_ratio) where short_ratio is the
            fraction of responses with <= 3 words. (0.0, 0.0) if empty.
        """
        if not transcript_list:
            return 0.0, 0.0
        word_counts = [len(response.split()) for response in transcript_list]
        avg_response_len = np.mean(word_counts)
        short_count = sum(1 for wc in word_counts if wc <= 3)
        short_ratio = short_count / len(word_counts)
        return float(avg_response_len), float(short_ratio)

    def extract_marker_counts(self, transcript: str) -> Tuple[float, float, float]:
        """T4-T6: cognitive-load, time-pressure, and deflection marker rates.

        FIX: the previous implementation (a) counted each keyword at most
        once regardless of repetitions, contradicting its own "count /
        total words" contract, and (b) used substring matching, so 'um'
        matched inside "number" and 'like' inside "likely". Occurrences
        are now counted with word-boundary regexes.

        Returns:
            (cognitive_load, time_pressure, deflection), each a marker
            occurrence count normalized by total word count; all 0.0 for
            an empty transcript.
        """
        transcript_lower = transcript.lower()
        total_words = len(transcript.split())
        if total_words == 0:
            return 0.0, 0.0, 0.0

        counts = {
            category: len(pattern.findall(transcript_lower))
            for category, pattern in self._marker_patterns.items()
        }
        return (
            float(counts['cognitive_load'] / total_words),
            float(counts['time_pressure'] / total_words),
            float(counts['deflection'] / total_words),
        )

    def extract_sentiment(self, transcript: str) -> float:
        """T7: sentiment polarity in [-1, +1].

        Negative sentiment often indicates stress/frustration. Returns 0.0
        for empty input, a neutral label, or on any model error.
        """
        if not transcript or len(transcript.strip()) == 0:
            return 0.0
        try:
            # Truncate to 512 characters — a rough guard for the model's
            # input limit (which is token-based, not character-based).
            result = self.sentiment_model(transcript[:512])[0]
            label = result['label'].lower()
            score = result['score']
            if 'positive' in label:
                return float(score)
            elif 'negative' in label:
                return float(-score)
            else:
                return 0.0
        except Exception as e:
            print(f"Sentiment extraction error: {e}")
            return 0.0

    def extract_coherence(self, question: str, responses: List[str]) -> float:
        """T8: coherence score in [0, 1].

        Mean cosine similarity between the question embedding and each
        response embedding; low coherence suggests the speaker is
        distracted / not paying attention. Returns a neutral 0.5 when
        data is missing or on any model error.
        """
        if not question or not responses:
            return 0.5  # Neutral when there is nothing to compare.
        try:
            question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
            response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)

            from sentence_transformers import util
            similarities = util.cos_sim(question_embedding, response_embeddings)[0]

            coherence = float(np.mean(similarities.cpu().numpy()))
            # Cosine similarity can be negative; clamp to the feature range.
            return max(0.0, min(1.0, coherence))
        except Exception as e:
            print(f"Coherence extraction error: {e}")
            return 0.5

    def extract_latency(self, events: List[Dict]) -> float:
        """T9: average response latency in seconds.

        WARNING: meaningless for single-side audio (there are no agent
        questions to measure against), so this always returns 0.0. Kept
        only for compatibility with existing models.

        Args:
            events: Timestamped events ('timestamp'/'speaker' keys); unused.
        """
        return 0.0

    def extract_all(
        self,
        transcript_list: List[str],
        full_transcript: str = "",
        question: str = "",
        events: Optional[List[Dict]] = None  # FIX: was annotated List[Dict] with a None default
    ) -> Dict[str, float]:
        """Extract all ten text features (t0-t9).

        Args:
            transcript_list: Individual responses (single item for one-turn).
            full_transcript: Complete conversation text; derived from
                transcript_list when omitted.
            question: The agent's question/prompt (needed for coherence).
            events: Timestamped events; unused for single-side audio.

        Returns:
            Dict with keys t0_explicit_free, t1_explicit_busy,
            t2_avg_resp_len, t3_short_ratio, t4_cognitive_load,
            t5_time_pressure, t6_deflection, t7_sentiment,
            t8_coherence, t9_latency.
        """
        features = {}

        if not full_transcript:
            full_transcript = " ".join(transcript_list)

        # T0-T1: explicit indicators (NLI-backed when available).
        features['t0_explicit_free'] = self.extract_explicit_free(full_transcript)
        features['t1_explicit_busy'] = self.extract_explicit_busy(full_transcript)

        # T2-T3: response-length patterns.
        avg_len, short_ratio = self.extract_response_patterns(transcript_list)
        features['t2_avg_resp_len'] = avg_len
        features['t3_short_ratio'] = short_ratio

        # T4-T6: marker rates.
        cog_load, time_press, deflect = self.extract_marker_counts(full_transcript)
        features['t4_cognitive_load'] = cog_load
        features['t5_time_pressure'] = time_press
        features['t6_deflection'] = deflect

        # T7: sentiment.
        features['t7_sentiment'] = self.extract_sentiment(full_transcript)

        # T8: coherence (neutral 0.5 when no question is provided).
        if question:
            features['t8_coherence'] = self.extract_coherence(question, transcript_list)
        else:
            features['t8_coherence'] = 0.5

        # T9: latency — always 0 for single-side audio.
        features['t9_latency'] = 0.0

        return features
if __name__ == "__main__":
    # Smoke test: exercise intent classification on a handful of
    # hand-written utterances, then run one full feature-extraction pass.
    print("Initializing Text Feature Extractor...")
    extractor = TextFeatureExtractor(use_intent_model=True)

    sample_utterances = (
        "I'm driving right now",
        "I'm not busy at all",
        "Can't talk, in a meeting",
        "I can talk now",
        "Not a good time",
        "I have time to chat",
    )

    print("\nTesting intent classification:")
    for utterance in sample_utterances:
        busy_score, free_score = (
            extractor.extract_explicit_busy(utterance),
            extractor.extract_explicit_free(utterance),
        )
        print(f" '{utterance}'")
        print(f" → Busy: {busy_score:.1f}, Free: {free_score:.1f}")

    # One end-to-end extraction over a tiny two-turn conversation.
    print("\nFull feature extraction:")
    features = extractor.extract_all(
        transcript_list=["I'm not busy", "I can talk now"],
        full_transcript="I'm not busy. I can talk now.",
        question="How are you doing today?",
    )

    print("\nExtracted features:")
    for feature_name, feature_value in features.items():
        print(f" {feature_name}: {feature_value:.3f}")