Spaces:

divAIne
/

busy-module-audio

Sleeping

File size: 16,262 Bytes

3469c65

"""

Text Feature Extractor - IMPROVED VERSION

Extracts 9 text features from conversation transcripts to detect busy/distracted states.



KEY IMPROVEMENTS:

1. Uses NLI model for intent classification (understands "not busy" properly)

2. Handles negation, context, and sarcasm

3. Removes useless t9_latency for single-side audio

"""

import numpy as np
from typing import List, Dict, Tuple
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import re


class TextFeatureExtractor:
    """Extract 9 text features for busy detection"""
    
    def __init__(self, use_intent_model: bool = True):
        """
        Load the NLP models and build the regex/keyword tables.

        Args:
            use_intent_model: If True, try to load BART-MNLI for zero-shot
                intent classification. If False, or if loading raises, the
                flag is left/set to False and the explicit busy/free features
                rely on pattern matching instead.
        """
        print("Loading NLP models...")
        self.use_intent_model = use_intent_model

        # Sentiment pipeline (device=-1 is forwarded to transformers; CPU in its API).
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            device=-1,
        )
        print("[OK] Sentiment model loaded")

        # Sentence encoder used by extract_coherence().
        self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("[OK] Coherence model loaded")

        # Built unconditionally: extract_marker_counts() reads busy_keywords
        # and the pattern fallback is needed whenever the intent model fails.
        self._setup_patterns()

        if not self.use_intent_model:
            return

        # Optional zero-shot intent classifier; any load failure downgrades
        # the extractor to pattern matching rather than aborting.
        try:
            self.intent_classifier = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=-1,
            )
            print("[OK] Intent classifier loaded (BART-MNLI)")
        except Exception as err:
            print(f"[WARN] Intent classifier failed to load: {err}")
            print("  Falling back to pattern matching")
            self.use_intent_model = False

    def _setup_patterns(self):
        """Setup pattern-based matching as fallback"""
        # Negation pattern
        self.negation_pattern = re.compile(
            r'\b(not|no|never|neither|n\'t|dont|don\'t|cannot|can\'t|wont|won\'t)\s+\w*\s*(busy|free|available|talk|rush)',
            re.IGNORECASE
        )
        
        # Busy patterns (positive assertions)
        self.busy_patterns = [
            r'\b(i\'m|i am|im)\s+(busy|driving|working|cooking|rushing)\b',
            r'\bin a (meeting|call|hurry)\b',
            r'\bcan\'t talk\b',
            r'\bcall (you|me) back\b',
            r'\bnot a good time\b',
            r'\bbad time\b'
        ]
        
        # Free patterns (positive assertions)
        self.free_patterns = [
            r'\b(i\'m|i am|im)\s+(free|available)\b',
            r'\bcan talk\b',
            r'\bhave time\b',
            r'\bnot busy\b',
            r'\bgood time\b',
            r'\bnow works\b'
        ]
        
        # Compile patterns
        self.busy_patterns = [re.compile(p, re.IGNORECASE) for p in self.busy_patterns]
        self.free_patterns = [re.compile(p, re.IGNORECASE) for p in self.free_patterns]

        # Legacy keywords for other features
        self.busy_keywords = {
            'cognitive_load': [
                'um', 'uh', 'like', 'you know', 'i mean', 'kind of', 
                'sort of', 'basically', 'actually'
            ],
            'time_pressure': [
                'quickly', 'hurry', 'fast', 'urgent', 'asap', 'right now',
                'immediately', 'short', 'brief'
            ],
            'deflection': [
                'later', 'another time', 'not now', 'maybe', 'i don\'t know',
                'whatever', 'sure sure', 'yeah yeah'
            ]
        }
    
    def extract_explicit_busy(self, transcript: str) -> float:
        """

        T1: Explicit Busy Indicators (binary: 0 or 1)

        

        IMPROVED: Uses NLI model to understand context and negation

        - "I'm busy" → 1.0

        - "I'm not busy" → 0.0

        - "Can't talk right now" → 1.0

        - "I can talk" → 0.0

        """
        if not transcript or len(transcript.strip()) < 3:
            return 0.0
        
        # Method 1: Use intent classification model (best)
        if self.use_intent_model:
            try:
                result = self.intent_classifier(
                    transcript,
                    candidate_labels=["person is busy or occupied", 
                                    "person is free and available", 
                                    "unclear or neutral"],
                    hypothesis_template="This {}."
                )
                
                top_label = result['labels'][0]
                top_score = result['scores'][0]
                
                # Require high confidence (>0.6) to avoid false positives
                if top_score > 0.6:
                    if "busy" in top_label:
                        return 1.0
                    elif "free" in top_label:
                        return 0.0
                
                return 0.0  # Neutral or low confidence
                
            except Exception as e:
                print(f"Intent classification failed: {e}")
                # Fall through to pattern matching
        
        # Method 2: Pattern-based with negation handling (fallback)
        return self._extract_busy_patterns(transcript)
    
    def _extract_busy_patterns(self, transcript: str) -> float:
        """Pattern-based busy detection with negation handling"""
        transcript_lower = transcript.lower()
        
        # Check for negated busy/free statements
        negation_match = self.negation_pattern.search(transcript_lower)
        if negation_match:
            matched_text = negation_match.group(0)
            # "not busy" or "can't be free" etc.
            if any(word in matched_text for word in ['busy', 'rush']):
                return 0.0  # "not busy" = available
            elif any(word in matched_text for word in ['free', 'available', 'talk']):
                return 1.0  # "can't talk" or "not free" = busy
        
        # Check free patterns first (higher priority)
        for pattern in self.free_patterns:
            if pattern.search(transcript_lower):
                return 0.0
        
        # Then check busy patterns
        for pattern in self.busy_patterns:
            if pattern.search(transcript_lower):
                return 1.0
        
        return 0.0

    def extract_explicit_free(self, transcript: str) -> float:
        """

        T0: Explicit Free Indicators (binary: 0 or 1)

        

        IMPROVED: Uses same context-aware approach as busy detection

        """
        if not transcript or len(transcript.strip()) < 3:
            return 0.0
        
        # Use intent model
        if self.use_intent_model:
            try:
                result = self.intent_classifier(
                    transcript,
                    candidate_labels=["person is free and available",
                                    "person is busy or occupied",
                                    "unclear or neutral"],
                    hypothesis_template="This {}."
                )
                
                top_label = result['labels'][0]
                top_score = result['scores'][0]
                
                if top_score > 0.6 and "free" in top_label:
                    return 1.0
                
                return 0.0
                
            except Exception as e:
                print(f"Intent classification failed: {e}")
        
        # Fallback to patterns
        transcript_lower = transcript.lower()
        
        for pattern in self.free_patterns:
            if pattern.search(transcript_lower):
                return 1.0
        
        return 0.0
    
    def extract_response_patterns(self, transcript_list: List[str]) -> Tuple[float, float]:
        """

        T2-T3: Average Response Length and Short Response Ratio

        

        Returns:

            - avg_response_len: Average words per response

            - short_ratio: Fraction of responses with ≤3 words

        """
        if not transcript_list:
            return 0.0, 0.0
        
        word_counts = [len(response.split()) for response in transcript_list]
        
        avg_response_len = np.mean(word_counts)
        short_count = sum(1 for wc in word_counts if wc <= 3)
        short_ratio = short_count / len(word_counts)
        
        return float(avg_response_len), float(short_ratio)
    
    def extract_marker_counts(self, transcript: str) -> Tuple[float, float, float]:
        """

        T4-T6: Cognitive Load, Time Pressure, Deflection markers

        

        Returns:

            - cognitive_load: Count of filler words / total words

            - time_pressure: Count of urgency markers / total words

            - deflection: Count of deflection phrases / total words

        """
        transcript_lower = transcript.lower()
        words = transcript.split()
        total_words = len(words)
        
        if total_words == 0:
            return 0.0, 0.0, 0.0
        
        # Count markers
        cognitive_load_count = sum(
            1 for keyword in self.busy_keywords['cognitive_load']
            if keyword in transcript_lower
        )
        
        time_pressure_count = sum(
            1 for keyword in self.busy_keywords['time_pressure']
            if keyword in transcript_lower
        )
        
        deflection_count = sum(
            1 for keyword in self.busy_keywords['deflection']
            if keyword in transcript_lower
        )
        
        # Normalize by total words
        cognitive_load = cognitive_load_count / total_words
        time_pressure = time_pressure_count / total_words
        deflection = deflection_count / total_words
        
        return float(cognitive_load), float(time_pressure), float(deflection)
    
    def extract_sentiment(self, transcript: str) -> float:
        """

        T7: Sentiment Polarity (-1 to +1)

        Negative sentiment often indicates stress/frustration

        """
        if not transcript or len(transcript.strip()) == 0:
            return 0.0

        try:
            result = self.sentiment_model(transcript[:512])[0]
            label = result['label'].lower()
            score = result['score']

            if 'positive' in label:
                return float(score)
            elif 'negative' in label:
                return float(-score)
            else:
                return 0.0

        except Exception as e:
            print(f"Sentiment extraction error: {e}")
            return 0.0
    
    def extract_coherence(self, question: str, responses: List[str]) -> float:
        """

        T8: Coherence Score (0 to 1)

        Measures how relevant responses are to the question

        Low coherence = distracted/not paying attention

        """
        if not question or not responses:
            return 0.5  # Neutral if no data (changed from 1.0 to be more conservative)
        
        try:
            # Encode question and responses
            question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
            response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)
            
            # Calculate cosine similarity
            from sentence_transformers import util
            similarities = util.cos_sim(question_embedding, response_embeddings)[0]
            
            # Average similarity as coherence score
            coherence = float(np.mean(similarities.cpu().numpy()))
            
            return max(0.0, min(1.0, coherence))  # Clamp to [0, 1]
        except Exception as e:
            print(f"Coherence extraction error: {e}")
            return 0.5
    
    def extract_latency(self, events: List[Dict]) -> float:
        """

        T9: Average Response Latency (seconds)

        

        ⚠️ WARNING: This feature is USELESS for single-side audio!

        Always returns 0.0 since we don't have agent questions.

        Kept for compatibility with existing models.

        

        events: List of dicts with 'timestamp' and 'speaker' keys

        """
        # Always return 0 for single-side audio
        return 0.0
    
    def extract_all(

        self, 

        transcript_list: List[str], 

        full_transcript: str = "",

        question: str = "",

        events: List[Dict] = None

    ) -> Dict[str, float]:
        """

        Extract all 9 text features

        

        Args:

            transcript_list: List of individual responses (can be single item for one-turn)

            full_transcript: Complete conversation text

            question: The question/prompt from agent (for coherence)

            events: List of timestamped events (unused for single-side audio)

        

        Returns:

            Dict with keys: t0_explicit_free, t1_explicit_busy,

                           t2_avg_resp_len, t3_short_ratio,

                           t4_cognitive_load, t5_time_pressure, t6_deflection,

                           t7_sentiment, t8_coherence, t9_latency

        """
        features = {}
        
        # Use full transcript if not provided separately
        if not full_transcript:
            full_transcript = " ".join(transcript_list)
        
        # T0-T1: Explicit indicators (IMPROVED with NLI)
        features['t0_explicit_free'] = self.extract_explicit_free(full_transcript)
        features['t1_explicit_busy'] = self.extract_explicit_busy(full_transcript)
        
        # T2-T3: Response patterns
        avg_len, short_ratio = self.extract_response_patterns(transcript_list)
        features['t2_avg_resp_len'] = avg_len
        features['t3_short_ratio'] = short_ratio
        
        # T4-T6: Markers
        cog_load, time_press, deflect = self.extract_marker_counts(full_transcript)
        features['t4_cognitive_load'] = cog_load
        features['t5_time_pressure'] = time_press
        features['t6_deflection'] = deflect
        
        # T7: Sentiment
        features['t7_sentiment'] = self.extract_sentiment(full_transcript)
        
        # T8: Coherence (default to 0.5 if no question provided)
        if question:
            features['t8_coherence'] = self.extract_coherence(question, transcript_list)
        else:
            features['t8_coherence'] = 0.5  # Neutral
        
        # T9: Latency (ALWAYS 0 for single-side audio)
        features['t9_latency'] = 0.0
        
        return features


if __name__ == "__main__":
    # Manual smoke test: loads the real models, prints the explicit busy/free
    # scores for a few utterances, then prints one full feature extraction.
    print("Initializing Text Feature Extractor...")
    extractor = TextFeatureExtractor(use_intent_model=True)

    print("\nTesting intent classification:")
    for utterance in (
        "I'm driving right now",
        "I'm not busy at all",
        "Can't talk, in a meeting",
        "I can talk now",
        "Not a good time",
        "I have time to chat",
    ):
        busy = extractor.extract_explicit_busy(utterance)
        free = extractor.extract_explicit_free(utterance)
        print(f"  '{utterance}'")
        print(f"    → Busy: {busy:.1f}, Free: {free:.1f}")

    print("\nFull feature extraction:")
    extracted = extractor.extract_all(
        transcript_list=["I'm not busy", "I can talk now"],
        full_transcript="I'm not busy. I can talk now.",
        question="How are you doing today?",
    )

    print("\nExtracted features:")
    for name in extracted:
        print(f"  {name}: {extracted[name]:.3f}")