# busy-module-audio / text_features.py
# EurekaPotato's picture
# Upload folder using huggingface_hub
# 3469c65 verified
"""
Text Feature Extractor - IMPROVED VERSION
Extracts 9 text features from conversation transcripts to detect busy/distracted states.
KEY IMPROVEMENTS:
1. Uses NLI model for intent classification (understands "not busy" properly)
2. Handles negation, context, and sarcasm
3. Removes useless t9_latency for single-side audio
"""
import numpy as np
from typing import List, Dict, Tuple
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import re
class TextFeatureExtractor:
    """Extract text features (t0-t9) for busy/distracted detection.

    Explicit busy/free intent (t0, t1) is decided by a zero-shot NLI
    classifier when available and by negation-aware regular expressions
    otherwise. The remaining features are response-length statistics,
    marker-phrase ratios, sentiment polarity and question/response
    coherence. ``t9_latency`` is a constant 0.0 placeholder.
    """

    # Internal intent states shared by the NLI path and the regex fallback.
    _BUSY = "busy"
    _FREE = "free"
    _NEUTRAL = "neutral"

    # The top NLI label must score above this to count as an explicit signal.
    INTENT_CONFIDENCE_THRESHOLD = 0.6

    # Candidate labels for zero-shot classification; combined with the
    # hypothesis template "This {}." they read e.g. "This person is busy or
    # occupied."
    INTENT_LABELS = (
        "person is busy or occupied",
        "person is free and available",
        "unclear or neutral",
    )

    def __init__(self, use_intent_model: bool = True):
        """
        Initialize NLP models

        Args:
            use_intent_model: If True, use BART-MNLI for intent classification
                              If False, fall back to pattern matching
        """
        self.use_intent_model = use_intent_model
        # Defined up front so the attribute exists even when the model is
        # disabled or fails to load.
        self.intent_classifier = None
        print("Loading NLP models...")

        # Sentiment model (device=-1 is passed through to the pipeline as in
        # the original code)
        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model=model_name,
            device=-1
        )
        print("[OK] Sentiment model loaded")

        # Coherence model
        self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("[OK] Coherence model loaded")

        # Always setup patterns - busy_keywords is needed by
        # extract_marker_counts() and the regexes are the NLI fallback.
        self._setup_patterns()

        # Intent classification model
        if self.use_intent_model:
            try:
                self.intent_classifier = pipeline(
                    "zero-shot-classification",
                    model="facebook/bart-large-mnli",
                    device=-1
                )
                print("[OK] Intent classifier loaded (BART-MNLI)")
            except Exception as e:
                # Best effort: a missing model must not prevent the
                # pattern-based features from working.
                print(f"[WARN] Intent classifier failed to load: {e}")
                print(" Falling back to pattern matching")
                self.use_intent_model = False

    def _setup_patterns(self):
        """Compile the fallback regexes and define the marker keyword lists."""
        # Negator, optionally one intervening word, then a target word.
        # "n't" deliberately has no leading \b: inside "isn't"/"aren't" there
        # is no word boundary before the "n", so a \b-anchored alternative
        # could never match. Group 2 captures the target word.
        self.negation_pattern = re.compile(
            r"(\b(?:not|no|never|neither|dont|cannot|wont)|n't)"
            r"\s+\w*\s*(busy|free|available|talk|rush)",
            re.IGNORECASE
        )

        # Busy patterns (positive assertions)
        busy_sources = [
            r"\b(i'm|i am|im)\s+(busy|driving|working|cooking|rushing)\b",
            r"\bin a (meeting|call|hurry)\b",
            r"\bcan't talk\b",
            r"\bcall (you|me) back\b",
            r"\bnot a good time\b",
            # "not a bad time" is not a busy statement
            r"\b(?<!not a )bad time\b",
            # negated "have time" ("don't have time", "do not have time")
            r"(?:n't|\bnot|\bdont)\s+have time\b",
        ]

        # Free patterns (positive assertions). The lookbehinds stop negated
        # phrases such as "not a good time" or "don't have time" from being
        # read as availability.
        free_sources = [
            r"\b(i'm|i am|im)\s+(free|available)\b",
            r"\bcan talk\b",
            r"\b(?<!n't )(?<!not )(?<!dont )have time\b",
            r"\bnot busy\b",
            r"\b(?<!not a )good time\b",
            r"\bnow works\b",
        ]

        # Compile patterns
        self.busy_patterns = [re.compile(p, re.IGNORECASE) for p in busy_sources]
        self.free_patterns = [re.compile(p, re.IGNORECASE) for p in free_sources]

        # Keyword lists for the marker-ratio features (T4-T6)
        self.busy_keywords = {
            'cognitive_load': [
                'um', 'uh', 'like', 'you know', 'i mean', 'kind of',
                'sort of', 'basically', 'actually'
            ],
            'time_pressure': [
                'quickly', 'hurry', 'fast', 'urgent', 'asap', 'right now',
                'immediately', 'short', 'brief'
            ],
            'deflection': [
                'later', 'another time', 'not now', 'maybe', 'i don\'t know',
                'whatever', 'sure sure', 'yeah yeah'
            ]
        }

    def _classify_by_patterns(self, transcript: str) -> str:
        """Classify a transcript as busy/free/neutral using regexes only.

        Order of precedence: negated statements, then free patterns, then
        busy patterns.
        """
        text = transcript.lower()

        negation_match = self.negation_pattern.search(text)
        if negation_match:
            target = negation_match.group(2).lower()
            if target in ('busy', 'rush'):
                return self._FREE   # "not busy" = available
            return self._BUSY       # "can't talk" / "not free" = busy

        if any(pattern.search(text) for pattern in self.free_patterns):
            return self._FREE
        if any(pattern.search(text) for pattern in self.busy_patterns):
            return self._BUSY
        return self._NEUTRAL

    def _classify_intent(self, transcript: str) -> str:
        """Classify a transcript with the zero-shot NLI model.

        Returns busy/free only when the top label clears
        INTENT_CONFIDENCE_THRESHOLD; otherwise neutral. Exceptions raised by
        the classifier propagate to the caller.
        """
        result = self.intent_classifier(
            transcript,
            candidate_labels=list(self.INTENT_LABELS),
            hypothesis_template="This {}."
        )
        top_label = result['labels'][0]
        top_score = result['scores'][0]

        # Require high confidence to avoid false positives
        if top_score > self.INTENT_CONFIDENCE_THRESHOLD:
            if "busy" in top_label:
                return self._BUSY
            if "free" in top_label:
                return self._FREE
        return self._NEUTRAL

    def _explicit_state(self, transcript: str) -> str:
        """Return the explicit intent state shared by T0 and T1.

        Uses the NLI model when enabled; if it is disabled or raises, falls
        back to pattern matching. Empty or very short (<3 chars) transcripts
        are neutral.
        """
        if not transcript or len(transcript.strip()) < 3:
            return self._NEUTRAL

        if self.use_intent_model:
            try:
                return self._classify_intent(transcript)
            except Exception as e:
                print(f"Intent classification failed: {e}")
                # Fall through to pattern matching

        return self._classify_by_patterns(transcript)

    def extract_explicit_busy(self, transcript: str) -> float:
        """
        T1: Explicit Busy Indicators (binary: 0 or 1)

        - "I'm busy" -> 1.0
        - "I'm not busy" -> 0.0
        - "Can't talk right now" -> 1.0
        - "I can talk" -> 0.0

        Args:
            transcript: Text to classify.

        Returns:
            1.0 if the transcript explicitly signals being busy, else 0.0.
        """
        return 1.0 if self._explicit_state(transcript) == self._BUSY else 0.0

    def _extract_busy_patterns(self, transcript: str) -> float:
        """Pattern-based busy detection with negation handling (1.0 or 0.0)."""
        return 1.0 if self._classify_by_patterns(transcript) == self._BUSY else 0.0

    def extract_explicit_free(self, transcript: str) -> float:
        """
        T0: Explicit Free Indicators (binary: 0 or 1)

        Uses the same classification as extract_explicit_busy, so the two
        features can never both be 1.0 for the same transcript.

        Args:
            transcript: Text to classify.

        Returns:
            1.0 if the transcript explicitly signals availability, else 0.0.
        """
        return 1.0 if self._explicit_state(transcript) == self._FREE else 0.0

    def extract_response_patterns(self, transcript_list: List[str]) -> Tuple[float, float]:
        """
        T2-T3: Average Response Length and Short Response Ratio

        Args:
            transcript_list: Individual responses; words are split on whitespace.

        Returns:
            - avg_response_len: Average words per response
            - short_ratio: Fraction of responses with <=3 words
            Both are 0.0 when the list is empty.
        """
        if not transcript_list:
            return 0.0, 0.0

        word_counts = [len(response.split()) for response in transcript_list]
        avg_response_len = np.mean(word_counts)
        short_count = sum(1 for wc in word_counts if wc <= 3)
        short_ratio = short_count / len(word_counts)
        return float(avg_response_len), float(short_ratio)

    @staticmethod
    def _count_markers(text_lower: str, keywords: List[str]) -> int:
        """Count how many of `keywords` occur in `text_lower` as whole words/phrases.

        Each keyword contributes at most 1. Word boundaries prevent
        "umbrella" from counting as "um" or "likely" as "like".
        """
        return sum(
            1 for keyword in keywords
            if re.search(r'\b' + re.escape(keyword) + r'\b', text_lower)
        )

    def extract_marker_counts(self, transcript: str) -> Tuple[float, float, float]:
        """
        T4-T6: Cognitive Load, Time Pressure, Deflection markers

        Args:
            transcript: Text to scan; matching is case-insensitive.

        Returns:
            - cognitive_load: distinct filler phrases present / total words
            - time_pressure: distinct urgency markers present / total words
            - deflection: distinct deflection phrases present / total words
            All 0.0 for an empty or missing transcript.
        """
        if not transcript:
            return 0.0, 0.0, 0.0

        total_words = len(transcript.split())
        if total_words == 0:
            return 0.0, 0.0, 0.0

        transcript_lower = transcript.lower()
        cognitive_load = self._count_markers(
            transcript_lower, self.busy_keywords['cognitive_load']) / total_words
        time_pressure = self._count_markers(
            transcript_lower, self.busy_keywords['time_pressure']) / total_words
        deflection = self._count_markers(
            transcript_lower, self.busy_keywords['deflection']) / total_words
        return float(cognitive_load), float(time_pressure), float(deflection)

    def extract_sentiment(self, transcript: str) -> float:
        """
        T7: Sentiment Polarity (-1 to +1)

        Returns +score for a positive label, -score for a negative label and
        0.0 for any other label, for empty input, or if the model raises.
        """
        if not transcript or not transcript.strip():
            return 0.0

        try:
            # Only the first 512 characters are sent to the model.
            result = self.sentiment_model(transcript[:512])[0]
            label = result['label'].lower()
            score = result['score']
            if 'positive' in label:
                return float(score)
            if 'negative' in label:
                return float(-score)
            return 0.0
        except Exception as e:
            print(f"Sentiment extraction error: {e}")
            return 0.0

    def extract_coherence(self, question: str, responses: List[str]) -> float:
        """
        T8: Coherence Score (0 to 1)

        Mean cosine similarity between the question embedding and each
        response embedding, clamped to [0, 1].

        Returns:
            0.5 (neutral) when the question or responses are missing, or when
            embedding fails.
        """
        if not question or not responses:
            return 0.5  # Neutral if no data

        try:
            # Imported here so an import failure degrades to the neutral score.
            from sentence_transformers import util

            question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
            response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)

            similarities = util.cos_sim(question_embedding, response_embeddings)[0]
            coherence = float(np.mean(similarities.cpu().numpy()))
            return max(0.0, min(1.0, coherence))  # Clamp to [0, 1]
        except Exception as e:
            print(f"Coherence extraction error: {e}")
            return 0.5

    def extract_latency(self, events: List[Dict]) -> float:
        """
        T9: Average Response Latency (seconds)

        Always returns 0.0: with single-side audio there are no agent
        questions to measure latency against. Kept for compatibility with
        existing models.

        Args:
            events: Ignored.
        """
        return 0.0

    def extract_all(
        self,
        transcript_list: List[str],
        full_transcript: str = "",
        question: str = "",
        events: List[Dict] = None
    ) -> Dict[str, float]:
        """
        Extract all text features

        Args:
            transcript_list: List of individual responses (can be single item
                for one-turn); None is treated as an empty list
            full_transcript: Complete conversation text; defaults to the
                responses joined with spaces
            question: The question/prompt from agent (for coherence)
            events: List of timestamped events (unused for single-side audio)

        Returns:
            Dict with keys: t0_explicit_free, t1_explicit_busy,
                           t2_avg_resp_len, t3_short_ratio,
                           t4_cognitive_load, t5_time_pressure, t6_deflection,
                           t7_sentiment, t8_coherence, t9_latency
        """
        responses = list(transcript_list) if transcript_list else []
        features = {}

        # Use joined responses if no full transcript was provided
        if not full_transcript:
            full_transcript = " ".join(responses)

        # T0-T1: classify once; both flags derive from the same state so the
        # (expensive) NLI model is not run twice on identical input.
        state = self._explicit_state(full_transcript)
        features['t0_explicit_free'] = 1.0 if state == self._FREE else 0.0
        features['t1_explicit_busy'] = 1.0 if state == self._BUSY else 0.0

        # T2-T3: Response patterns
        avg_len, short_ratio = self.extract_response_patterns(responses)
        features['t2_avg_resp_len'] = avg_len
        features['t3_short_ratio'] = short_ratio

        # T4-T6: Markers
        cog_load, time_press, deflect = self.extract_marker_counts(full_transcript)
        features['t4_cognitive_load'] = cog_load
        features['t5_time_pressure'] = time_press
        features['t6_deflection'] = deflect

        # T7: Sentiment
        features['t7_sentiment'] = self.extract_sentiment(full_transcript)

        # T8: Coherence (0.5 = neutral when there is no question)
        if question:
            features['t8_coherence'] = self.extract_coherence(question, responses)
        else:
            features['t8_coherence'] = 0.5

        # T9: Latency (always 0.0 for single-side audio)
        features['t9_latency'] = self.extract_latency(events)

        return features
if __name__ == "__main__":
    # Manual smoke test: load the models, classify a few sample utterances,
    # then print one full feature dictionary.
    print("Initializing Text Feature Extractor...")
    extractor = TextFeatureExtractor(use_intent_model=True)

    # Sample utterances covering busy, free and negated phrasings
    sample_utterances = (
        "I'm driving right now",
        "I'm not busy at all",
        "Can't talk, in a meeting",
        "I can talk now",
        "Not a good time",
        "I have time to chat",
    )

    print("\nTesting intent classification:")
    for utterance in sample_utterances:
        busy = extractor.extract_explicit_busy(utterance)
        free = extractor.extract_explicit_free(utterance)
        print(f" '{utterance}'")
        print(f" → Busy: {busy:.1f}, Free: {free:.1f}")

    # Full feature extraction on a two-turn example
    print("\nFull feature extraction:")
    demo_responses = ["I'm not busy", "I can talk now"]
    features = extractor.extract_all(
        transcript_list=demo_responses,
        full_transcript="I'm not busy. I can talk now.",
        question="How are you doing today?",
    )

    print("\nExtracted features:")
    for name in features:
        print(f" {name}: {features[name]:.3f}")