EurekaPotato committed on
Commit
cd2f19e
·
verified ·
1 Parent(s): e453bf9

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. text_features.py +441 -431
text_features.py CHANGED
@@ -1,431 +1,441 @@
1
- """
2
- Text Feature Extractor - IMPROVED VERSION
3
- Extracts 9 text features from conversation transcripts to detect busy/distracted states.
4
-
5
- KEY IMPROVEMENTS:
6
- 1. Uses NLI model for intent classification (understands "not busy" properly)
7
- 2. Handles negation, context, and sarcasm
8
- 3. Removes useless t9_latency for single-side audio
9
- """
10
-
11
- import numpy as np
12
- from typing import List, Dict, Tuple
13
- from transformers import pipeline
14
- from sentence_transformers import SentenceTransformer
15
- import re
16
-
17
-
18
- class TextFeatureExtractor:
19
- """Extract 9 text features for busy detection"""
20
-
21
- def __init__(self, use_intent_model: bool = True):
22
- """
23
- Initialize NLP models
24
-
25
- Args:
26
- use_intent_model: If True, use BART-MNLI for intent classification
27
- If False, fall back to pattern matching
28
- """
29
- self.use_intent_model = use_intent_model
30
-
31
- print("Loading NLP models...")
32
-
33
- # Sentiment model
34
- model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
35
- self.sentiment_model = pipeline(
36
- "sentiment-analysis",
37
- model=model_name,
38
- device=-1
39
- )
40
- print("[OK] Sentiment model loaded")
41
-
42
- # Coherence model
43
- self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
44
- print("[OK] Coherence model loaded")
45
-
46
- # Always setup patterns — busy_keywords is needed by extract_marker_counts()
47
- self._setup_patterns()
48
-
49
- # Intent classification model (NEW - understands context!)
50
- if self.use_intent_model:
51
- try:
52
- self.intent_classifier = pipeline(
53
- "zero-shot-classification",
54
- model="facebook/bart-large-mnli",
55
- device=-1
56
- )
57
- print("[OK] Intent classifier loaded (BART-MNLI)")
58
- except Exception as e:
59
- print(f"[WARN] Intent classifier failed to load: {e}")
60
- print(" Falling back to pattern matching")
61
- self.use_intent_model = False
62
-
63
- def _setup_patterns(self):
64
- """Setup pattern-based matching as fallback"""
65
- # Negation pattern
66
- self.negation_pattern = re.compile(
67
- r'\b(not|no|never|neither|n\'t|dont|don\'t|cannot|can\'t|wont|won\'t)\s+\w*\s*(busy|free|available|talk|rush)',
68
- re.IGNORECASE
69
- )
70
-
71
- # Busy patterns (positive assertions)
72
- self.busy_patterns = [
73
- r'\b(i\'m|i am|im)\s+(busy|driving|working|cooking|rushing)\b',
74
- r'\bin a (meeting|call|hurry)\b',
75
- r'\bcan\'t talk\b',
76
- r'\bcall (you|me) back\b',
77
- r'\bnot a good time\b',
78
- r'\bbad time\b'
79
- ]
80
-
81
- # Free patterns (positive assertions)
82
- self.free_patterns = [
83
- r'\b(i\'m|i am|im)\s+(free|available)\b',
84
- r'\bcan talk\b',
85
- r'\bhave time\b',
86
- r'\bnot busy\b',
87
- r'\bgood time\b',
88
- r'\bnow works\b'
89
- ]
90
-
91
- # Compile patterns
92
- self.busy_patterns = [re.compile(p, re.IGNORECASE) for p in self.busy_patterns]
93
- self.free_patterns = [re.compile(p, re.IGNORECASE) for p in self.free_patterns]
94
-
95
- # Legacy keywords for other features
96
- self.busy_keywords = {
97
- 'cognitive_load': [
98
- 'um', 'uh', 'like', 'you know', 'i mean', 'kind of',
99
- 'sort of', 'basically', 'actually'
100
- ],
101
- 'time_pressure': [
102
- 'quickly', 'hurry', 'fast', 'urgent', 'asap', 'right now',
103
- 'immediately', 'short', 'brief'
104
- ],
105
- 'deflection': [
106
- 'later', 'another time', 'not now', 'maybe', 'i don\'t know',
107
- 'whatever', 'sure sure', 'yeah yeah'
108
- ]
109
- }
110
-
111
- def extract_explicit_busy(self, transcript: str) -> float:
112
- """
113
- T1: Explicit Busy Indicators (binary: 0 or 1)
114
-
115
- IMPROVED: Uses NLI model to understand context and negation
116
- - "I'm busy" 1.0
117
- - "I'm not busy" → 0.0
118
- - "Can't talk right now" → 1.0
119
- - "I can talk" → 0.0
120
- """
121
- if not transcript or len(transcript.strip()) < 3:
122
- return 0.0
123
-
124
- # Method 1: Use intent classification model (best)
125
- if self.use_intent_model:
126
- try:
127
- result = self.intent_classifier(
128
- transcript,
129
- candidate_labels=["person is busy or occupied",
130
- "person is free and available",
131
- "unclear or neutral"],
132
- hypothesis_template="This {}."
133
- )
134
-
135
- top_label = result['labels'][0]
136
- top_score = result['scores'][0]
137
-
138
- # Require high confidence (>0.6) to avoid false positives
139
- if top_score > 0.6:
140
- if "busy" in top_label:
141
- return 1.0
142
- elif "free" in top_label:
143
- return 0.0
144
-
145
- return 0.0 # Neutral or low confidence
146
-
147
- except Exception as e:
148
- print(f"Intent classification failed: {e}")
149
- # Fall through to pattern matching
150
-
151
- # Method 2: Pattern-based with negation handling (fallback)
152
- return self._extract_busy_patterns(transcript)
153
-
154
- def _extract_busy_patterns(self, transcript: str) -> float:
155
- """Pattern-based busy detection with negation handling"""
156
- transcript_lower = transcript.lower()
157
-
158
- # Check for negated busy/free statements
159
- negation_match = self.negation_pattern.search(transcript_lower)
160
- if negation_match:
161
- matched_text = negation_match.group(0)
162
- # "not busy" or "can't be free" etc.
163
- if any(word in matched_text for word in ['busy', 'rush']):
164
- return 0.0 # "not busy" = available
165
- elif any(word in matched_text for word in ['free', 'available', 'talk']):
166
- return 1.0 # "can't talk" or "not free" = busy
167
-
168
- # Check free patterns first (higher priority)
169
- for pattern in self.free_patterns:
170
- if pattern.search(transcript_lower):
171
- return 0.0
172
-
173
- # Then check busy patterns
174
- for pattern in self.busy_patterns:
175
- if pattern.search(transcript_lower):
176
- return 1.0
177
-
178
- return 0.0
179
-
180
- def extract_explicit_free(self, transcript: str) -> float:
181
- """
182
- T0: Explicit Free Indicators (binary: 0 or 1)
183
-
184
- IMPROVED: Uses same context-aware approach as busy detection
185
- """
186
- if not transcript or len(transcript.strip()) < 3:
187
- return 0.0
188
-
189
- # Use intent model
190
- if self.use_intent_model:
191
- try:
192
- result = self.intent_classifier(
193
- transcript,
194
- candidate_labels=["person is free and available",
195
- "person is busy or occupied",
196
- "unclear or neutral"],
197
- hypothesis_template="This {}."
198
- )
199
-
200
- top_label = result['labels'][0]
201
- top_score = result['scores'][0]
202
-
203
- if top_score > 0.6 and "free" in top_label:
204
- return 1.0
205
-
206
- return 0.0
207
-
208
- except Exception as e:
209
- print(f"Intent classification failed: {e}")
210
-
211
- # Fallback to patterns
212
- transcript_lower = transcript.lower()
213
-
214
- for pattern in self.free_patterns:
215
- if pattern.search(transcript_lower):
216
- return 1.0
217
-
218
- return 0.0
219
-
220
- def extract_response_patterns(self, transcript_list: List[str]) -> Tuple[float, float]:
221
- """
222
- T2-T3: Average Response Length and Short Response Ratio
223
-
224
- Returns:
225
- - avg_response_len: Average words per response
226
- - short_ratio: Fraction of responses with ≤3 words
227
- """
228
- if not transcript_list:
229
- return 0.0, 0.0
230
-
231
- word_counts = [len(response.split()) for response in transcript_list]
232
-
233
- avg_response_len = np.mean(word_counts)
234
- short_count = sum(1 for wc in word_counts if wc <= 3)
235
- short_ratio = short_count / len(word_counts)
236
-
237
- return float(avg_response_len), float(short_ratio)
238
-
239
- def extract_marker_counts(self, transcript: str) -> Tuple[float, float, float]:
240
- """
241
- T4-T6: Cognitive Load, Time Pressure, Deflection markers
242
-
243
- Returns:
244
- - cognitive_load: Count of filler words / total words
245
- - time_pressure: Count of urgency markers / total words
246
- - deflection: Count of deflection phrases / total words
247
- """
248
- transcript_lower = transcript.lower()
249
- words = transcript.split()
250
- total_words = len(words)
251
-
252
- if total_words == 0:
253
- return 0.0, 0.0, 0.0
254
-
255
- # Count markers
256
- cognitive_load_count = sum(
257
- 1 for keyword in self.busy_keywords['cognitive_load']
258
- if keyword in transcript_lower
259
- )
260
-
261
- time_pressure_count = sum(
262
- 1 for keyword in self.busy_keywords['time_pressure']
263
- if keyword in transcript_lower
264
- )
265
-
266
- deflection_count = sum(
267
- 1 for keyword in self.busy_keywords['deflection']
268
- if keyword in transcript_lower
269
- )
270
-
271
- # Normalize by total words
272
- cognitive_load = cognitive_load_count / total_words
273
- time_pressure = time_pressure_count / total_words
274
- deflection = deflection_count / total_words
275
-
276
- return float(cognitive_load), float(time_pressure), float(deflection)
277
-
278
- def extract_sentiment(self, transcript: str) -> float:
279
- """
280
- T7: Sentiment Polarity (-1 to +1)
281
- Negative sentiment often indicates stress/frustration
282
- """
283
- if not transcript or len(transcript.strip()) == 0:
284
- return 0.0
285
-
286
- try:
287
- result = self.sentiment_model(transcript[:512])[0]
288
- label = result['label'].lower()
289
- score = result['score']
290
-
291
- if 'positive' in label:
292
- return float(score)
293
- elif 'negative' in label:
294
- return float(-score)
295
- else:
296
- return 0.0
297
-
298
- except Exception as e:
299
- print(f"Sentiment extraction error: {e}")
300
- return 0.0
301
-
302
- def extract_coherence(self, question: str, responses: List[str]) -> float:
303
- """
304
- T8: Coherence Score (0 to 1)
305
- Measures how relevant responses are to the question
306
- Low coherence = distracted/not paying attention
307
- """
308
- if not question or not responses:
309
- return 0.5 # Neutral if no data (changed from 1.0 to be more conservative)
310
-
311
- try:
312
- # Encode question and responses
313
- question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
314
- response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)
315
-
316
- # Calculate cosine similarity
317
- from sentence_transformers import util
318
- similarities = util.cos_sim(question_embedding, response_embeddings)[0]
319
-
320
- # Average similarity as coherence score
321
- coherence = float(np.mean(similarities.cpu().numpy()))
322
-
323
- return max(0.0, min(1.0, coherence)) # Clamp to [0, 1]
324
- except Exception as e:
325
- print(f"Coherence extraction error: {e}")
326
- return 0.5
327
-
328
- def extract_latency(self, events: List[Dict]) -> float:
329
- """
330
- T9: Average Response Latency (seconds)
331
-
332
- ⚠️ WARNING: This feature is USELESS for single-side audio!
333
- Always returns 0.0 since we don't have agent questions.
334
- Kept for compatibility with existing models.
335
-
336
- events: List of dicts with 'timestamp' and 'speaker' keys
337
- """
338
- # Always return 0 for single-side audio
339
- return 0.0
340
-
341
- def extract_all(
342
- self,
343
- transcript_list: List[str],
344
- full_transcript: str = "",
345
- question: str = "",
346
- events: List[Dict] = None
347
- ) -> Dict[str, float]:
348
- """
349
- Extract all 9 text features
350
-
351
- Args:
352
- transcript_list: List of individual responses (can be single item for one-turn)
353
- full_transcript: Complete conversation text
354
- question: The question/prompt from agent (for coherence)
355
- events: List of timestamped events (unused for single-side audio)
356
-
357
- Returns:
358
- Dict with keys: t0_explicit_free, t1_explicit_busy,
359
- t2_avg_resp_len, t3_short_ratio,
360
- t4_cognitive_load, t5_time_pressure, t6_deflection,
361
- t7_sentiment, t8_coherence, t9_latency
362
- """
363
- features = {}
364
-
365
- # Use full transcript if not provided separately
366
- if not full_transcript:
367
- full_transcript = " ".join(transcript_list)
368
-
369
- # T0-T1: Explicit indicators (IMPROVED with NLI)
370
- features['t0_explicit_free'] = self.extract_explicit_free(full_transcript)
371
- features['t1_explicit_busy'] = self.extract_explicit_busy(full_transcript)
372
-
373
- # T2-T3: Response patterns
374
- avg_len, short_ratio = self.extract_response_patterns(transcript_list)
375
- features['t2_avg_resp_len'] = avg_len
376
- features['t3_short_ratio'] = short_ratio
377
-
378
- # T4-T6: Markers
379
- cog_load, time_press, deflect = self.extract_marker_counts(full_transcript)
380
- features['t4_cognitive_load'] = cog_load
381
- features['t5_time_pressure'] = time_press
382
- features['t6_deflection'] = deflect
383
-
384
- # T7: Sentiment
385
- features['t7_sentiment'] = self.extract_sentiment(full_transcript)
386
-
387
- # T8: Coherence (default to 0.5 if no question provided)
388
- if question:
389
- features['t8_coherence'] = self.extract_coherence(question, transcript_list)
390
- else:
391
- features['t8_coherence'] = 0.5 # Neutral
392
-
393
- # T9: Latency (ALWAYS 0 for single-side audio)
394
- features['t9_latency'] = 0.0
395
-
396
- return features
397
-
398
-
399
- if __name__ == "__main__":
400
- # Test the extractor
401
- print("Initializing Text Feature Extractor...")
402
- extractor = TextFeatureExtractor(use_intent_model=True)
403
-
404
- # Test cases for intent classification
405
- test_cases = [
406
- "I'm driving right now",
407
- "I'm not busy at all",
408
- "Can't talk, in a meeting",
409
- "I can talk now",
410
- "Not a good time",
411
- "I have time to chat"
412
- ]
413
-
414
- print("\nTesting intent classification:")
415
- for test in test_cases:
416
- busy_score = extractor.extract_explicit_busy(test)
417
- free_score = extractor.extract_explicit_free(test)
418
- print(f" '{test}'")
419
- print(f" Busy: {busy_score:.1f}, Free: {free_score:.1f}")
420
-
421
- # Full feature extraction
422
- print("\nFull feature extraction:")
423
- features = extractor.extract_all(
424
- transcript_list=["I'm not busy", "I can talk now"],
425
- full_transcript="I'm not busy. I can talk now.",
426
- question="How are you doing today?"
427
- )
428
-
429
- print("\nExtracted features:")
430
- for key, value in features.items():
431
- print(f" {key}: {value:.3f}")
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text Feature Extractor - IMPROVED VERSION
3
+ Extracts 9 text features from conversation transcripts to detect busy/distracted states.
4
+
5
+ KEY IMPROVEMENTS:
6
+ 1. Uses NLI model for intent classification (understands "not busy" properly)
7
+ 2. Handles negation, context, and sarcasm
8
+ 3. Removes useless t9_latency for single-side audio
9
+ """
10
+
11
+ import numpy as np
12
+ from typing import List, Dict, Tuple
13
+ from transformers import pipeline
14
+ from sentence_transformers import SentenceTransformer
15
+ import re
16
+
17
+
18
class TextFeatureExtractor:
    """Extract text features (keys t0-t9) for busy/distracted-state detection.

    Wraps three models:
      * a sentiment pipeline (cardiffnlp twitter-roberta) for T7,
      * a SentenceTransformer for question/response coherence (T8),
      * optionally a BART-MNLI zero-shot classifier for busy/free intent (T0/T1).
    Regex patterns and keyword lists are always compiled as a fallback and
    for the T4-T6 marker counts.
    """

    def __init__(self, use_intent_model: bool = True):
        """
        Initialize NLP models.

        Args:
            use_intent_model: If True, use BART-MNLI for intent classification.
                              If False (or if loading fails), fall back to
                              pattern matching.
        """
        self.use_intent_model = use_intent_model

        print("Loading NLP models...")

        # Sentiment model (T7). device=-1 forces CPU inference.
        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model=model_name,
            device=-1
        )
        print("[OK] Sentiment model loaded")

        # Coherence model (T8): sentence embeddings for cosine similarity.
        self.coherence_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("[OK] Coherence model loaded")

        # Always setup patterns — busy_keywords is needed by extract_marker_counts()
        self._setup_patterns()

        # Intent classification model: zero-shot NLI understands context and
        # negation ("I'm not busy") far better than keyword matching.
        if self.use_intent_model:
            try:
                self.intent_classifier = pipeline(
                    "zero-shot-classification",
                    model="facebook/bart-large-mnli",
                    device=-1
                )
                print("[OK] Intent classifier loaded (BART-MNLI)")
            except Exception as e:
                # Degrade gracefully: the regex fallback still works without NLI.
                print(f"[WARN] Intent classifier failed to load: {e}")
                print(" Falling back to pattern matching")
                self.use_intent_model = False
62
+
63
+ def _setup_patterns(self):
64
+ """Setup pattern-based matching as fallback"""
65
+ # Negation pattern
66
+ self.negation_pattern = re.compile(
67
+ r'\b(not|no|never|neither|n\'t|dont|don\'t|cannot|can\'t|wont|won\'t)\s+\w*\s*(busy|free|available|talk|rush)',
68
+ re.IGNORECASE
69
+ )
70
+
71
+ # Busy patterns (positive assertions)
72
+ self.busy_patterns = [
73
+ r'\b(i\'m|i am|im)\s+(busy|driving|working|cooking|rushing)\b',
74
+ r'\bin a (meeting|call|hurry)\b',
75
+ r'\bcan\'t talk\b',
76
+ r'\bcall (you|me) back\b',
77
+ r'\bnot a good time\b',
78
+ r'\bbad time\b'
79
+ ]
80
+
81
+ # Free patterns (positive assertions) - includes invitation-to-talk phrases
82
+ self.free_patterns = [
83
+ r'\b(i\'m|i am|im)\s+(free|available)\b',
84
+ r'\bcan talk\b',
85
+ r'\bhave time\b',
86
+ r'\bnot busy\b',
87
+ r'\bgood time\b',
88
+ r'\bnow works\b',
89
+ # Invitation-to-talk patterns (strong availability signals)
90
+ r'\btell me (what you want|what you need|more)\b',
91
+ r'\bwhat (do you want|would you like) to talk about\b',
92
+ r'\bgo ahead\b',
93
+ r'\b(yeah|yes),?\s*sure\b',
94
+ r'\bsure,?\s*(what|go ahead|tell me)\b',
95
+ r'\bi\'?m (listening|here)\b',
96
+ r'\bfire away\b',
97
+ r'\bwhat\'?s (on your mind|up)\b',
98
+ ]
99
+
100
+ # Compile patterns
101
+ self.busy_patterns = [re.compile(p, re.IGNORECASE) for p in self.busy_patterns]
102
+ self.free_patterns = [re.compile(p, re.IGNORECASE) for p in self.free_patterns]
103
+
104
+ # Legacy keywords for other features
105
+ self.busy_keywords = {
106
+ 'cognitive_load': [
107
+ 'um', 'uh', 'like', 'you know', 'i mean', 'kind of',
108
+ 'sort of', 'basically', 'actually'
109
+ ],
110
+ 'time_pressure': [
111
+ 'quickly', 'hurry', 'fast', 'urgent', 'asap', 'right now',
112
+ 'immediately', 'short', 'brief'
113
+ ],
114
+ 'deflection': [
115
+ 'later', 'another time', 'not now', 'maybe', 'i don\'t know',
116
+ 'whatever', 'sure sure', 'yeah yeah'
117
+ ]
118
+ }
119
+
120
+ def extract_explicit_busy(self, transcript: str) -> float:
121
+ """
122
+ T1: Explicit Busy Indicators (binary: 0 or 1)
123
+
124
+ IMPROVED: Uses NLI model to understand context and negation
125
+ - "I'm busy" → 1.0
126
+ - "I'm not busy" → 0.0
127
+ - "Can't talk right now" → 1.0
128
+ - "I can talk" → 0.0
129
+ """
130
+ if not transcript or len(transcript.strip()) < 3:
131
+ return 0.0
132
+
133
+ # Method 1: Use intent classification model (best)
134
+ if self.use_intent_model:
135
+ try:
136
+ result = self.intent_classifier(
137
+ transcript,
138
+ candidate_labels=["person is busy or occupied",
139
+ "person is free and available",
140
+ "unclear or neutral"],
141
+ hypothesis_template="This {}."
142
+ )
143
+
144
+ top_label = result['labels'][0]
145
+ top_score = result['scores'][0]
146
+
147
+ # Require high confidence (>0.6) to avoid false positives
148
+ if top_score > 0.6:
149
+ if "busy" in top_label:
150
+ return 1.0
151
+ elif "free" in top_label:
152
+ return 0.0
153
+
154
+ return 0.0 # Neutral or low confidence
155
+
156
+ except Exception as e:
157
+ print(f"Intent classification failed: {e}")
158
+ # Fall through to pattern matching
159
+
160
+ # Method 2: Pattern-based with negation handling (fallback)
161
+ return self._extract_busy_patterns(transcript)
162
+
163
+ def _extract_busy_patterns(self, transcript: str) -> float:
164
+ """Pattern-based busy detection with negation handling"""
165
+ transcript_lower = transcript.lower()
166
+
167
+ # Check for negated busy/free statements
168
+ negation_match = self.negation_pattern.search(transcript_lower)
169
+ if negation_match:
170
+ matched_text = negation_match.group(0)
171
+ # "not busy" or "can't be free" etc.
172
+ if any(word in matched_text for word in ['busy', 'rush']):
173
+ return 0.0 # "not busy" = available
174
+ elif any(word in matched_text for word in ['free', 'available', 'talk']):
175
+ return 1.0 # "can't talk" or "not free" = busy
176
+
177
+ # Check free patterns first (higher priority)
178
+ for pattern in self.free_patterns:
179
+ if pattern.search(transcript_lower):
180
+ return 0.0
181
+
182
+ # Then check busy patterns
183
+ for pattern in self.busy_patterns:
184
+ if pattern.search(transcript_lower):
185
+ return 1.0
186
+
187
+ return 0.0
188
+
189
+ def extract_explicit_free(self, transcript: str) -> float:
190
+ """
191
+ T0: Explicit Free Indicators (binary: 0 or 1)
192
+
193
+ IMPROVED: Uses same context-aware approach as busy detection
194
+ """
195
+ if not transcript or len(transcript.strip()) < 3:
196
+ return 0.0
197
+
198
+ # Use intent model - include "inviting conversation" as availability signal
199
+ if self.use_intent_model:
200
+ try:
201
+ result = self.intent_classifier(
202
+ transcript,
203
+ candidate_labels=["person is free and inviting conversation",
204
+ "person is busy or occupied",
205
+ "unclear or neutral"],
206
+ hypothesis_template="This {}."
207
+ )
208
+
209
+ top_label = result['labels'][0]
210
+ top_score = result['scores'][0]
211
+
212
+ # Match "free" or "inviting" - both indicate availability
213
+ if top_score > 0.55 and ("free" in top_label or "inviting" in top_label):
214
+ return 1.0
215
+
216
+ return 0.0
217
+
218
+ except Exception as e:
219
+ print(f"Intent classification failed: {e}")
220
+
221
+ # Fallback to patterns
222
+ transcript_lower = transcript.lower()
223
+
224
+ for pattern in self.free_patterns:
225
+ if pattern.search(transcript_lower):
226
+ return 1.0
227
+
228
+ return 0.0
229
+
230
+ def extract_response_patterns(self, transcript_list: List[str]) -> Tuple[float, float]:
231
+ """
232
+ T2-T3: Average Response Length and Short Response Ratio
233
+
234
+ Returns:
235
+ - avg_response_len: Average words per response
236
+ - short_ratio: Fraction of responses with ≤3 words
237
+ """
238
+ if not transcript_list:
239
+ return 0.0, 0.0
240
+
241
+ word_counts = [len(response.split()) for response in transcript_list]
242
+
243
+ avg_response_len = np.mean(word_counts)
244
+ short_count = sum(1 for wc in word_counts if wc <= 3)
245
+ short_ratio = short_count / len(word_counts)
246
+
247
+ return float(avg_response_len), float(short_ratio)
248
+
249
+ def extract_marker_counts(self, transcript: str) -> Tuple[float, float, float]:
250
+ """
251
+ T4-T6: Cognitive Load, Time Pressure, Deflection markers
252
+
253
+ Returns:
254
+ - cognitive_load: Count of filler words / total words
255
+ - time_pressure: Count of urgency markers / total words
256
+ - deflection: Count of deflection phrases / total words
257
+ """
258
+ transcript_lower = transcript.lower()
259
+ words = transcript.split()
260
+ total_words = len(words)
261
+
262
+ if total_words == 0:
263
+ return 0.0, 0.0, 0.0
264
+
265
+ # Count markers
266
+ cognitive_load_count = sum(
267
+ 1 for keyword in self.busy_keywords['cognitive_load']
268
+ if keyword in transcript_lower
269
+ )
270
+
271
+ time_pressure_count = sum(
272
+ 1 for keyword in self.busy_keywords['time_pressure']
273
+ if keyword in transcript_lower
274
+ )
275
+
276
+ deflection_count = sum(
277
+ 1 for keyword in self.busy_keywords['deflection']
278
+ if keyword in transcript_lower
279
+ )
280
+
281
+ # Normalize by total words
282
+ cognitive_load = cognitive_load_count / total_words
283
+ time_pressure = time_pressure_count / total_words
284
+ deflection = deflection_count / total_words
285
+
286
+ return float(cognitive_load), float(time_pressure), float(deflection)
287
+
288
+ def extract_sentiment(self, transcript: str) -> float:
289
+ """
290
+ T7: Sentiment Polarity (-1 to +1)
291
+ Negative sentiment often indicates stress/frustration
292
+ """
293
+ if not transcript or len(transcript.strip()) == 0:
294
+ return 0.0
295
+
296
+ try:
297
+ result = self.sentiment_model(transcript[:512])[0]
298
+ label = result['label'].lower()
299
+ score = result['score']
300
+
301
+ if 'positive' in label:
302
+ return float(score)
303
+ elif 'negative' in label:
304
+ return float(-score)
305
+ else:
306
+ return 0.0
307
+
308
+ except Exception as e:
309
+ print(f"Sentiment extraction error: {e}")
310
+ return 0.0
311
+
312
+ def extract_coherence(self, question: str, responses: List[str]) -> float:
313
+ """
314
+ T8: Coherence Score (0 to 1)
315
+ Measures how relevant responses are to the question
316
+ Low coherence = distracted/not paying attention
317
+ """
318
+ if not question or not responses:
319
+ return 0.5 # Neutral if no data (changed from 1.0 to be more conservative)
320
+
321
+ try:
322
+ # Encode question and responses
323
+ question_embedding = self.coherence_model.encode(question, convert_to_tensor=True)
324
+ response_embeddings = self.coherence_model.encode(responses, convert_to_tensor=True)
325
+
326
+ # Calculate cosine similarity
327
+ from sentence_transformers import util
328
+ similarities = util.cos_sim(question_embedding, response_embeddings)[0]
329
+
330
+ # Average similarity as coherence score
331
+ coherence = float(np.mean(similarities.cpu().numpy()))
332
+
333
+ return max(0.0, min(1.0, coherence)) # Clamp to [0, 1]
334
+ except Exception as e:
335
+ print(f"Coherence extraction error: {e}")
336
+ return 0.5
337
+
338
+ def extract_latency(self, events: List[Dict]) -> float:
339
+ """
340
+ T9: Average Response Latency (seconds)
341
+
342
+ ⚠️ WARNING: This feature is USELESS for single-side audio!
343
+ Always returns 0.0 since we don't have agent questions.
344
+ Kept for compatibility with existing models.
345
+
346
+ events: List of dicts with 'timestamp' and 'speaker' keys
347
+ """
348
+ # Always return 0 for single-side audio
349
+ return 0.0
350
+
351
+ def extract_all(
352
+ self,
353
+ transcript_list: List[str],
354
+ full_transcript: str = "",
355
+ question: str = "",
356
+ events: List[Dict] = None
357
+ ) -> Dict[str, float]:
358
+ """
359
+ Extract all 9 text features
360
+
361
+ Args:
362
+ transcript_list: List of individual responses (can be single item for one-turn)
363
+ full_transcript: Complete conversation text
364
+ question: The question/prompt from agent (for coherence)
365
+ events: List of timestamped events (unused for single-side audio)
366
+
367
+ Returns:
368
+ Dict with keys: t0_explicit_free, t1_explicit_busy,
369
+ t2_avg_resp_len, t3_short_ratio,
370
+ t4_cognitive_load, t5_time_pressure, t6_deflection,
371
+ t7_sentiment, t8_coherence, t9_latency
372
+ """
373
+ features = {}
374
+
375
+ # Use full transcript if not provided separately
376
+ if not full_transcript:
377
+ full_transcript = " ".join(transcript_list)
378
+
379
+ # T0-T1: Explicit indicators (IMPROVED with NLI)
380
+ features['t0_explicit_free'] = self.extract_explicit_free(full_transcript)
381
+ features['t1_explicit_busy'] = self.extract_explicit_busy(full_transcript)
382
+
383
+ # T2-T3: Response patterns
384
+ avg_len, short_ratio = self.extract_response_patterns(transcript_list)
385
+ features['t2_avg_resp_len'] = avg_len
386
+ features['t3_short_ratio'] = short_ratio
387
+
388
+ # T4-T6: Markers
389
+ cog_load, time_press, deflect = self.extract_marker_counts(full_transcript)
390
+ features['t4_cognitive_load'] = cog_load
391
+ features['t5_time_pressure'] = time_press
392
+ features['t6_deflection'] = deflect
393
+
394
+ # T7: Sentiment
395
+ features['t7_sentiment'] = self.extract_sentiment(full_transcript)
396
+
397
+ # T8: Coherence (default to 0.5 if no question provided)
398
+ if question:
399
+ features['t8_coherence'] = self.extract_coherence(question, transcript_list)
400
+ else:
401
+ features['t8_coherence'] = 0.5 # Neutral
402
+
403
+ # T9: Latency (ALWAYS 0 for single-side audio)
404
+ features['t9_latency'] = 0.0
405
+
406
+ return features
407
+
408
+
409
if __name__ == "__main__":
    # Smoke test: exercises intent classification and full feature extraction.
    # NOTE: instantiating the extractor downloads/loads transformer models,
    # so this needs network access or a local model cache.
    print("Initializing Text Feature Extractor...")
    extractor = TextFeatureExtractor(use_intent_model=True)

    # Test cases for intent classification: a mix of busy and free phrasings,
    # including negations the NLI model should handle.
    test_cases = [
        "I'm driving right now",
        "I'm not busy at all",
        "Can't talk, in a meeting",
        "I can talk now",
        "Not a good time",
        "I have time to chat"
    ]

    print("\nTesting intent classification:")
    for test in test_cases:
        busy_score = extractor.extract_explicit_busy(test)
        free_score = extractor.extract_explicit_free(test)
        print(f" '{test}'")
        print(f" Busy: {busy_score:.1f}, Free: {free_score:.1f}")

    # Full feature extraction over a short two-turn transcript.
    print("\nFull feature extraction:")
    features = extractor.extract_all(
        transcript_list=["I'm not busy", "I can talk now"],
        full_transcript="I'm not busy. I can talk now.",
        question="How are you doing today?"
    )

    print("\nExtracted features:")
    for key, value in features.items():
        print(f" {key}: {value:.3f}")