# File size: 9,344 Bytes
# 9a022ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
"""

Text Feature Extraction — Hugging Face Inference Endpoint Handler



Extracts all 10 text features (t0–t9) from the conversation transcript:

  t0_explicit_free, t1_explicit_busy, t2_avg_resp_len, t3_short_ratio,

  t4_cognitive_load, t5_time_pressure, t6_deflection, t7_sentiment,

  t8_coherence, t9_latency



Derived from: src/text_features.py

"""

import re
import numpy as np
from typing import List, Dict
from transformers import pipeline
from sentence_transformers import SentenceTransformer


# ──────────────────────────────────────────────────────────────────────── #
# TextFeatureExtractorEndpoint (mirrors src/text_features.py)
# ──────────────────────────────────────────────────────────────────────── #

class TextFeatureExtractorEndpoint:
    """Text feature extraction for the HF endpoint.

    Computes ten features (``t0`` .. ``t9``) from a conversation transcript.
    The only instance state is the two optional NLP models loaded in
    ``__init__``; either may be ``None`` if loading failed, in which case the
    corresponding feature falls back to a neutral value.

    Keyword and phrase detection uses whole-word matching (case-insensitive,
    tolerant of punctuation and repeated whitespace), so "yesterday" does not
    count as "yes" and multi-word fillers such as "you know" are detected.

    NOTE(review): this file says it mirrors src/text_features.py, which is not
    visible here. If that module still uses plain substring matching, apply
    the same matching there so both code paths produce identical features.
    """

    # Keywords from src/text_features.py
    BUSY_KEYWORDS = [
        "busy", "driving", "can't talk", "in a meeting", "call me later",
        "call back", "not now", "not a good time", "occupied", "running late",
        "in the middle of", "hold on", "give me a minute", "let me call you back",
        "gotta go", "heading out", "right now", "on the road", "at work",
        "hung up", "hang up", "rushing",
    ]
    FREE_KEYWORDS = [
        "free", "available", "go ahead", "i have time", "i'm listening",
        "sure", "yes", "yeah", "okay", "what's up", "tell me",
        "i can talk", "go on", "fire away",
    ]
    FILLER_WORDS = [
        "um", "uh", "hmm", "like", "you know", "sort of",
        "kind of", "i mean", "well", "so", "right", "actually",
    ]
    URGENCY_MARKERS = [
        "hurry", "quick", "fast", "rush", "soon", "asap",
        "right now", "immediately", "no time",
    ]
    DEFLECTION_PHRASES = [
        "later", "not now", "another time", "busy", "can't",
        "don't have time", "gotta go", "let me", "call me back",
    ]

    def __init__(self):
        """Load the sentiment and coherence models, tolerating load failures."""
        print("Loading NLP models for text features...")

        # Sentiment — RoBERTa-based
        try:
            self.sentiment_model = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                truncation=True,
                max_length=512,
            )
            print("✓ Sentiment model loaded")
        except Exception as e:
            print(f"⚠ Sentiment model fallback: {e}")
            self.sentiment_model = None

        # Coherence — Sentence Transformer
        try:
            self.coherence_model = SentenceTransformer("all-MiniLM-L6-v2")
            print("✓ Coherence model loaded")
        except Exception as e:
            print(f"⚠ Coherence model fallback: {e}")
            self.coherence_model = None

        print("✓ Text feature extractor ready")

    # --- Matching helpers ---
    @staticmethod
    def _normalize(transcript: str) -> str:
        """Lower-case the text and map curly apostrophes to ASCII ones."""
        return (transcript or "").lower().replace("\u2019", "'")

    @staticmethod
    def _phrase_regex(phrase: str) -> str:
        """Build a whole-word regex for *phrase*; inner spaces match any whitespace run."""
        body = r"\s+".join(re.escape(token) for token in phrase.split())
        # Lookarounds instead of \b so matching is anchored on "not a word
        # character" on both sides regardless of the phrase's edge characters.
        return r"(?<!\w)" + body + r"(?!\w)"

    def _contains_phrase(self, phrase: str, normalized_text: str) -> bool:
        """Return True when *phrase* occurs as whole words in the normalized text."""
        return re.search(self._phrase_regex(phrase), normalized_text) is not None

    def _count_phrase(self, phrase: str, normalized_text: str) -> int:
        """Count whole-word occurrences of *phrase* in the normalized text."""
        return len(re.findall(self._phrase_regex(phrase), normalized_text))

    # --- T0: Explicit Free ---
    def extract_explicit_free(self, transcript: str) -> float:
        """Return 1.0 if any FREE_KEYWORDS entry appears as whole words, else 0.0."""
        text = self._normalize(transcript)
        found = any(self._contains_phrase(kw, text) for kw in self.FREE_KEYWORDS)
        return 1.0 if found else 0.0

    # --- T1: Explicit Busy ---
    def extract_explicit_busy(self, transcript: str) -> float:
        """Return 1.0 if any BUSY_KEYWORDS entry appears as whole words, else 0.0."""
        text = self._normalize(transcript)
        found = any(self._contains_phrase(kw, text) for kw in self.BUSY_KEYWORDS)
        return 1.0 if found else 0.0

    # --- T2-T3: Response patterns ---
    def extract_response_patterns(self, transcript_list: List[str]) -> Dict[str, float]:
        """Return mean response length in words and the share of short responses.

        A response is "short" when it has three or fewer whitespace-separated
        words. An empty or missing list yields 0.0 for both features.
        """
        if not transcript_list:
            return {"t2_avg_resp_len": 0.0, "t3_short_ratio": 0.0}
        lengths = [len(response.split()) for response in transcript_list]
        avg_len = float(np.mean(lengths))
        short_ratio = sum(1 for n_words in lengths if n_words <= 3) / len(lengths)
        return {"t2_avg_resp_len": avg_len, "t3_short_ratio": float(short_ratio)}

    # --- T4-T6: Marker counts ---
    def extract_marker_counts(self, transcript: str) -> Dict[str, float]:
        """Return filler, urgency and deflection rates normalised by word count.

        * ``t4_cognitive_load``: total occurrences of FILLER_WORDS entries
          (single- and multi-word) divided by the number of words.
        * ``t5_time_pressure``: number of distinct URGENCY_MARKERS present,
          divided by the number of words.
        * ``t6_deflection``: number of distinct DEFLECTION_PHRASES present,
          divided by the number of words.

        The divisor is at least 1 so empty input yields 0.0 rather than a
        division by zero.
        """
        text = self._normalize(transcript)
        total = max(len(text.split()), 1)

        filler_count = sum(self._count_phrase(f, text) for f in self.FILLER_WORDS)
        urgency_count = sum(
            1 for phrase in self.URGENCY_MARKERS if self._contains_phrase(phrase, text)
        )
        deflection_count = sum(
            1 for phrase in self.DEFLECTION_PHRASES if self._contains_phrase(phrase, text)
        )

        return {
            "t4_cognitive_load": float(filler_count / total),
            "t5_time_pressure": float(urgency_count / total),
            "t6_deflection": float(deflection_count / total),
        }

    # --- T7: Sentiment ---
    def extract_sentiment(self, transcript: str) -> float:
        """Return a signed sentiment score: +score, -score, or 0.0 for neutral.

        Falls back to 0.0 when the model is unavailable, the transcript is
        blank, or inference fails (best-effort; the failure is printed).
        """
        if self.sentiment_model is None or not transcript or not transcript.strip():
            return 0.0
        try:
            # Only the first 512 characters are scored (a character slice, in
            # addition to the token truncation configured on the pipeline).
            result = self.sentiment_model(transcript[:512])[0]
            label = result["label"].lower()
            score = result["score"]
        except Exception as e:
            print(f"⚠ Sentiment inference failed: {e}")
            return 0.0
        if "positive" in label:
            return float(score)
        if "negative" in label:
            return float(-score)
        return 0.0

    # --- T8: Coherence ---
    def extract_coherence(self, question: str, responses: List[str]) -> float:
        """Return the mean cosine similarity between the question and each response.

        Falls back to the neutral value 0.5 when the model is unavailable, the
        question or responses are empty, or encoding fails (the failure is
        printed).
        """
        if self.coherence_model is None or not question or not responses:
            return 0.5
        try:
            q_vec = np.asarray(self.coherence_model.encode(question), dtype=float).ravel()
            r_mat = np.atleast_2d(
                np.asarray(self.coherence_model.encode(responses), dtype=float)
            )
            # Zero-length vectors are divided by 1 so their similarity is 0
            # instead of NaN.
            q_norm = float(np.linalg.norm(q_vec)) or 1.0
            r_norms = np.linalg.norm(r_mat, axis=1)
            r_norms[r_norms == 0] = 1.0
            similarities = (r_mat / r_norms[:, None]) @ (q_vec / q_norm)
            return float(np.mean(similarities))
        except Exception as e:
            print(f"⚠ Coherence computation failed: {e}")
            return 0.5

    # --- T9: Latency ---
    @staticmethod
    def _numeric_timestamp(event: Dict):
        """Return the event's timestamp (0 when the key is absent), or None if not numeric."""
        value = event.get("timestamp", 0)
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        return value

    def extract_latency(self, events: List[Dict]) -> float:
        """Return the mean positive time gap across consecutive speaker changes.

        Each event is read through its ``"speaker"`` and ``"timestamp"`` keys.
        Pairs with the same speaker, a non-dict entry, a non-numeric timestamp
        or a non-increasing timestamp are skipped. Returns 0.0 when fewer than
        two events are given or no pair qualifies.
        """
        if not events or len(events) < 2:
            return 0.0
        gaps = []
        for previous, current in zip(events, events[1:]):
            if not (isinstance(previous, dict) and isinstance(current, dict)):
                continue
            if previous.get("speaker") == current.get("speaker"):
                continue
            start = self._numeric_timestamp(previous)
            end = self._numeric_timestamp(current)
            if start is None or end is None:
                # Malformed timestamps are ignored rather than failing the request.
                continue
            if end > start:
                gaps.append(end - start)
        return float(np.mean(gaps)) if gaps else 0.0

    # --- Extract all ---
    def extract_all(
        self,
        transcript_list: List[str],
        full_transcript: str = "",
        question: str = "",
        events: List[Dict] = None,
    ) -> Dict[str, float]:
        """Compute all ten text features and return them in one dict.

        Args:
            transcript_list: Individual responses; used for the response
                patterns (t2, t3) and coherence (t8).
            full_transcript: Whole transcript as one string; when empty it is
                built by joining ``transcript_list`` with spaces.
            question: Text the responses are compared to for coherence.
            events: Event dicts for the latency feature; ``None`` is treated
                as no events.

        Returns:
            Dict with keys ``t0_explicit_free``, ``t1_explicit_busy``,
            ``t2_avg_resp_len``, ``t3_short_ratio``, ``t4_cognitive_load``,
            ``t5_time_pressure``, ``t6_deflection``, ``t7_sentiment``,
            ``t8_coherence`` and ``t9_latency``.
        """
        if not full_transcript and transcript_list:
            full_transcript = " ".join(transcript_list)

        features = {}
        features["t0_explicit_free"] = self.extract_explicit_free(full_transcript)
        features["t1_explicit_busy"] = self.extract_explicit_busy(full_transcript)
        features.update(self.extract_response_patterns(transcript_list))
        features.update(self.extract_marker_counts(full_transcript))
        features["t7_sentiment"] = self.extract_sentiment(full_transcript)
        features["t8_coherence"] = self.extract_coherence(question, transcript_list)
        features["t9_latency"] = self.extract_latency(events or [])
        return features


# ──────────────────────────────────────────────────────────────────────── #
# FastAPI handler for deployment
# ──────────────────────────────────────────────────────────────────────── #

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional

app = FastAPI(title="Text Feature Extraction API", version="1.0.0")

# CORS: any origin may call this API. Credentials are disabled because a
# wildcard origin must not be combined with credentialed requests (the FastAPI
# CORS docs require explicit origins when allow_credentials=True).
# NOTE(review): if a browser client really needs cookies or Authorization
# headers, list its origins explicitly and re-enable allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single module-level extractor shared by all requests; constructing it loads
# the NLP models at import time.
extractor = TextFeatureExtractorEndpoint()


class TextRequest(BaseModel):
    """Request body for POST /extract-text-features.

    Every field has a default, so an empty JSON object is a valid request.
    """

    # Whole transcript as one string; forwarded to the extractor as
    # `full_transcript` and, when `utterances` is empty, also used as the
    # single entry of the utterance list.
    transcript: str = ""
    # Individual utterances; when non-empty this list is used as-is.
    utterances: List[str] = []
    # Forwarded to the extractor as `question`.
    question: str = ""
    # Optional event dicts forwarded to the extractor as `events`; the
    # extractor reads their "speaker" and "timestamp" keys.
    events: Optional[List[Dict]] = None


@app.get("/health")
async def health():
    """Report that the service is up and which NLP models are loaded."""
    model_flags = {
        "sentiment_loaded": extractor.sentiment_model is not None,
        "coherence_loaded": extractor.coherence_model is not None,
    }
    return {"status": "healthy", **model_flags}


@app.post("/extract-text-features")
async def extract_text_features(data: TextRequest):
    """Extract all 10 text features (t0-t9) for the request.

    The utterance list is `data.utterances` when provided; otherwise the whole
    transcript is treated as a single utterance. A blank transcript with no
    utterances produces an empty list, so the extractor's empty-input defaults
    apply instead of scoring one empty "response" as short.

    Returns the feature dict produced by `extractor.extract_all`.
    """
    if data.utterances:
        transcript_list = data.utterances
    elif data.transcript.strip():
        transcript_list = [data.transcript]
    else:
        transcript_list = []

    return extractor.extract_all(
        transcript_list=transcript_list,
        full_transcript=data.transcript,
        question=data.question,
        events=data.events,
    )


if __name__ == "__main__":
    # Direct-execution entry point: serve the app with uvicorn on all
    # interfaces, port 7861.
    from uvicorn import run as serve

    serve(app, port=7861, host="0.0.0.0")