# File size: 9,344 Bytes
#
# 9a022ff

"""

Text Feature Extraction — Hugging Face Inference Endpoint Handler



Extracts all 9 text features from conversation transcript:

  t0_explicit_free, t1_explicit_busy, t2_avg_resp_len, t3_short_ratio,

  t4_cognitive_load, t5_time_pressure, t6_deflection, t7_sentiment,

  t8_coherence, t9_latency



Derived from: src/text_features.py

"""

import re
import numpy as np
from typing import List, Dict
from transformers import pipeline
from sentence_transformers import SentenceTransformer


# ──────────────────────────────────────────────────────────────────────── #
# TextFeatureExtractorEndpoint (mirrors src/text_features.py)
# ──────────────────────────────────────────────────────────────────────── #

class TextFeatureExtractorEndpoint:
    """Text feature extraction for the HF endpoint.

    Produces ten features (``t0_explicit_free`` .. ``t9_latency``) from a
    conversation transcript.  Keyword-based features match phrases on word
    boundaries, so ``"sure"`` does not fire on ``"pressure"`` and multi-word
    fillers such as ``"you know"`` are actually counted.

    NOTE(review): this class is meant to mirror src/text_features.py.  If a
    downstream model was trained on features produced with plain substring
    matching, keep the two implementations in sync when deploying this.
    """

    # Keywords from src/text_features.py
    BUSY_KEYWORDS = [
        "busy", "driving", "can't talk", "in a meeting", "call me later",
        "call back", "not now", "not a good time", "occupied", "running late",
        "in the middle of", "hold on", "give me a minute", "let me call you back",
        "gotta go", "heading out", "right now", "on the road", "at work",
        "hung up", "hang up", "rushing",
    ]
    FREE_KEYWORDS = [
        "free", "available", "go ahead", "i have time", "i'm listening",
        "sure", "yes", "yeah", "okay", "what's up", "tell me",
        "i can talk", "go on", "fire away",
    ]
    FILLER_WORDS = [
        "um", "uh", "hmm", "like", "you know", "sort of",
        "kind of", "i mean", "well", "so", "right", "actually",
    ]
    URGENCY_MARKERS = [
        "hurry", "quick", "fast", "rush", "soon", "asap",
        "right now", "immediately", "no time",
    ]
    DEFLECTION_PHRASES = [
        "later", "not now", "another time", "busy", "can't",
        "don't have time", "gotta go", "let me", "call me back",
    ]

    def __init__(self):
        """Load the sentiment and coherence models.

        Each model is loaded independently; if loading fails the
        corresponding attribute is set to ``None`` and the matching
        feature falls back to its neutral default.
        """
        print("Loading NLP models for text features...")

        # Sentiment — RoBERTa-based
        try:
            self.sentiment_model = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                truncation=True,
                max_length=512,
            )
            print("✓ Sentiment model loaded")
        except Exception as e:
            print(f"⚠ Sentiment model fallback: {e}")
            self.sentiment_model = None

        # Coherence — Sentence Transformer
        try:
            self.coherence_model = SentenceTransformer("all-MiniLM-L6-v2")
            print("✓ Coherence model loaded")
        except Exception as e:
            print(f"⚠ Coherence model fallback: {e}")
            self.coherence_model = None

        print("✓ Text feature extractor ready")

    # --- Phrase matching helpers ---
    @staticmethod
    def _phrase_regex(phrase: str) -> str:
        """Return a regex that matches *phrase* as whole words only.

        Lookarounds are used instead of ``\\b`` so phrases containing
        apostrophes (``"can't"``) are handled uniformly; any run of
        whitespace between the words of a phrase is accepted.
        """
        body = r"\s+".join(re.escape(token) for token in phrase.split())
        return rf"(?<!\w){body}(?!\w)"

    def _phrases_present(self, text: str, phrases: List[str]) -> int:
        """Count how many distinct *phrases* occur at least once in *text*."""
        return sum(
            1 for phrase in phrases
            if re.search(self._phrase_regex(phrase), text)
        )

    def _phrase_occurrences(self, text: str, phrases: List[str]) -> int:
        """Count every occurrence of every phrase in *phrases* within *text*."""
        return sum(
            len(re.findall(self._phrase_regex(phrase), text))
            for phrase in phrases
        )

    # --- T0: Explicit Free ---
    def extract_explicit_free(self, transcript: str) -> float:
        """Return 1.0 if any FREE_KEYWORDS phrase appears in *transcript*, else 0.0."""
        text = transcript.lower()
        return 1.0 if self._phrases_present(text, self.FREE_KEYWORDS) else 0.0

    # --- T1: Explicit Busy ---
    def extract_explicit_busy(self, transcript: str) -> float:
        """Return 1.0 if any BUSY_KEYWORDS phrase appears in *transcript*, else 0.0."""
        text = transcript.lower()
        return 1.0 if self._phrases_present(text, self.BUSY_KEYWORDS) else 0.0

    # --- T2-T3: Response patterns ---
    def extract_response_patterns(self, transcript_list: List[str]) -> Dict[str, float]:
        """Compute response-length statistics over the utterances.

        Returns:
            ``t2_avg_resp_len``: mean number of whitespace-separated words
            per utterance; ``t3_short_ratio``: fraction of utterances with
            three words or fewer.  Both are 0.0 for an empty list.
        """
        if not transcript_list:
            return {"t2_avg_resp_len": 0.0, "t3_short_ratio": 0.0}
        lengths = [len(response.split()) for response in transcript_list]
        avg_len = float(np.mean(lengths))
        short_ratio = sum(1 for n_words in lengths if n_words <= 3) / len(lengths)
        return {"t2_avg_resp_len": avg_len, "t3_short_ratio": float(short_ratio)}

    # --- T4-T6: Marker counts ---
    def extract_marker_counts(self, transcript: str) -> Dict[str, float]:
        """Compute filler, urgency and deflection ratios for *transcript*.

        All three values are normalised by the number of whitespace-separated
        words (at least 1, to avoid dividing by zero on empty input):

        * ``t4_cognitive_load``: occurrences of FILLER_WORDS phrases.
        * ``t5_time_pressure``: number of distinct URGENCY_MARKERS present.
        * ``t6_deflection``: number of distinct DEFLECTION_PHRASES present.
        """
        text = transcript.lower()
        total = max(len(text.split()), 1)

        # Fillers are counted per occurrence and matched against the text
        # rather than individual tokens, so multi-word fillers ("you know")
        # and tokens followed by punctuation ("um,") are recognised.
        filler_count = self._phrase_occurrences(text, self.FILLER_WORDS)
        urgency_count = self._phrases_present(text, self.URGENCY_MARKERS)
        deflection_count = self._phrases_present(text, self.DEFLECTION_PHRASES)

        return {
            "t4_cognitive_load": float(filler_count / total),
            "t5_time_pressure": float(urgency_count / total),
            "t6_deflection": float(deflection_count / total),
        }

    # --- T7: Sentiment ---
    def extract_sentiment(self, transcript: str) -> float:
        """Return a signed sentiment score for *transcript*.

        The model's confidence is returned as-is for a positive label,
        negated for a negative label, and 0.0 for any other label.  Returns
        0.0 when the model is unavailable, the transcript is blank, or
        inference fails.
        """
        if self.sentiment_model is None or not transcript.strip():
            return 0.0
        try:
            # Only the first 512 characters are sent to the model.
            result = self.sentiment_model(transcript[:512])[0]
            label = result["label"].lower()
            score = result["score"]
            if "positive" in label:
                return float(score)
            elif "negative" in label:
                return float(-score)
            else:
                return 0.0
        except Exception:
            # Deliberate best-effort: a model failure yields the neutral value.
            return 0.0

    # --- T8: Coherence ---
    def extract_coherence(self, question: str, responses: List[str]) -> float:
        """Return the mean cosine similarity between *question* and *responses*.

        Falls back to 0.5 when the model is unavailable, when either input
        is empty, or when embedding/similarity computation fails.
        """
        if self.coherence_model is None or not question or not responses:
            return 0.5
        try:
            q_emb = self.coherence_model.encode(question)
            r_embs = self.coherence_model.encode(responses)
            from sklearn.metrics.pairwise import cosine_similarity as cos_sim
            similarities = cos_sim([q_emb], r_embs)[0]
            return float(np.mean(similarities))
        except Exception:
            # Deliberate best-effort: any failure yields the neutral value.
            return 0.5

    # --- T9: Latency ---
    def extract_latency(self, events: List[Dict]) -> float:
        """Return the mean gap between consecutive events with different speakers.

        Each event is a dict read via the ``"speaker"`` and ``"timestamp"``
        keys; a missing timestamp counts as 0.  Only strictly positive gaps
        are averaged.  Pairs whose timestamps are not numeric are skipped
        instead of raising.  Returns 0.0 when there are fewer than two
        events or no usable gap.
        """
        if not events or len(events) < 2:
            return 0.0
        latencies = []
        for prev, curr in zip(events, events[1:]):
            if curr.get("speaker") == prev.get("speaker"):
                continue
            start = prev.get("timestamp", 0)
            end = curr.get("timestamp", 0)
            # Events come from request payloads; ignore malformed timestamps.
            if not (isinstance(start, (int, float)) and isinstance(end, (int, float))):
                continue
            if end > start:
                latencies.append(end - start)
        return float(np.mean(latencies)) if latencies else 0.0

    # --- Extract all ---
    def extract_all(
        self,
        transcript_list: List[str],
        full_transcript: str = "",
        question: str = "",
        events: List[Dict] = None,
    ) -> Dict[str, float]:
        """Compute all ten features (t0..t9) and return them in one dict.

        Args:
            transcript_list: Individual utterances; used for the response
                pattern and coherence features.
            full_transcript: Whole transcript text.  When empty it is built
                by joining *transcript_list* with spaces.
            question: Prompt the responses are compared against for coherence.
            events: Optional list of event dicts for the latency feature;
                ``None`` is treated as an empty list.
        """
        if not full_transcript and transcript_list:
            full_transcript = " ".join(transcript_list)

        features = {
            "t0_explicit_free": self.extract_explicit_free(full_transcript),
            "t1_explicit_busy": self.extract_explicit_busy(full_transcript),
        }
        features.update(self.extract_response_patterns(transcript_list))
        features.update(self.extract_marker_counts(full_transcript))
        features["t7_sentiment"] = self.extract_sentiment(full_transcript)
        features["t8_coherence"] = self.extract_coherence(question, transcript_list)
        features["t9_latency"] = self.extract_latency(events or [])
        return features


# ──────────────────────────────────────────────────────────────────────── #
# FastAPI handler for deployment
# ──────────────────────────────────────────────────────────────────────── #

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional

# ASGI application object served by uvicorn.
app = FastAPI(title="Text Feature Extraction API", version="1.0.0")
# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): FastAPI's CORS documentation says wildcard origins/methods/
# headers should not be combined with allow_credentials=True — confirm
# whether credentialed cross-origin requests are actually needed and, if
# so, list the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_credentials=True,
    allow_methods=["*"], allow_headers=["*"],
)

# Single shared extractor; constructing it loads the NLP models at import
# time, so every request handler below reuses the same model instances.
extractor = TextFeatureExtractorEndpoint()


# Request body for POST /extract-text-features. Every field is optional.
class TextRequest(BaseModel):
    # Full transcript text; also used as the sole utterance when
    # `utterances` is empty.
    transcript: str = ""
    # Individual utterances; take precedence over `transcript` for the
    # per-utterance features when non-empty.
    utterances: List[str] = []
    # Prompt the utterances are compared against for the coherence feature.
    question: str = ""
    # Event dicts for the latency feature; the extractor reads the
    # "speaker" and "timestamp" keys of each entry.
    events: Optional[List[Dict]] = None


@app.get("/health")
async def health():
    return {
        "status": "healthy",
        "sentiment_loaded": extractor.sentiment_model is not None,
        "coherence_loaded": extractor.coherence_model is not None,
    }


@app.post("/extract-text-features")
async def extract_text_features(data: TextRequest):
    """Extract all 9 text features from transcript."""
    transcript_list = data.utterances if data.utterances else [data.transcript]
    features = extractor.extract_all(
        transcript_list=transcript_list,
        full_transcript=data.transcript,
        question=data.question,
        events=data.events,
    )
    return features


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7861)