File size: 5,345 Bytes
c960f74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da01d1b
c960f74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da01d1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c960f74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# src/feature_extraction.py
# Feature Extraction Module — Multi-signal ticket analysis
# SupportMind v1.0 — Asmitha

import re
import logging
from typing import Dict

logger = logging.getLogger(__name__)

# VADER is an optional dependency: when missing, FeatureExtractor falls
# back to a small built-in lexicon for sentiment scoring.
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
except ImportError:
    HAS_VADER = False
else:
    HAS_VADER = True

# Phrases signalling time pressure. List order defines the order of the
# `urgency_flags` feature returned by FeatureExtractor.
URGENCY_KEYWORDS = [
    'urgent',
    'asap',
    'immediately',
    'critical',
    'emergency',
    'blocking',
    'production down',
    'outage',
    'cannot access',
    'locked out',
    'deadline',
    'sla',
    'escalate',
    'priority',
    'time-sensitive',
    'showstopper',
]

# Trigger keyword -> product-area label. Several keywords may map to the
# same label (e.g. 'login' and 'password' both mean Authentication);
# insertion order defines the order of the `product_entities` feature.
PRODUCT_KEYWORDS = {
    'api': 'API/Integration',
    'dashboard': 'Dashboard',
    'export': 'Export Feature',
    'import': 'Import Feature',
    'billing': 'Billing System',
    'invoice': 'Invoice System',
    'sso': 'SSO/Authentication',
    'login': 'Authentication',
    'password': 'Authentication',
    'webhook': 'Webhooks',
    'integration': 'Integrations',
    'report': 'Reporting',
    'analytics': 'Analytics',
}


class FeatureExtractor:
    """
    Extracts multi-signal features from raw ticket text.

    Features (keys of the dict returned by extract()):
        - sentiment_score: VADER compound score, or lexicon fallback
        - urgency_flags / urgency_score: urgency keyword hits + tiered score
        - product_entities: product-area labels mentioned in the text
        - text_complexity_score: Flesch-Kincaid grade approximation
        - token_count / sentence_count: basic size metrics
        - has_question / has_error_code: boolean content flags
        - email_mentions / url_mentions / mentioned_dates: regex-detected mentions
    """

    def __init__(self):
        # VADER is optional; when absent, _sentiment() uses a small lexicon.
        self.sentiment_analyzer = SentimentIntensityAnalyzer() if HAS_VADER else None
        # Compile keyword patterns once per instance.  The \b word-boundary
        # anchors fix substring false positives that plain `kw in text`
        # matching produced (e.g. 'sla' inside 'translate', 'api' inside
        # 'rapid').  re.escape keeps multi-word phrases literal.
        self._urgency_patterns = [
            (kw, re.compile(r'\b' + re.escape(kw) + r'\b'))
            for kw in URGENCY_KEYWORDS
        ]
        self._product_patterns = [
            (label, re.compile(r'\b' + re.escape(kw) + r'\b'))
            for kw, label in PRODUCT_KEYWORDS.items()
        ]

    def extract(self, text: str) -> Dict:
        """Extract all features from ticket text.

        Args:
            text: Raw ticket body; may be empty.

        Returns:
            Dict mapping feature name to value (float, int, bool, or list).
        """
        text_lower = text.lower()
        words = text.split()
        # Split on runs of terminal punctuation; drop empty fragments
        # (e.g. produced by a trailing '...' or '?!').
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]

        return {
            'sentiment_score': self._sentiment(text),
            'urgency_flags': self._urgency(text_lower),
            'urgency_score': self._urgency_score(text_lower),
            'product_entities': self._product_entities(text_lower),
            'text_complexity_score': self._flesch_kincaid(words, sentences),
            'token_count': len(words),
            'sentence_count': len(sentences),
            'has_question': '?' in text,
            'has_error_code': bool(re.search(r'error\s*(?:code\s*)?[\d#:]+|err[-_]\d+|HTTP\s*\d{3}', text, re.I)),
            'email_mentions': len(re.findall(r'[\w.+-]+@[\w-]+\.[\w.]+', text)),
            'url_mentions': len(re.findall(r'https?://\S+', text)),
            'mentioned_dates': bool(re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\blast\s+(?:week|month|tuesday|monday|wednesday|thursday|friday)\b', text_lower)),
        }

    def _sentiment(self, text: str) -> float:
        """Return a sentiment score in [-1, 1].

        Uses VADER's compound score when available; otherwise a whole-word
        lexicon count: (positive_hits - negative_hits) / total_hits.
        """
        if self.sentiment_analyzer:
            return self.sentiment_analyzer.polarity_scores(text)['compound']
        neg = ['bad','terrible','broken','frustrated','angry','worst','hate','useless']
        pos = ['good','great','love','excellent','amazing','helpful','thanks']
        # Tokenize to whole words so e.g. 'bad' no longer matches inside
        # 'badge' (the old substring check did).
        tokens = set(re.findall(r'[a-z]+', text.lower()))
        n = sum(1 for w in neg if w in tokens)
        p = sum(1 for w in pos if w in tokens)
        return (p - n) / max(p + n, 1)

    def _urgency(self, text_lower: str) -> list:
        """Return the urgency keywords/phrases present as whole words,
        in URGENCY_KEYWORDS order."""
        return [kw for kw, pat in self._urgency_patterns if pat.search(text_lower)]

    def _urgency_score(self, text_lower: str) -> float:
        """Tiered urgency scoring based on keyword count.
        
        1 keyword  → 0.5  (moderate urgency — e.g. 'urgent')
        2 keywords → 0.75 (high urgency — e.g. 'urgent' + 'production down')
        3+ keywords→ 1.0  (critical — multiple severity indicators)
        """
        count = len(self._urgency(text_lower))
        if count == 0:
            return 0.0
        elif count == 1:
            return 0.5
        elif count == 2:
            return 0.75
        else:
            return 1.0

    def _product_entities(self, text_lower: str) -> list:
        """Return product-area labels whose trigger keyword appears as a
        whole word, deduplicated, in PRODUCT_KEYWORDS insertion order."""
        found = []
        for label, pat in self._product_patterns:
            # Multiple keywords can map to the same label (e.g. 'login' and
            # 'password' -> 'Authentication'); record each label only once.
            if label not in found and pat.search(text_lower):
                found.append(label)
        return found

    def _flesch_kincaid(self, words: list, sentences: list) -> float:
        """Flesch-Kincaid grade level: 0.39*ASL + 11.8*ASW - 15.59,
        clamped at 0 and rounded to 2 places. Returns 0.0 on empty input."""
        if not words or not sentences:
            return 0.0
        avg_sentence_len = len(words) / len(sentences)
        syllables = sum(self._count_syllables(w) for w in words)
        avg_syllables = syllables / max(len(words), 1)
        grade = 0.39 * avg_sentence_len + 11.8 * avg_syllables - 15.59
        return round(max(0, grade), 2)

    def _count_syllables(self, word: str) -> int:
        """Approximate syllable count: number of vowel groups, minus one
        for a silent trailing 'e', with a floor of 1."""
        word = word.lower().strip(".,!?;:'\"")
        # Very short words are treated as a single syllable.
        if len(word) <= 2:
            return 1
        vowels = 'aeiouy'
        count = 0
        prev_vowel = False
        for ch in word:
            is_vowel = ch in vowels
            if is_vowel and not prev_vowel:
                count += 1
            prev_vowel = is_vowel
        if word.endswith('e') and count > 1:
            count -= 1
        return max(count, 1)


if __name__ == '__main__':
    # Manual smoke test: run the extractor on one representative ticket
    # and dump every feature to stdout.
    extractor = FeatureExtractor()
    sample_ticket = "Hey, we have been having issues with the export function since last Tuesday's update. Also our invoice from last month looks incorrect. Can someone help? We are considering upgrading but want this sorted first."
    for name, value in extractor.extract(sample_ticket).items():
        print(f"  {name}: {value}")