# File size: 5,675 Bytes
# 37a70cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# src/classifier.py
import json
import math
import re
from pathlib import Path
from typing import Dict, List, Union

from transformers import pipeline

# Lazy-loaded pipelines (module-level to reuse)
# Both slots start out as None; get_zero_shot_classifier() and
# get_sentiment_classifier() fill them in on their first call and return the
# stored object on every later call.
_zero_shot_clf = None
_sentiment_clf = None

def get_zero_shot_classifier():
    """Return the shared zero-shot classification pipeline, building it on first use.

    The pipeline object is kept in the module-level ``_zero_shot_clf`` so that
    later calls hand back the same object instead of constructing a new one.
    """
    global _zero_shot_clf
    if _zero_shot_clf is not None:
        return _zero_shot_clf
    # BART or RoBERTa NLI models are common choices
    _zero_shot_clf = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
    )
    return _zero_shot_clf

def get_sentiment_classifier():
    """Return the shared sentiment-analysis pipeline, building it on first use.

    The pipeline object is kept in the module-level ``_sentiment_clf`` so that
    later calls hand back the same object instead of constructing a new one.
    """
    global _sentiment_clf
    if _sentiment_clf is not None:
        return _sentiment_clf
    # SST-2 fine-tuned model
    _sentiment_clf = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )
    return _sentiment_clf

# Schema - the fixed set of topic labels requested by the assignment.
# This list is the default `labels` argument of classify_topic_zero_shot().
TOPIC_LABELS = [
    "How-to", "Product", "Connector",
    "Lineage", "API/SDK", "SSO",
    "Glossary", "Best practices", "Sensitive data",
]

# Optional longer description per topic label (synonyms/prompts) that can be
# used to nudge the zero-shot model.
# NOTE(review): no function visible in this file reads this mapping - confirm
# whether it is consumed elsewhere.
LABEL_DESCRIPTIONS = {
    "How-to":
        "user asking how to perform a task or request a tutorial",
    "Product":
        "product feature, UI or general product question",
    "Connector":
        "questions about connectors, crawlers, integrations and failures",
    "Lineage":
        "questions about lineage, upstream/downstream or lineage exports",
    "API/SDK":
        "developer questions about APIs, SDKs, endpoints, code examples",
    "SSO":
        "authentication, SAML, SSO, Okta, login issues",
    "Glossary":
        "business glossary, terms, bulk import of glossary terms",
    "Best practices":
        "request for recommended approach, best practices or governance",
    "Sensitive data":
        "questions about PII, masking, DLP, secrets",
}

def classify_topic_zero_shot(text: str, labels: List[str] = TOPIC_LABELS, hypothesis_template: str = "This text is about {}.") -> Dict:
    """
    Run the zero-shot classifier on `text` and return its result unchanged.

    No filtering happens here: the value returned is exactly what the pipeline
    call produced, e.g. {'sequence': ..., 'labels': [...], 'scores': [...]}.
    Selecting the top labels or applying a score threshold is up to the caller.
    """
    zero_shot = get_zero_shot_classifier()
    # The HF zero-shot pipeline accepts a 'hypothesis_template', which can
    # improve results.
    return zero_shot(
        sequences=text,
        candidate_labels=labels,
        hypothesis_template=hypothesis_template,
    )

def classify_sentiment_hf(text: str) -> str:
    """
    Map the HF sentiment model's output onto the ticket sentiment schema.

    The model answers POSITIVE/NEGATIVE with a confidence score; that pair is
    translated as follows:

      NEGATIVE, score >  0.9 -> "Angry"
      NEGATIVE, score <= 0.9 -> "Frustrated"
      POSITIVE, score >  0.9 -> "Positive"
      POSITIVE, score <= 0.9 -> "Curious"
      anything else          -> "Neutral"

    Empty, whitespace-only or missing text returns "Neutral" without calling
    the model, because a prediction on no content would be arbitrary.
    """
    if not text or not text.strip():
        return "Neutral"

    clf = get_sentiment_classifier()
    # The character cut keeps inference fast but does not bound the number of
    # tokens, so also ask the tokenizer to truncate to the model's limit.
    out = clf(text[:1000], truncation=True)
    # out like [{'label': 'NEGATIVE', 'score': 0.999}]
    if not out:
        return "Neutral"

    top = out[0]
    label = str(top["label"]).upper()
    # Confidence above 0.9 selects the stronger variant of each polarity.
    is_strong = top["score"] > 0.9

    if label == "NEGATIVE":
        return "Angry" if is_strong else "Frustrated"
    if label == "POSITIVE":
        return "Positive" if is_strong else "Curious"
    return "Neutral"

# Keep same rule-based priority function (deterministic SLA logic)
PRIORITY_KEYWORDS_P0 = ["urgent", "asap", "blocked", "blocker", "critical", "production", "failed", "failure", "infuriating", "can't", "cant", "down", "urgent:"]
PRIORITY_KEYWORDS_P1 = ["need", "important", "deadline", "next week", "approaching", "required", "soon", "high"]


def _compile_keyword_pattern(keywords):
    """Build one regex that matches any of `keywords` as a whole word/phrase."""
    # Longest first so a longer keyword wins over a shorter one it starts with.
    ordered = sorted(keywords, key=len, reverse=True)
    alternatives = "|".join(re.escape(k) for k in ordered)
    # Lookarounds instead of \b so keywords ending in punctuation still work.
    return re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")


# Compiled once at import time from the keyword lists above; editing the lists
# at runtime does not affect these patterns.
_PRIORITY_P0_RE = _compile_keyword_pattern(PRIORITY_KEYWORDS_P0)
_PRIORITY_P1_RE = _compile_keyword_pattern(PRIORITY_KEYWORDS_P1)


def classify_priority(text: str, subject: str = "") -> str:
    """
    Return the rule-based priority ("P0", "P1" or "P2") for a ticket.

    `subject` and `text` are combined and lower-cased, then searched for the
    priority keywords. Any P0 keyword yields "P0"; otherwise any P1 keyword
    yields "P1"; otherwise the result is "P2".

    Keywords match only as whole words or phrases, so "down" does not fire on
    "downstream" or "download", and "cant" does not fire on "significant".
    A missing (None) subject or text is treated as empty.
    """
    haystack = f"{subject or ''} {text or ''}".lower()
    # Treat the typographic apostrophe like the ASCII one so "can’t" matches.
    haystack = haystack.replace("\u2019", "'")

    if _PRIORITY_P0_RE.search(haystack):
        return "P0"
    if _PRIORITY_P1_RE.search(haystack):
        return "P1"
    return "P2"

def classify_ticket(ticket: Dict, top_k: int = 2, label_score_threshold: float = 0.25) -> Dict:
    """
    Full classification of a single ticket.

    Reads the "id", "subject" and "body" keys of `ticket` and returns a dict
    with:
     - id: the ticket's "id" value (None when absent)
     - topic_tags: up to top_k zero-shot labels whose score is at or above
       label_score_threshold, in the order the classifier returned them; if
       none qualifies, the first returned label is used
     - topic_scores: every returned label mapped to its score as a float
     - sentiment: HF sentiment mapped by classify_sentiment_hf
     - priority: rule-based result of classify_priority

    A subject or body that is missing or None is treated as an empty string.
    """
    # .get(key, "") still yields None when the key exists with a null value,
    # which would make the join below raise TypeError.
    subject = ticket.get("subject") or ""
    body = ticket.get("body") or ""
    text = " ".join([subject, body])

    zero_shot = classify_topic_zero_shot(text)
    labels = zero_shot.get("labels", [])
    scores = zero_shot.get("scores", [])

    # First top_k labels that clear the threshold, in classifier order.
    passing = [lbl for lbl, score in zip(labels, scores) if score >= label_score_threshold]
    topic_tags = passing[:max(top_k, 0)]
    # fallback: if nothing passes threshold, take the top label
    if not topic_tags and labels:
        topic_tags = [labels[0]]

    return {
        "id": ticket.get("id"),
        "topic_tags": topic_tags,
        "topic_scores": {lbl: float(score) for lbl, score in zip(labels, scores)},
        "sentiment": classify_sentiment_hf(text),
        "priority": classify_priority(body, subject),
    }

# batch classify and save JSON
def classify_all_and_save(input_path: Union[str, Path] = "../sample_tickets.json", output_path: Union[str, Path] = "../classified_tickets_phase2.json"):
    """
    Classify every ticket in a JSON file and write the enriched tickets out.

    Both paths are joined onto the directory containing this module and then
    resolved. The input is parsed as JSON and iterated; each ticket is copied
    with an extra "classification" key holding the classify_ticket() result.
    The list is written as indented JSON, a summary line is printed, and the
    resolved output path is returned.
    """
    module_dir = Path(__file__).parent
    source_file = module_dir.joinpath(input_path).resolve()
    target_file = module_dir.joinpath(output_path).resolve()

    tickets = json.loads(source_file.read_text(encoding="utf-8"))
    classified = [
        {**ticket, "classification": classify_ticket(ticket)}
        for ticket in tickets
    ]

    target_file.write_text(json.dumps(classified, indent=2), encoding="utf-8")
    print(f"Saved {len(classified)} classified tickets to {target_file}")
    return target_file

# Script entry point: run the batch classification with its default paths.
if __name__ == "__main__":
    classify_all_and_save()