# File viewer residue: Spaces status "Sleeping"; file size: 5,675 bytes; revision 37a70cc.
# src/classifier.py
import json
import math
import re
from pathlib import Path
from typing import Dict, List, Union

from transformers import pipeline
# Lazy-loaded pipelines (module-level to reuse).
# Both slots start as None; get_zero_shot_classifier() and
# get_sentiment_classifier() fill them on first call and return the stored
# object afterwards, so no pipeline is constructed at import time.
_zero_shot_clf = None
_sentiment_clf = None
def get_zero_shot_classifier():
    """Return the shared zero-shot classification pipeline.

    The pipeline is built on the first call and stored in the module-level
    ``_zero_shot_clf`` slot; every later call returns that same object.
    """
    global _zero_shot_clf
    if _zero_shot_clf is not None:
        return _zero_shot_clf
    # BART or RoBERTa NLI models are common choices for zero-shot work.
    _zero_shot_clf = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
    )
    return _zero_shot_clf
def get_sentiment_classifier():
    """Return the shared sentiment-analysis pipeline.

    The pipeline is built on the first call and stored in the module-level
    ``_sentiment_clf`` slot; every later call returns that same object.
    """
    global _sentiment_clf
    if _sentiment_clf is not None:
        return _sentiment_clf
    # DistilBERT fine-tuned on SST-2.
    _sentiment_clf = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )
    return _sentiment_clf
# Topic schema: the fixed set of labels requested by the assignment.
# Used as the default candidate labels for zero-shot topic classification.
TOPIC_LABELS = [
    "How-to",
    "Product",
    "Connector",
    "Lineage",
    "API/SDK",
    "SSO",
    "Glossary",
    "Best practices",
    "Sensitive data",
]
# Optional per-label descriptions (synonyms/prompts) that could be used to
# nudge the zero-shot model. Keys mirror the topic labels one-to-one.
# NOTE(review): nothing in the visible part of this file reads this mapping.
LABEL_DESCRIPTIONS = {
    "How-to": "user asking how to perform a task or request a tutorial",
    "Product": "product feature, UI or general product question",
    "Connector": "questions about connectors, crawlers, integrations and failures",
    "Lineage": "questions about lineage, upstream/downstream or lineage exports",
    "API/SDK": "developer questions about APIs, SDKs, endpoints, code examples",
    "SSO": "authentication, SAML, SSO, Okta, login issues",
    "Glossary": "business glossary, terms, bulk import of glossary terms",
    "Best practices": "request for recommended approach, best practices or governance",
    "Sensitive data": "questions about PII, masking, DLP, secrets",
}
def classify_topic_zero_shot(
    text: str,
    labels: List[str] = TOPIC_LABELS,
    hypothesis_template: str = "This text is about {}.",
) -> Dict:
    """Score ``text`` against candidate topic labels with the zero-shot pipeline.

    Args:
        text: The text to classify.
        labels: Candidate labels handed to the pipeline; defaults to
            ``TOPIC_LABELS``.
        hypothesis_template: Template passed through to the pipeline; the
            pipeline accepts it as a way to improve results.

    Returns:
        The pipeline result, unmodified, e.g.
        ``{'sequence': ..., 'labels': [...], 'scores': [...]}``. No threshold
        or top-N selection is applied here; callers do that themselves.
    """
    zero_shot = get_zero_shot_classifier()
    return zero_shot(
        sequences=text,
        candidate_labels=labels,
        hypothesis_template=hypothesis_template,
    )
def classify_sentiment_hf(text: str) -> str:
    """Map the sentiment pipeline's prediction for ``text`` onto the ticket schema.

    The pipeline output looks like ``[{'label': 'NEGATIVE', 'score': 0.999}]``.
    It is mapped as follows:

    * NEGATIVE with score > 0.9 -> ``"Angry"``, otherwise ``"Frustrated"``
    * POSITIVE with score > 0.9 -> ``"Positive"``, otherwise ``"Curious"``
    * any other label, empty output, or empty input -> ``"Neutral"``

    Args:
        text: Ticket text. ``None``, empty, or whitespace-only text is treated
            as having no sentiment signal.

    Returns:
        One of ``"Angry"``, ``"Frustrated"``, ``"Positive"``, ``"Curious"``
        or ``"Neutral"``.
    """
    # Nothing to analyse: do not ask the model to score an empty string.
    if not text or not text.strip():
        return "Neutral"

    clf = get_sentiment_classifier()
    # The character slice keeps inference fast, but 1000 characters can still
    # tokenise to more than the model accepts, so also let the tokenizer
    # truncate to the model's maximum length.
    out = clf(text[:1000], truncation=True)
    if not out:
        return "Neutral"

    top = out[0]
    label = top["label"].upper()
    confident = top["score"] > 0.9

    if label == "NEGATIVE":
        # Distinguish angry vs. frustrated by prediction strength.
        return "Angry" if confident else "Frustrated"
    if label == "POSITIVE":
        return "Positive" if confident else "Curious"
    return "Neutral"
# Rule-based priority (deterministic SLA logic). Keywords are matched as whole
# words/phrases, case-insensitively, against the subject plus body.
PRIORITY_KEYWORDS_P0 = ["urgent", "asap", "blocked", "blocker", "critical", "production", "failed", "failure", "infuriating", "can't", "cant", "down", "urgent:"]
PRIORITY_KEYWORDS_P1 = ["need", "important", "deadline", "next week", "approaching", "required", "soon", "high"]


def _contains_keyword(haystack: str, keywords: List[str]) -> bool:
    """Return True if any keyword occurs in ``haystack`` as a whole word/phrase."""
    # Word boundaries stop "down" from firing on "downstream"/"download" and
    # "production" on "reproduction". The optional suffix keeps simple
    # inflections ("needs", "failures") matching as they did before.
    return any(
        re.search(r"(?<!\w)" + re.escape(kw) + r"(?:s|es|ed|ing)?(?!\w)", haystack)
        for kw in keywords
    )


def classify_priority(text: str, subject: str = "") -> str:
    """Assign a priority bucket from keywords in the ticket subject and body.

    Args:
        text: Ticket body. ``None`` is treated as empty.
        subject: Ticket subject. ``None`` is treated as empty.

    Returns:
        ``"P0"`` if any ``PRIORITY_KEYWORDS_P0`` entry is present, else
        ``"P1"`` if any ``PRIORITY_KEYWORDS_P1`` entry is present, else
        ``"P2"``.
    """
    combined = f"{subject or ''} {text or ''}".lower()
    # Typographic apostrophes would otherwise hide "can't".
    combined = combined.replace("\u2019", "'")

    if _contains_keyword(combined, PRIORITY_KEYWORDS_P0):
        return "P0"
    if _contains_keyword(combined, PRIORITY_KEYWORDS_P1):
        return "P1"
    return "P2"
def classify_ticket(
    ticket: Dict,
    top_k: int = 2,
    label_score_threshold: float = 0.25,
) -> Dict:
    """Classify a single ticket by topic, sentiment and priority.

    Args:
        ticket: Ticket mapping; the ``"id"``, ``"subject"`` and ``"body"``
            keys are read. Missing or ``None`` subject/body count as empty.
        top_k: Maximum number of topic tags to keep.
        label_score_threshold: Minimum zero-shot score for a label to be kept
            as a tag.

    Returns:
        A dict with:

        * ``"id"``: the ticket's id (``None`` if absent)
        * ``"topic_tags"``: up to ``top_k`` labels scoring at or above the
          threshold, or the first returned label if none qualify
        * ``"topic_scores"``: every returned label mapped to its float score
        * ``"sentiment"``: mapped sentiment label
        * ``"priority"``: rule-based priority bucket
    """
    # JSON null comes back from .get() as None even with a default, and
    # " ".join would raise on it, so normalise both fields to strings first.
    subject = ticket.get("subject") or ""
    body = ticket.get("body") or ""
    text = " ".join([subject, body])

    zero_shot = classify_topic_zero_shot(text)
    labels = zero_shot.get("labels", [])
    scores = zero_shot.get("scores", [])

    # Collect the first top_k labels that clear the threshold.
    topic_tags = []
    for lbl, score in zip(labels, scores):
        if score >= label_score_threshold:
            topic_tags.append(lbl)
        if len(topic_tags) >= top_k:
            break
    # Fallback: if nothing passes the threshold, take the first label.
    if not topic_tags and labels:
        topic_tags = [labels[0]]

    return {
        "id": ticket.get("id"),
        "topic_tags": topic_tags,
        "topic_scores": {lbl: float(s) for lbl, s in zip(labels, scores)},
        "sentiment": classify_sentiment_hf(text),
        "priority": classify_priority(body, subject),
    }
# Batch-classify every ticket in a JSON file and save the results as JSON.
def classify_all_and_save(
    input_path: Union[str, Path] = "../sample_tickets.json",
    output_path: Union[str, Path] = "../classified_tickets_phase2.json",
):
    """Classify all tickets from ``input_path`` and write them to ``output_path``.

    Both paths are joined onto this module's directory and then resolved.
    Each output record is the original ticket plus a ``"classification"`` key
    holding the result of ``classify_ticket``.

    Args:
        input_path: JSON file containing the tickets to classify.
        output_path: Destination JSON file for the classified tickets.

    Returns:
        The resolved output path.
    """
    module_dir = Path(__file__).parent
    source = (module_dir / input_path).resolve()
    target = (module_dir / output_path).resolve()

    tickets = json.loads(source.read_text(encoding="utf-8"))
    results = [
        {**ticket, "classification": classify_ticket(ticket)}
        for ticket in tickets
    ]

    target.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"Saved {len(results)} classified tickets to {target}")
    return target
# Script entry point: run the batch classification with its default paths.
if __name__ == "__main__":
    classify_all_and_save()
|