Spaces:
Sleeping
Sleeping
| # src/classifier.py | |
import json
import math
import re
from pathlib import Path
from typing import Dict, List, Union

from transformers import pipeline
# Lazy-loaded HF pipelines, cached at module level so repeated calls reuse
# the same (expensive-to-construct) model instances instead of reloading
# the weights on every classification.
_zero_shot_clf = None
_sentiment_clf = None
def get_zero_shot_classifier():
    """Return the shared zero-shot classification pipeline, loading it on first use."""
    global _zero_shot_clf
    if _zero_shot_clf is not None:
        return _zero_shot_clf
    # NLI models (BART/RoBERTa fine-tuned on MNLI) are the common backbone
    # for zero-shot classification.
    _zero_shot_clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    return _zero_shot_clf
def get_sentiment_classifier():
    """Return the shared sentiment-analysis pipeline, loading it on first use."""
    global _sentiment_clf
    if _sentiment_clf is not None:
        return _sentiment_clf
    # DistilBERT fine-tuned on SST-2 — a small, fast binary sentiment model.
    _sentiment_clf = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    return _sentiment_clf
# Schema - fixed topic labels requested by the assignment.
# These are the candidate labels handed to the zero-shot classifier.
TOPIC_LABELS = [
    "How-to",
    "Product",
    "Connector",
    "Lineage",
    "API/SDK",
    "SSO",
    "Glossary",
    "Best practices",
    "Sensitive data"
]
# Optionally add synonyms/prompts to nudge zero-shot.
# NOTE(review): not referenced anywhere in this module — presumably intended
# to be used as richer hypothesis prompts for the zero-shot classifier;
# confirm and wire up, or remove.
LABEL_DESCRIPTIONS = {
    "How-to": "user asking how to perform a task or request a tutorial",
    "Product": "product feature, UI or general product question",
    "Connector": "questions about connectors, crawlers, integrations and failures",
    "Lineage": "questions about lineage, upstream/downstream or lineage exports",
    "API/SDK": "developer questions about APIs, SDKs, endpoints, code examples",
    "SSO": "authentication, SAML, SSO, Okta, login issues",
    "Glossary": "business glossary, terms, bulk import of glossary terms",
    "Best practices": "request for recommended approach, best practices or governance",
    "Sensitive data": "questions about PII, masking, DLP, secrets"
}
def classify_topic_zero_shot(
    text: str,
    labels: Union[List[str], None] = None,
    hypothesis_template: str = "This text is about {}.",
) -> Dict:
    """
    Run the zero-shot classifier over ``text`` with the given candidate labels.

    Parameters
    ----------
    text : str
        The text to classify (subject + body of a ticket).
    labels : list[str], optional
        Candidate labels; defaults to ``TOPIC_LABELS``. (The original
        signature used the module-level list itself as the default value —
        a shared mutable default; a ``None`` sentinel avoids that.)
    hypothesis_template : str
        Template the NLI model scores each label against; tuning it can
        improve results.

    Returns
    -------
    dict
        Raw pipeline output, e.g. ``{'sequence': ..., 'labels': [...],
        'scores': [...]}``. Thresholding/top-k selection is the caller's
        job (see ``classify_ticket``).
    """
    if labels is None:
        labels = TOPIC_LABELS
    clf = get_zero_shot_classifier()
    return clf(sequences=text, candidate_labels=labels, hypothesis_template=hypothesis_template)
def classify_sentiment_hf(text: str) -> str:
    """
    Map the HF sentiment model's POSITIVE/NEGATIVE output onto the ticket
    schema: Frustrated / Curious / Angry / Neutral / Positive.
    """
    clf = get_sentiment_classifier()
    # Truncate long bodies for speed; the opening text carries the tone.
    predictions = clf(text[:1000])
    # predictions look like [{'label': 'NEGATIVE', 'score': 0.999}]
    if not predictions:
        return "Neutral"
    top = predictions[0]
    label = top["label"].upper()
    confidence = top["score"]
    if label == "NEGATIVE":
        # Strong negative reads as anger; weaker negative as frustration.
        return "Angry" if confidence > 0.9 else "Frustrated"
    if label == "POSITIVE":
        # Strong positive is genuinely positive; weaker reads as curiosity.
        return "Positive" if confidence > 0.9 else "Curious"
    return "Neutral"
# Keep same rule-based priority function (deterministic SLA logic).
# Keywords that escalate a ticket; checked against subject + body.
PRIORITY_KEYWORDS_P0 = ["urgent", "asap", "blocked", "blocker", "critical", "production", "failed", "failure", "infuriating", "can't", "cant", "down", "urgent:"]
PRIORITY_KEYWORDS_P1 = ["need", "important", "deadline", "next week", "approaching", "required", "soon", "high"]


def classify_priority(text: str, subject: str = "") -> str:
    """
    Deterministic, rule-based priority classification.

    Parameters
    ----------
    text : str
        Ticket body.
    subject : str
        Ticket subject; scanned together with the body.

    Returns
    -------
    str
        "P0" if any urgent/blocking keyword appears, "P1" if any
        importance/deadline keyword appears, otherwise "P2".

    Keywords are matched case-insensitively as whole tokens: the original
    substring check made "download" trigger on "down" and "needle" trigger
    on "need"; lookaround boundaries fix those false positives while still
    matching punctuation-bearing keywords like "can't" and "urgent:".
    """
    haystack = (subject + " " + text).lower()

    def _any_keyword(keywords: List[str]) -> bool:
        # (?<!\w) / (?!\w) behave like word boundaries but also work when the
        # keyword itself starts or ends with a non-word character.
        return any(
            re.search(r"(?<!\w)" + re.escape(k) + r"(?!\w)", haystack)
            for k in keywords
        )

    if _any_keyword(PRIORITY_KEYWORDS_P0):
        return "P0"
    if _any_keyword(PRIORITY_KEYWORDS_P1):
        return "P1"
    return "P2"
def classify_ticket(ticket: Dict, top_k: int = 2, label_score_threshold: float = 0.25) -> Dict:
    """
    Classify one ticket end to end.

    Returns a dict with:
      - id:           the ticket's id (if present)
      - topic_tags:   up to ``top_k`` zero-shot labels scoring at or above
                      ``label_score_threshold`` (falls back to the single
                      best label when none qualify)
      - topic_scores: score for every candidate label
      - sentiment:    mapped HF sentiment label
      - priority:     rule-based P0/P1/P2
    """
    combined = " ".join([ticket.get("subject", ""), ticket.get("body", "")])

    zero_shot = classify_topic_zero_shot(combined)
    ranked_labels = zero_shot.get("labels", [])
    ranked_scores = zero_shot.get("scores", [])

    # Keep qualifying labels in pipeline order, truncated to top_k.
    topic_tags = [
        lbl
        for lbl, score in zip(ranked_labels, ranked_scores)
        if score >= label_score_threshold
    ][:top_k]
    if not topic_tags and ranked_labels:
        # Nothing cleared the threshold — keep the single most likely label.
        topic_tags = [ranked_labels[0]]

    return {
        "id": ticket.get("id"),
        "topic_tags": topic_tags,
        "topic_scores": {lbl: float(s) for lbl, s in zip(ranked_labels, ranked_scores)},
        "sentiment": classify_sentiment_hf(combined),
        "priority": classify_priority(ticket.get("body", ""), ticket.get("subject", "")),
    }
# Batch classify and save JSON.
def classify_all_and_save(input_path: Union[str, Path] = "../sample_tickets.json", output_path: Union[str, Path] = "../classified_tickets_phase2.json"):
    """
    Classify every ticket in ``input_path`` and write the enriched records
    (original fields plus a "classification" key) to ``output_path``.

    Both paths are resolved relative to this file's directory. Returns the
    resolved output path.
    """
    base_dir = Path(__file__).parent
    src = base_dir.joinpath(input_path).resolve()
    dst = base_dir.joinpath(output_path).resolve()

    tickets = json.loads(src.read_text(encoding="utf-8"))
    enriched = [{**ticket, "classification": classify_ticket(ticket)} for ticket in tickets]

    dst.write_text(json.dumps(enriched, indent=2), encoding="utf-8")
    print(f"Saved {len(enriched)} classified tickets to {dst}")
    return dst
if __name__ == "__main__":
    # Allow running this module directly as a batch-classification script.
    classify_all_and_save()