Spaces:

Debanna
/

Customer-Support-Copilot

Sleeping

File size: 5,675 Bytes

37a70cc

# src/classifier.py
import json
import math
import re
from pathlib import Path
from typing import Dict, List, Union

from transformers import pipeline

# Lazy-loaded pipelines (module-level to reuse)
# Both slots start out as None. get_zero_shot_classifier() and
# get_sentiment_classifier() fill their slot on the first call and hand back
# the stored object on every later call.
_zero_shot_clf = None
_sentiment_clf = None

def get_zero_shot_classifier():
    """Return the shared zero-shot classification pipeline, building it on first use.

    The pipeline object is kept in the module-level ``_zero_shot_clf`` so that
    later calls return the same instance instead of constructing a new one.
    """
    global _zero_shot_clf
    if _zero_shot_clf is not None:
        return _zero_shot_clf
    # NLI checkpoints (BART, RoBERTa) are the usual backbone for zero-shot;
    # this one is BART-large fine-tuned on MNLI.
    _zero_shot_clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    return _zero_shot_clf

def get_sentiment_classifier():
    """Return the shared sentiment-analysis pipeline, building it on first use.

    The pipeline object is kept in the module-level ``_sentiment_clf`` so that
    later calls return the same instance instead of constructing a new one.
    """
    global _sentiment_clf
    if _sentiment_clf is not None:
        return _sentiment_clf
    # DistilBERT fine-tuned on SST-2 (binary sentiment).
    _sentiment_clf = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    return _sentiment_clf

# Topic schema: the fixed label set requested by the assignment.
# classify_topic_zero_shot() uses this list as its default candidate labels.
TOPIC_LABELS = [
    "How-to",
    "Product",
    "Connector",
    "Lineage",
    "API/SDK",
    "SSO",
    "Glossary",
    "Best practices",
    "Sensitive data",
]

# One longer description per topic label: optional synonyms/prompts that could
# be used to nudge the zero-shot classifier.
# NOTE(review): nothing in the visible part of this file reads this mapping.
LABEL_DESCRIPTIONS = {
    "How-to": "user asking how to perform a task or request a tutorial",
    "Product": "product feature, UI or general product question",
    "Connector": "questions about connectors, crawlers, integrations and failures",
    "Lineage": "questions about lineage, upstream/downstream or lineage exports",
    "API/SDK": "developer questions about APIs, SDKs, endpoints, code examples",
    "SSO": "authentication, SAML, SSO, Okta, login issues",
    "Glossary": "business glossary, terms, bulk import of glossary terms",
    "Best practices": "request for recommended approach, best practices or governance",
    "Sensitive data": "questions about PII, masking, DLP, secrets",
}

def classify_topic_zero_shot(text: str, labels: List[str] = TOPIC_LABELS, hypothesis_template: str = "This text is about {}.") -> Dict:
    """
    Run the zero-shot topic classifier on ``text`` and return its raw output.

    ``labels`` are handed over as the candidate labels and
    ``hypothesis_template`` as the sentence pattern the pipeline fills with
    each label. The pipeline result is returned untouched, e.g.
    {'sequence': ..., 'labels': [...], 'scores': [...]}; choosing the top
    labels or applying a score threshold is left to the caller.
    """
    zero_shot = get_zero_shot_classifier()
    return zero_shot(
        sequences=text,
        candidate_labels=labels,
        hypothesis_template=hypothesis_template,
    )

def classify_sentiment_hf(text: str) -> str:
    """
    Translate the Hugging Face sentiment prediction into the ticket schema.

    The model answers POSITIVE or NEGATIVE together with a score; that pair is
    mapped as follows:
      - NEGATIVE with score > 0.9  -> "Angry"
      - NEGATIVE otherwise         -> "Frustrated"
      - POSITIVE with score > 0.9  -> "Positive"
      - POSITIVE otherwise         -> "Curious"
      - empty output, or any other label -> "Neutral"
    """
    sentiment_model = get_sentiment_classifier()
    # Only the first 1000 characters are scored, to keep long tickets fast.
    predictions = sentiment_model(text[:1000])
    # predictions look like [{'label': 'NEGATIVE', 'score': 0.999}]
    if not predictions:
        return "Neutral"

    top = predictions[0]
    polarity = top["label"].upper()
    confidence = top["score"]

    # The score separates the strong reading of each polarity from the mild one.
    if polarity == "NEGATIVE":
        return "Angry" if confidence > 0.9 else "Frustrated"
    if polarity == "POSITIVE":
        return "Positive" if confidence > 0.9 else "Curious"
    return "Neutral"

# Rule-based priority (deterministic SLA logic).
# A P0 keyword always wins over a P1 keyword; no keyword at all means P2.
# NOTE(review): keywords are matched as whole words/phrases (see
# _contains_keyword), not as raw substrings. "downtime" is listed explicitly
# because it used to match only by accident through the substring "down".
PRIORITY_KEYWORDS_P0 = ["urgent", "asap", "blocked", "blocker", "critical", "production", "failed", "failure", "infuriating", "can't", "cant", "down", "urgent:", "downtime"]
PRIORITY_KEYWORDS_P1 = ["need", "important", "deadline", "next week", "approaching", "required", "soon", "high"]


def _contains_keyword(haystack, keywords):
    """Return True if any keyword occurs in ``haystack`` as a whole word or phrase.

    A keyword must start at a word boundary and may only be followed by a
    common inflectional ending, so "needs", "urgently" or "blockers" still
    match while "downstream", "highlight", "significant" and "reproduction"
    no longer do.
    """
    if not keywords:
        # An empty alternation would match the empty string everywhere.
        return False
    alternation = "|".join(re.escape(keyword) for keyword in keywords)
    pattern = r"(?<!\w)(?:" + alternation + r")(?:s|ed|er|est|ing|ly)?(?!\w)"
    return re.search(pattern, haystack) is not None


def classify_priority(text: str, subject: str = "") -> str:
    """
    Assign a priority from keywords found in the subject and body.

    Args:
        text: ticket body; None is treated as an empty string.
        subject: ticket subject; None is treated as an empty string.

    Returns:
        "P0" if any PRIORITY_KEYWORDS_P0 entry is present, otherwise "P1" if
        any PRIORITY_KEYWORDS_P1 entry is present, otherwise "P2". Matching is
        case-insensitive.
    """
    combined = ((subject or "") + " " + (text or "")).lower()
    # Typographic apostrophes (U+2019) would otherwise hide "can't".
    combined = combined.replace("\u2019", "'")
    if _contains_keyword(combined, PRIORITY_KEYWORDS_P0):
        return "P0"
    if _contains_keyword(combined, PRIORITY_KEYWORDS_P1):
        return "P1"
    return "P2"

def classify_ticket(ticket: Dict, top_k: int = 2, label_score_threshold: float = 0.25) -> Dict:
    """
    Full classification of a single ticket.

    Args:
        ticket: mapping read with ``.get`` for the keys "id", "subject" and
            "body"; a missing or null subject/body counts as an empty string.
        top_k: maximum number of topic tags to keep.
        label_score_threshold: minimum zero-shot score for a label to be kept
            as a tag.

    Returns:
        A dict with:
         - "id": the ticket's id (None when absent)
         - "topic_tags": up to top_k labels whose score is at or above the
           threshold, in the order the classifier returned them; when none
           qualifies (or top_k < 1), the first returned label alone
         - "topic_scores": every returned label mapped to its score as a float
         - "sentiment": classify_sentiment_hf applied to subject + body
         - "priority": classify_priority applied to body and subject
    """
    # dict.get's default only covers a *missing* key. A key that is present
    # with a null value (e.g. JSON null) would pass None to str.join and raise
    # TypeError, so null is coerced to "" explicitly.
    subject = ticket.get("subject") or ""
    body = ticket.get("body") or ""
    text = " ".join([subject, body])

    zero_shot = classify_topic_zero_shot(text)
    labels = zero_shot.get("labels", [])
    scores = zero_shot.get("scores", [])

    # Keep the first top_k labels that clear the threshold.
    passing = [lbl for lbl, score in zip(labels, scores) if score >= label_score_threshold]
    topic_tags = passing[:max(top_k, 0)]
    # fallback: if nothing passes threshold, take the top label
    if not topic_tags and labels:
        topic_tags = [labels[0]]

    sentiment = classify_sentiment_hf(text)
    priority = classify_priority(body, subject)

    return {
        "id": ticket.get("id"),
        "topic_tags": topic_tags,
        "topic_scores": {lbl: float(s) for lbl, s in zip(labels, scores)},
        "sentiment": sentiment,
        "priority": priority,
    }

# Batch entry point: classify a whole JSON file of tickets and save the result.
def classify_all_and_save(input_path: Union[str, Path] = "../sample_tickets.json", output_path: Union[str, Path] = "../classified_tickets_phase2.json"):
    """
    Classify every ticket in a JSON file and write the enriched tickets as JSON.

    Both paths are joined onto the directory containing this module and then
    resolved. Each output entry is the original ticket plus a
    "classification" key holding the classify_ticket result. A one-line
    summary is printed and the resolved output path is returned.
    """
    module_dir = Path(__file__).parent
    source_file = module_dir.joinpath(input_path).resolve()
    target_file = module_dir.joinpath(output_path).resolve()

    tickets = json.loads(source_file.read_text(encoding="utf-8"))
    classified = [
        {**ticket, "classification": classify_ticket(ticket)}
        for ticket in tickets
    ]

    target_file.write_text(json.dumps(classified, indent=2), encoding="utf-8")
    print(f"Saved {len(classified)} classified tickets to {target_file}")
    return target_file

# Script entry point: running this file directly classifies the default input
# file and writes the default output file (see classify_all_and_save defaults).
if __name__ == "__main__":
    classify_all_and_save()