Spaces:

cngchis
/

Support-ticket-router

Running

File size: 2,861 Bytes

import time
import re
from llama_cpp import Llama
from api.config import LABELS
import logging

# Configure logging at import time and obtain this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Repository id passed as `repo_id` to `Llama.from_pretrained` below.
MODEL_PATH = "cngchis/phi4-mini-intent-GGUF"

# Load the GGUF model eagerly. `gguf_available` records whether the load
# succeeded so inference code can bail out instead of touching an undefined
# `llm`. Any load failure is downgraded to a warning rather than aborting
# the import.
# NOTE(review): n_ctx=128 looks small for prompt + message + completion;
# confirm that longer messages are not expected here.
gguf_available = False
try:
    llm = Llama.from_pretrained(
        repo_id=MODEL_PATH,
        filename="*q4_k_m*",
        n_ctx=128,
        n_threads=2,
        verbose=False,
    )
except Exception as e:
    logger.warning(f"GGUF load failed: {e}")
else:
    gguf_available = True
    logger.info(f"GGUF model loaded from {MODEL_PATH}")

def format_prompt(text: str) -> str:
    """Build the intent-classification prompt for a single message.

    The returned string opens with a newline, every following line carries a
    four-space indent (including the otherwise blank separator lines), and it
    ends with an indented, empty line after ``Label:``.

    Args:
        text: The message to classify; inserted on the line after
            ``Message:``.

    Returns:
        The complete prompt string.
    """
    indent = "    "
    body_lines = [
        "Classify intent into one label:",
        "api, billing, cancellation, complaint, technical, upgrade",
        "",
        "Message:",
        f"{text}",
        "",
        "Label:",
        "",
    ]
    # Each line is emitted as "\n" + indent + content, which reproduces the
    # leading newline and the trailing indented blank line of the template.
    return "".join(f"\n{indent}{line}" for line in body_lines)

def extract_label(raw: str) -> str:
    """Pick the intent label out of raw model output.

    Closed ``<thought>...</thought>`` and ``<think>...</think>`` spans are
    removed first, the remainder is trimmed and lower-cased, and the last
    whole-word occurrence of a known label wins.

    Args:
        raw: Text generated by the model.

    Returns:
        One of ``api``, ``billing``, ``cancellation``, ``complaint``,
        ``technical``, ``upgrade``; or ``"unknown"`` when none is present.
    """
    # Strip reasoning spans so labels mentioned inside them are ignored.
    # Only spans with a closing tag are removed.
    without_thought = re.sub(r'<thought>.*?</thought>', '', raw, flags=re.DOTALL)
    without_think = re.sub(r'<think>.*?</think>', '', without_thought, flags=re.DOTALL)
    normalized = without_think.strip().lower()

    # Walk every match and keep overwriting, so the final one is returned.
    label = "unknown"
    for hit in re.finditer(
        r"\b(api|billing|cancellation|complaint|technical|upgrade)\b", normalized
    ):
        label = hit.group(1)
    return label

def _phi4_fallback() -> dict:
    """Return a fresh "no prediction" result for the phi4 GGUF model."""
    # A new dict per call so callers can mutate the result safely.
    return {"intent": "unknown", "latency_ms": 0, "confidence": 0.0, "model": "phi4_gguf"}


def infer_phi4(text: str) -> dict:
    """Classify *text* with the GGUF model.

    Args:
        text: The message to classify.

    Returns:
        A dict with the keys:

        * ``"intent"``: the label parsed from the model output, or
          ``"unknown"`` when nothing could be parsed, the model is not
          loaded, or inference raised.
        * ``"latency_ms"``: elapsed time of generation plus label parsing,
          rounded to two decimals; ``0`` on the fallback paths.
        * ``"confidence"``: a fixed placeholder of ``0.93`` for a parsed
          label (it is not derived from model probabilities) and ``0.0``
          whenever the intent is ``"unknown"``.
        * ``"model"``: always ``"phi4_gguf"``.

    Exceptions raised during inference are logged and converted into the
    fallback result; they are not propagated.
    """
    if not gguf_available:
        return _phi4_fallback()

    try:
        # perf_counter is monotonic, so the measured duration is unaffected
        # by wall-clock adjustments during the call.
        start = time.perf_counter()
        output = llm(
            format_prompt(text),
            max_tokens=5,
            temperature=0.1,
            stop=["</s>", "[/INST]"],
        )
        raw = output["choices"][0]["text"]
        predicted = extract_label(raw)
        latency = (time.perf_counter() - start) * 1000
    except Exception as e:
        # Best-effort: keep serving, but record the traceback for debugging.
        logger.exception("GGUF ERROR: %s", e)
        return _phi4_fallback()

    return {
        "intent": predicted,
        "latency_ms": round(latency, 2),
        # An unparseable answer must not carry the placeholder confidence;
        # report 0.0 like the other "unknown" paths do.
        "confidence": 0.93 if predicted != "unknown" else 0.0,
        "model": "phi4_gguf",
    }

def infer(text: str) -> dict:
    """Run every configured model on *text* and merge their answers.

    Currently the only model consulted is ``infer_phi4``. Results whose
    intent is ``"unknown"`` are excluded from voting; the most frequent
    remaining intent wins (the earliest-seen intent wins ties) and the
    reported confidence is the mean confidence of the non-unknown results.

    Args:
        text: The message to classify.

    Returns:
        A dict with ``"intent"``, ``"confidence"`` (rounded to 3 decimals),
        ``"model_results"`` (the per-model result dicts), ``"latency_ms"``
        (total elapsed time, rounded to 2 decimals) and
        ``"individual_latencies"`` (model name -> that model's latency).
    """
    began = time.time()
    model_outputs = [infer_phi4(text)]
    elapsed_ms = (time.time() - began) * 1000

    usable = [out for out in model_outputs if out["intent"] != "unknown"]

    # Default outcome when no model produced a usable label.
    final_intent, confidence = "unknown", 0.0
    if usable:
        voted = [out["intent"] for out in usable]
        # dict.fromkeys keeps first-seen order, so ties resolve to the
        # earliest intent, same as a running tally dict would.
        final_intent = max(dict.fromkeys(voted), key=voted.count)
        confidence = sum(out["confidence"] for out in usable) / len(usable)

    per_model_latency = {}
    for out in model_outputs:
        per_model_latency[out["model"]] = out["latency_ms"]

    return {
        "intent" : final_intent,
        "confidence" : round(confidence, 3),
        "model_results" : model_outputs,
        "latency_ms" : round(elapsed_ms, 2),
        "individual_latencies": per_model_latency,
    }